diff --git a/pom.xml b/pom.xml index 67e2d1b78..d573dcc6d 100644 --- a/pom.xml +++ b/pom.xml @@ -32,6 +32,7 @@ false -build-timestamp "${maven.build.timestamp}" + ${gatk.basedir}/public/src/main/scripts/shell - gatk-tools-public + gatk-utils ${project.version} 2g @@ -624,6 +624,21 @@ org.codehaus.mojo exec-maven-plugin + + + check-utils-engine-tools + + exec + + process-sources + false + + ${gatk.shell.directory}/check_utils_engine_tools.sh + + false - org.broadinstitute.gatk.utils.help.GATKDoclet + org.broadinstitute.gatk.tools.walkers.help.WalkerDoclet ${project.groupId} gatk-package-distribution diff --git a/protected/gatk-package-distribution/pom.xml b/protected/gatk-package-distribution/pom.xml index ad48250bb..2fd3c3b8f 100644 --- a/protected/gatk-package-distribution/pom.xml +++ b/protected/gatk-package-distribution/pom.xml @@ -43,6 +43,11 @@ gatk-tools-protected ${project.version} + + + org.slf4j + slf4j-log4j12 + samtools @@ -73,7 +78,7 @@ ${project.groupId} - gatk-engine + gatk-utils ${project.version} example-resources tar.bz2 diff --git a/protected/gatk-queue-extensions-distribution/pom.xml b/protected/gatk-queue-extensions-distribution/pom.xml index 7c4035b04..44018ec4e 100644 --- a/protected/gatk-queue-extensions-distribution/pom.xml +++ b/protected/gatk-queue-extensions-distribution/pom.xml @@ -41,6 +41,10 @@ log4j log4j + + picard + picard + ${project.groupId} - gatk-tools-public + gatk-utils ${project.version} test-jar test diff --git a/protected/gatk-queue-package-distribution/pom.xml b/protected/gatk-queue-package-distribution/pom.xml index f3de9c0d9..5a077ef29 100644 --- a/protected/gatk-queue-package-distribution/pom.xml +++ b/protected/gatk-queue-package-distribution/pom.xml @@ -80,7 +80,7 @@ ${project.groupId} - gatk-engine + gatk-utils ${project.version} example-resources tar.bz2 diff --git a/protected/gatk-tools-protected/pom.xml b/protected/gatk-tools-protected/pom.xml index 601939a92..24ceffe87 100644 --- 
a/protected/gatk-tools-protected/pom.xml +++ b/protected/gatk-tools-protected/pom.xml @@ -48,7 +48,15 @@ ${project.groupId} - gatk-tools-public + gatk-utils + ${project.version} + test-jar + test + + + + ${project.groupId} + gatk-engine ${project.version} test-jar test diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java deleted file mode 100644 index bf52849d6..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardCallerArgumentCollection.java +++ /dev/null @@ -1,232 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.arguments; - -import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorImplementation; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypingOutputMode; -import org.broadinstitute.gatk.tools.walkers.genotyper.OutputMode; -import org.broadinstitute.gatk.utils.collections.DefaultHashMap; -import htsjdk.variant.variantcontext.VariantContext; - -import java.io.File; -import java.lang.reflect.Field; -import java.lang.reflect.Method; -import java.lang.reflect.Modifier; -import java.util.Collections; -import java.util.Map; - -/** - * Created with IntelliJ IDEA. - * User: rpoplin - * Date: 8/20/12 - * A collection of arguments that are common to the various callers. - * This is pulled out so that every caller isn't exposed to the arguments from every other caller. 
- */ - -public class StandardCallerArgumentCollection implements Cloneable { - - @ArgumentCollection - public GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection(); - - @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) - public GenotypingOutputMode genotypingOutputMode = GenotypingOutputMode.DISCOVERY; - - /** - * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding - */ - @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false) - public RodBinding alleles; - - /** - * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. - * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we - * will try to remove (N * contamination fraction) bases for each alternate allele. - */ - @Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false) - public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION; - public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0; - - /** - * This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples. - * Samples that do not appear in this file will be processed with CONTAMINATION_FRACTION. 
- **/ - @Advanced - @Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. Format should be \"\" (Contamination is double) per line; No header.", required = false) - public File CONTAMINATION_FRACTION_FILE = null; - - /** - * Indicates whether there is some sample contamination present. - */ - private boolean sampleContaminationWasLoaded = false; - - /** - * - * @return an _Immutable_ copy of the Sample-Contamination Map, defaulting to CONTAMINATION_FRACTION so that if the sample isn't in the map map(sample)==CONTAMINATION_FRACTION - */ - public Map getSampleContamination(){ - //make sure that the default value is set up right - sampleContamination.setDefaultValue(CONTAMINATION_FRACTION); - if (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) - sampleContaminationWasLoaded = true; - return Collections.unmodifiableMap(sampleContamination); - } - - public void setSampleContamination(DefaultHashMap sampleContamination) { - this.sampleContamination.clear(); - this.sampleContaminationWasLoaded = !Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0; - if (!sampleContaminationWasLoaded) - for (final Double d : sampleContamination.values()) - if (!Double.isNaN(d) && d > 0.0) { - sampleContaminationWasLoaded = true; - break; - } - this.sampleContamination.putAll(sampleContamination); - this.sampleContamination.setDefaultValue(CONTAMINATION_FRACTION); - } - - /** - * Returns true if there is some sample contamination present, false otherwise. 
- * @return {@code true} iff there is some sample contamination - */ - public boolean isSampleContaminationPresent() { - return (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) || sampleContaminationWasLoaded; - } - - //Needs to be here because it uses CONTAMINATION_FRACTION - private DefaultHashMap sampleContamination = new DefaultHashMap(CONTAMINATION_FRACTION); - - /** - * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. - */ - @Hidden - @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - public AFCalculatorImplementation requestedAlleleFrequencyCalculationModel; - - @Hidden - @Argument(shortName = "logExactCalls", doc="x", required=false) - public File exactCallsLog = null; - - @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) - public OutputMode outputMode = OutputMode.EMIT_VARIANTS_ONLY; - - /** - * Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites. - * This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any). - * WARNINGS: - * - This feature will inflate VCF file size considerably. - * - All SNP ALT alleles will be emitted with corresponding 10 PL values. - * - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used - */ - @Advanced - @Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false) - public boolean annotateAllSitesWithPLs = false; - - /** - * Creates a Standard caller argument collection with default values. 
- */ - public StandardCallerArgumentCollection() { } - - /** - * "Casts" a caller argument collection into another type. - * - *

Common fields values are copied across

- * @param clazz the class of the result. - * @param result argument collection class. - * @return never {@code null}. - */ - public T cloneTo(final Class clazz) { - // short cut: just use regular clone if it happens to be the same class. - if (clazz == getClass()) - return (T) clone(); - try { - final T result = clazz.newInstance(); - for (final Field field : getClass().getFields()) { - // just copy common fields. - if (!field.getDeclaringClass().isAssignableFrom(clazz)) - continue; - final int fieldModifiers = field.getModifiers(); - if ((fieldModifiers & UNCOPYABLE_MODIFIER_MASK) != 0) continue; - //Use the clone() method if appropriate - if (Cloneable.class.isAssignableFrom(field.getType())) { - Method clone = field.getType().getMethod("clone"); - field.set(result, clone.invoke(field.get(this))); - } else - field.set(result,field.get(this)); - } - return result; - } catch (final Exception ex) { - throw new IllegalStateException(ex); - } - } - - /** - * Creates a copy of this configuration. - * @return never {@code null}. - */ - @Override - public StandardCallerArgumentCollection clone() { - try { - StandardCallerArgumentCollection cloned = (StandardCallerArgumentCollection) super.clone(); - cloned.genotypeArgs = genotypeArgs.clone(); - return cloned; - } catch (CloneNotSupportedException e) { - throw new IllegalStateException("unreachable code"); - } - } - - /** - * Holds a modifiers mask that identifies those fields that cannot be copied between - * StandardCallerArgumentCollections. 
- */ - private final int UNCOPYABLE_MODIFIER_MASK = Modifier.PRIVATE | Modifier.STATIC | Modifier.FINAL; -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java new file mode 100644 index 000000000..9ad2282ea --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRGatherer.java @@ -0,0 +1,138 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.apache.commons.collections.CollectionUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.commandline.Gatherer; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.*; + +/** + * User: carneiro + * Date: 3/29/11 + */ + + +public class BQSRGatherer extends Gatherer { + + private static final Logger logger = Logger.getLogger(BQSRGatherer.class); + private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file"; + private static final String MISSING_OUTPUT_FILE = "missing output file name"; + private static final String MISSING_READ_GROUPS = "Missing read group(s)"; + + @Override + public void gather(final List inputs, final File output) { + final PrintStream outputFile; + try { + outputFile = new PrintStream(output); + } catch(FileNotFoundException e) { + throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE); + } + final GATKReport report = gatherReport(inputs); + 
report.print(outputFile); + } + + /** + * Gathers the input recalibration reports into a single report. + * + * @param inputs Input recalibration GATK reports + * @return gathered recalibration GATK report + */ + public static GATKReport gatherReport(final List inputs) { + final SortedSet allReadGroups = new TreeSet(); + final LinkedHashMap> inputReadGroups = new LinkedHashMap>(); + + // Get the read groups from each input report + for (final File input : inputs) { + final Set readGroups = RecalibrationReport.getReadGroups(input); + inputReadGroups.put(input, readGroups); + allReadGroups.addAll(readGroups); + } + + // Log the read groups that are missing from specific inputs + for (Map.Entry> entry: inputReadGroups.entrySet()) { + final File input = entry.getKey(); + final Set readGroups = entry.getValue(); + if (allReadGroups.size() != readGroups.size()) { + // Since this is not completely unexpected, more than debug, but less than a proper warning. + logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath()); + for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) { + logger.info(" " + readGroup); + } + } + } + + RecalibrationReport generalReport = null; + for (File input : inputs) { + final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups); + if( inputReport.isEmpty() ) { continue; } + + if (generalReport == null) + generalReport = inputReport; + else + generalReport.combine(inputReport); + } + if (generalReport == null) + throw new ReviewedGATKException(EMPTY_INPUT_LIST); + + generalReport.calculateQuantizedQualities(); + + return generalReport.createGATKReport(); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRReadTransformer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRReadTransformer.java new file mode 100644 index 000000000..b524ad08a --- /dev/null +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRReadTransformer.java @@ -0,0 +1,104 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. 
Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.WalkerManager; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +/** + * A ReadTransformer that applies BQSR on the fly to reads + * + * User: rpoplin + * Date: 2/13/12 + */ +public class BQSRReadTransformer extends ReadTransformer { + private boolean enabled; + private BaseRecalibration bqsr = null; + + @Override + public OrderingConstraint getOrderingConstraint() { return OrderingConstraint.MUST_BE_FIRST; } + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + this.enabled = engine.hasBQSRArgumentSet(); + if ( enabled ) { + // TODO -- See important note below about applying BQSR to a reduced BAM file: + // If it is important to make sure that BQSR is not applied (as opposed to having the covariates computed) against a reduced bam file, + // we need to figure out how to make this work. 
The problem is that the ReadTransformers are initialized before the ReadDataSource + // inside the GenomeAnalysisEngine, so we generate a NPE when trying to retrieve the SAMFileHeaders. Ultimately, I don't think this is + // a necessary check anyways since we disallow running BaseRecalibrator on reduced bams (so we can't generate the recal tables to use here). + // Although we could add this check to the apply() method below, it's kind of ugly and inefficient. + // The call here would be: RecalUtils.checkForInvalidRecalBams(engine.getSAMFileHeaders(), engine.getArguments().ALLOW_BQSR_ON_REDUCED_BAMS); + final BQSRArgumentSet args = engine.getBQSRArgumentSet(); + this.bqsr = new BaseRecalibration(args.getRecalFile(), args.getQuantizationLevels(), args.shouldDisableIndelQuals(), args.getPreserveQscoresLessThan(), args.shouldEmitOriginalQuals(), args.getGlobalQScorePrior()); + } + final BQSRMode mode = WalkerManager.getWalkerAnnotation(walker, BQSRMode.class); + return mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return enabled; + } + + /** + * initialize a new BQSRReadTransformer that applies BQSR on the fly to incoming reads. + */ + @Override + public GATKSAMRecord apply(GATKSAMRecord read) { + bqsr.recalibrateRead(read); + return read; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BaseRecalibration.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BaseRecalibration.java new file mode 100644 index 000000000..9095f695e --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/BaseRecalibration.java @@ -0,0 +1,208 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Ensures; +import htsjdk.samtools.SAMTag; +import htsjdk.samtools.SAMUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; + +/** + * Utility methods to facilitate on-the-fly base quality score recalibration. + * + * User: carneiro and rpoplin + * Date: 2/4/12 + */ + +public class BaseRecalibration { + private static Logger logger = Logger.getLogger(BaseRecalibration.class); + private final static boolean TEST_CACHING = false; + + private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) + private final RecalibrationTables recalibrationTables; + private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation + + private final boolean disableIndelQuals; + private final int preserveQLessThan; + private final double globalQScorePrior; + private final boolean emitOriginalQuals; + + /** + * Constructor using a GATK Report file + * + * @param RECAL_FILE a GATK Report file containing the recalibration information + * @param quantizationLevels number of bins to quantize the quality scores + * @param disableIndelQuals if true, do not emit base indel qualities + * @param preserveQLessThan preserve quality scores less than this value + */ + public BaseRecalibration(final File RECAL_FILE, final int quantizationLevels, final boolean disableIndelQuals, final int preserveQLessThan, final boolean emitOriginalQuals, final double globalQScorePrior) { 
+ RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE); + + recalibrationTables = recalibrationReport.getRecalibrationTables(); + requestedCovariates = recalibrationReport.getRequestedCovariates(); + quantizationInfo = recalibrationReport.getQuantizationInfo(); + if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores + quantizationInfo.noQuantization(); + else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report. + quantizationInfo.quantizeQualityScores(quantizationLevels); + + this.disableIndelQuals = disableIndelQuals; + this.preserveQLessThan = preserveQLessThan; + this.globalQScorePrior = globalQScorePrior; + this.emitOriginalQuals = emitOriginalQuals; + } + + /** + * Recalibrates the base qualities of a read + * + * It updates the base qualities of the read with the new recalibrated qualities (for all event types) + * + * Implements a serial recalibration of the reads using the combinational table. + * First, we perform a positional recalibration, and then a subsequent dinuc correction. + * + * Given the full recalibration table, we perform the following preprocessing steps: + * + * - calculate the global quality score shift across all data [DeltaQ] + * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift + * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual + * - The final shift equation is: + * + * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... 
) + * + * @param read the read to recalibrate + */ + public void recalibrateRead(final GATKSAMRecord read) { + if (emitOriginalQuals && read.getAttribute(SAMTag.OQ.name()) == null) { // Save the old qualities if the tag isn't already taken in the read + try { + read.setAttribute(SAMTag.OQ.name(), SAMUtils.phredToFastq(read.getBaseQualities())); + } catch (IllegalArgumentException e) { + throw new UserException.MalformedBAM(read, "illegal base quality encountered; " + e.getMessage()); + } + } + + final ReadCovariates readCovariates = RecalUtils.computeCovariates(read, requestedCovariates); + final int readLength = read.getReadLength(); + + for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings + if (disableIndelQuals && errorModel != EventType.BASE_SUBSTITUTION) { + read.setBaseQualities(null, errorModel); + continue; + } + + final byte[] quals = read.getBaseQualities(errorModel); + + // get the keyset for this base using the error model + final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel); + + // the rg key is constant over the whole read, the global deltaQ is too + final int rgKey = fullReadKeySet[0][0]; + final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal()); + + if( empiricalQualRG != null ) { + final double epsilon = ( globalQScorePrior > 0.0 && errorModel.equals(EventType.BASE_SUBSTITUTION) ? 
globalQScorePrior : empiricalQualRG.getEstimatedQReported() ); + + for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read + final byte origQual = quals[offset]; + + // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) + if ( origQual >= preserveQLessThan ) { + // get the keyset for this base using the error model + final int[] keySet = fullReadKeySet[offset]; + final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(keySet[0], keySet[1], errorModel.ordinal()); + final List empiricalQualCovs = new ArrayList(); + for (int i = 2; i < requestedCovariates.length; i++) { + if (keySet[i] < 0) { + continue; + } + empiricalQualCovs.add(recalibrationTables.getTable(i).get(keySet[0], keySet[1], keySet[i], errorModel.ordinal())); + } + + double recalibratedQualDouble = hierarchicalBayesianQualityEstimate( epsilon, empiricalQualRG, empiricalQualQS, empiricalQualCovs ); + + // recalibrated quality is bound between 1 and MAX_QUAL + final byte recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQualDouble), RecalDatum.MAX_RECALIBRATED_Q_SCORE); + + // return the quantized version of the recalibrated quality + final byte recalibratedQualityScore = quantizationInfo.getQuantizedQuals().get(recalibratedQual); + + quals[offset] = recalibratedQualityScore; + } + } + } + + // finally update the base qualities in the read + read.setBaseQualities(quals, errorModel); + } + } + + @Ensures("result > 0.0") + protected static double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List empiricalQualCovs ) { + final double globalDeltaQ = ( empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon ); + final double deltaQReported = ( empiricalQualQS == null ? 
0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon) ); + double deltaQCovariates = 0.0; + for( final RecalDatum empiricalQualCov : empiricalQualCovs ) { + deltaQCovariates += ( empiricalQualCov == null ? 0.0 : empiricalQualCov.getEmpiricalQuality(deltaQReported + globalDeltaQ + epsilon) - (deltaQReported + globalDeltaQ + epsilon) ); + } + + return epsilon + globalDeltaQ + deltaQReported + deltaQCovariates; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizer.java new file mode 100644 index 000000000..b01359fca --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizer.java @@ -0,0 +1,500 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.io.PrintStream; +import java.util.*; + +/** + * A general algorithm for quantizing quality score distributions to use a specific number of levels + * + * Takes a histogram of quality scores and a desired number of levels and produces a + * map from original quality scores -> quantized quality scores. + * + * Note that this data structure is fairly heavy-weight, holding lots of debugging and + * calculation information. 
If you want to use it efficiently at scale with lots of + * read groups the right way to do this: + * + * Map> map + * for each read group rg: + * hist = getQualHist(rg) + * QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual) + * map.set(rg, qq.getOriginalToQuantizedMap()) + * + * This map would then be used to look up the appropriate original -> quantized + * quals for each read as it comes in. + * + * @author Mark Depristo + * @since 3/2/12 + */ +public class QualQuantizer { + final private static Set MY_EMPTY_SET = Collections.emptySet(); + + private static Logger logger = Logger.getLogger(QualQuantizer.class); + + /** + * Inputs to the QualQuantizer + */ + final int nLevels, minInterestingQual; + final List nObservationsPerQual; + + /** + * Map from original qual (e.g., Q30) to new quantized qual (e.g., Q28). + * + * Has the same range as nObservationsPerQual + */ + final List originalToQuantizedMap; + + /** Sorted set of qual intervals. + * + * After quantize() this data structure contains only the top-level qual intervals + */ + final TreeSet quantizedIntervals; + + /** + * Protected creator for testng use only + */ + protected QualQuantizer(final int minInterestingQual) { + this.nObservationsPerQual = Collections.emptyList(); + this.nLevels = 0; + this.minInterestingQual = minInterestingQual; + this.quantizedIntervals = null; + this.originalToQuantizedMap = null; + } + + /** + * Creates a QualQuantizer for the histogram that has nLevels + * + * Note this is the only interface to the system. After creating this object + * the map can be obtained via getOriginalToQuantizedMap() + * + * @param nObservationsPerQual A histogram of counts of bases with quality scores. Note that + * this histogram must start at 0 (i.e., get(0) => count of Q0 bases) and must include counts all the + * way up to the largest quality score possible in the reads. OK if the histogram includes many 0 + * count bins, as these are quantized for free. 
+ * @param nLevels the desired number of distinct quality scores to represent the full original range. Must + * be at least 1. + * @param minInterestingQual All quality scores <= this value are considered uninteresting and are freely + * merged together. For example, if this value is 10, then Q0-Q10 are all considered free to merge, and + * quantized into a single value. For ILMN data with lots of Q2 bases this results in a Q2 bin containing + * all data with Q0-Q10. + */ + public QualQuantizer(final List nObservationsPerQual, final int nLevels, final int minInterestingQual) { + this.nObservationsPerQual = nObservationsPerQual; + this.nLevels = nLevels; + this.minInterestingQual = minInterestingQual; + + // some sanity checking + if ( Collections.min(nObservationsPerQual) < 0 ) throw new ReviewedGATKException("Quality score histogram has negative values at: " + Utils.join(", ", nObservationsPerQual)); + if ( nLevels < 0 ) throw new ReviewedGATKException("nLevels must be >= 0"); + if ( minInterestingQual < 0 ) throw new ReviewedGATKException("minInterestingQual must be >= 0"); + + // actually run the quantizer + this.quantizedIntervals = quantize(); + + // store the map + this.originalToQuantizedMap = intervalsToMap(quantizedIntervals); + } + + /** + * Represents an contiguous interval of quality scores. + * + * qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2 + */ + @Invariant({ + "qStart <= qEnd", + "qStart >= 0", + "qEnd <= 1000", + "nObservations >= 0", + "nErrors >= 0", + "nErrors <= nObservations", + "fixedQual >= -1 && fixedQual <= QualityUtils.MAX_SAM_QUAL_SCORE", + "mergeOrder >= 0"}) + protected final class QualInterval implements Comparable { + final int qStart, qEnd, fixedQual, level; + final long nObservations, nErrors; + final Set subIntervals; + + /** for debugging / visualization. When was this interval created? 
*/ + int mergeOrder; + + protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level) { + this(qStart, qEnd, nObservations, nErrors, level, -1, MY_EMPTY_SET); + } + + protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final Set subIntervals) { + this(qStart, qEnd, nObservations, nErrors, level, -1, subIntervals); + } + + protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual) { + this(qStart, qEnd, nObservations, nErrors, level, fixedQual, MY_EMPTY_SET); + } + + @Requires("level >= 0") + public QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual, final Set subIntervals) { + this.qStart = qStart; + this.qEnd = qEnd; + this.nObservations = nObservations; + this.nErrors = nErrors; + this.fixedQual = fixedQual; + this.level = level; + this.mergeOrder = 0; + this.subIntervals = Collections.unmodifiableSet(subIntervals); + } + + /** + * @return Human readable name of this interval: e.g., 10-12 + */ + public String getName() { + return qStart + "-" + qEnd; + } + + @Override + public String toString() { + return "QQ:" + getName(); + } + + /** + * @return the error rate (in real space) of this interval, or 0 if there are no observations + */ + @Ensures("result >= 0.0") + public double getErrorRate() { + if ( hasFixedQual() ) + return QualityUtils.qualToErrorProb((byte)fixedQual); + else if ( nObservations == 0 ) + return 0.0; + else + return (nErrors+1) / (1.0 * (nObservations+1)); + } + + /** + * @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual. + */ + @Ensures("result >= 0 && result <= QualityUtils.MAX_SAM_QUAL_SCORE") + public byte getQual() { + if ( ! 
hasFixedQual() ) + return QualityUtils.errorProbToQual(getErrorRate()); + else + return (byte)fixedQual; + } + + /** + * @return true if this bin is using a fixed qual + */ + public boolean hasFixedQual() { + return fixedQual != -1; + } + + @Override + public int compareTo(final QualInterval qualInterval) { + return Integer.valueOf(this.qStart).compareTo(qualInterval.qStart); + } + + /** + * Create a interval representing the merge of this interval and toMerge + * + * Errors and observations are combined + * Subintervals updated in order of left to right (determined by qStart) + * Level is 1 + highest level of this and toMerge + * Order must be updated elsewhere + * + * @param toMerge + * @return newly created merged QualInterval + */ + @Requires({"toMerge != null"}) + @Ensures({ + "result != null", + "result.nObservations >= this.nObservations", + "result.nObservations >= toMerge.nObservations", + "result.nErrors >= this.nErrors", + "result.nErrors >= toMerge.nErrors", + "result.qStart == Math.min(this.qStart, toMerge.qStart)", + "result.qEnd == Math.max(this.qEnd, toMerge.qEnd)", + "result.level > Math.max(this.level, toMerge.level)", + "result.subIntervals.size() == 2" + }) + public QualInterval merge(final QualInterval toMerge) { + final QualInterval left = this.compareTo(toMerge) < 0 ? this : toMerge; + final QualInterval right = this.compareTo(toMerge) < 0 ? 
toMerge : this; + + if ( left.qEnd + 1 != right.qStart ) + throw new ReviewedGATKException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right); + + final long nCombinedObs = left.nObservations + right.nObservations; + final long nCombinedErr = left.nErrors + right.nErrors; + + final int level = Math.max(left.level, right.level) + 1; + final Set subIntervals = new HashSet(Arrays.asList(left, right)); + QualInterval merged = new QualInterval(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals); + + return merged; + } + + public double getPenalty() { + return calcPenalty(getErrorRate()); + } + + + /** + * Calculate the penalty of this interval, given the overall error rate for the interval + * + * If the globalErrorRate is e, this value is: + * + * sum_i |log10(e_i) - log10(e)| * nObservations_i + * + * each the index i applies to all leaves of the tree accessible from this interval + * (found recursively from subIntervals as necessary) + * + * @param globalErrorRate overall error rate in real space against which we calculate the penalty + * @return the cost of approximating the bins in this interval with the globalErrorRate + */ + @Requires("globalErrorRate >= 0.0") + @Ensures("result >= 0.0") + private double calcPenalty(final double globalErrorRate) { + if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty + return 0.0; + + if ( subIntervals.isEmpty() ) { + // this is leave node + if ( this.qEnd <= minInterestingQual ) + // It's free to merge up quality scores below the smallest interesting one + return 0; + else { + return (Math.abs(Math.log10(getErrorRate()) - Math.log10(globalErrorRate))) * nObservations; + } + } else { + double sum = 0; + for ( final QualInterval interval : subIntervals ) + sum += interval.calcPenalty(globalErrorRate); + return sum; + } + } + } + + /** + * Main method for computing the quantization intervals. 
+ * + * Invoked in the constructor after all input variables are initialized. Walks + * over the inputs and builds the min. penalty forest of intervals with exactly nLevel + * root nodes. Finds this min. penalty forest via greedy search, so is not guarenteed + * to find the optimal combination. + * + * TODO: develop a smarter algorithm + * + * @return the forest of intervals with size == nLevels + */ + @Ensures({"! result.isEmpty()", "result.size() == nLevels"}) + private TreeSet quantize() { + // create intervals for each qual individually + final TreeSet intervals = new TreeSet(); + for ( int qStart = 0; qStart < getNQualsInHistogram(); qStart++ ) { + final long nObs = nObservationsPerQual.get(qStart); + final double errorRate = QualityUtils.qualToErrorProb((byte)qStart); + final double nErrors = nObs * errorRate; + final QualInterval qi = new QualInterval(qStart, qStart, nObs, (int)Math.floor(nErrors), 0, (byte)qStart); + intervals.add(qi); + } + + // greedy algorithm: + // while ( n intervals >= nLevels ): + // find intervals to merge with least penalty + // merge it + while ( intervals.size() > nLevels ) { + mergeLowestPenaltyIntervals(intervals); + } + + return intervals; + } + + /** + * Helper function that finds and merges together the lowest penalty pair of intervals + * @param intervals + */ + @Requires("! 
intervals.isEmpty()") + private void mergeLowestPenaltyIntervals(final TreeSet intervals) { + // setup the iterators + final Iterator it1 = intervals.iterator(); + final Iterator it1p = intervals.iterator(); + it1p.next(); // skip one + + // walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty + QualInterval minMerge = null; + if ( logger.isDebugEnabled() ) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size()); + int lastMergeOrder = 0; + while ( it1p.hasNext() ) { + final QualInterval left = it1.next(); + final QualInterval right = it1p.next(); + final QualInterval merged = left.merge(right); + lastMergeOrder = Math.max(Math.max(lastMergeOrder, left.mergeOrder), right.mergeOrder); + if ( minMerge == null || (merged.getPenalty() < minMerge.getPenalty() ) ) { + if ( logger.isDebugEnabled() ) logger.debug(" Updating merge " + minMerge); + minMerge = merged; + } + } + + // now actually go ahead and merge the minMerge pair + if ( logger.isDebugEnabled() ) logger.debug(" => final min merge " + minMerge); + intervals.removeAll(minMerge.subIntervals); + intervals.add(minMerge); + minMerge.mergeOrder = lastMergeOrder + 1; + if ( logger.isDebugEnabled() ) logger.debug("updated intervals: " + intervals); + } + + /** + * Given a final forest of intervals constructs a list mapping + * list.get(i) => quantized qual to use for original quality score i + * + * This function should be called only once to initialize the corresponding + * cached value in this object, as the calculation is a bit costly. 
+ * + * @param intervals + * @return + */ + @Ensures("result.size() == getNQualsInHistogram()") + private List intervalsToMap(final TreeSet intervals) { + final List map = new ArrayList(getNQualsInHistogram()); + map.addAll(Collections.nCopies(getNQualsInHistogram(), Byte.MIN_VALUE)); + for ( final QualInterval interval : intervals ) { + for ( int q = interval.qStart; q <= interval.qEnd; q++ ) { + map.set(q, interval.getQual()); + } + } + + if ( Collections.min(map) == Byte.MIN_VALUE ) + throw new ReviewedGATKException("quantized quality score map contains an un-initialized value"); + + return map; + } + + @Ensures("result > 0") + private final int getNQualsInHistogram() { + return nObservationsPerQual.size(); + } + + /** + * Write out a GATKReport to visualize the QualQuantization process of this data + * @param out + */ + public void writeReport(PrintStream out) { + final GATKReport report = new GATKReport(); + + addQualHistogramToReport(report); + addIntervalsToReport(report); + + report.print(out); + } + + private final void addQualHistogramToReport(final GATKReport report) { + report.addTable("QualHistogram", "Quality score histogram provided to report", 2); + GATKReportTable table = report.getTable("QualHistogram"); + + table.addColumn("qual"); + table.addColumn("count"); + + for ( int q = 0; q < nObservationsPerQual.size(); q++ ) { + table.set(q, "qual", q); + table.set(q, "count", nObservationsPerQual.get(q)); + } + } + + + private final void addIntervalsToReport(final GATKReport report) { + report.addTable("QualQuantizerIntervals", "Table of QualQuantizer quantization intervals", 10); + GATKReportTable table = report.getTable("QualQuantizerIntervals"); + + table.addColumn("name"); + table.addColumn("qStart"); + table.addColumn("qEnd"); + table.addColumn("level"); + table.addColumn("merge.order"); + table.addColumn("nErrors"); + table.addColumn("nObservations"); + table.addColumn("qual"); + table.addColumn("penalty"); + table.addColumn("root.node"); + 
//table.addColumn("subintervals", "NA"); + + for ( QualInterval interval : quantizedIntervals ) + addIntervalToReport(table, interval, true); + } + + private final void addIntervalToReport(final GATKReportTable table, final QualInterval interval, final boolean atRootP) { + final String name = interval.getName(); + table.set(name, "name", name); + table.set(name, "qStart", interval.qStart); + table.set(name, "qEnd", interval.qEnd); + table.set(name, "level", interval.level); + table.set(name, "merge.order", interval.mergeOrder); + table.set(name, "nErrors", interval.nErrors); + table.set(name, "nObservations", interval.nObservations); + table.set(name, "qual", interval.getQual()); + table.set(name, "penalty", String.format("%.1f", interval.getPenalty())); + table.set(name, "root.node", atRootP); + + for ( final QualInterval sub : interval.subIntervals ) + addIntervalToReport(table, sub, false); + } + + public List getOriginalToQuantizedMap() { + return originalToQuantizedMap; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QuantizationInfo.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QuantizationInfo.java new file mode 100644 index 000000000..e054805af --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/QuantizationInfo.java @@ -0,0 +1,151 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; + +import java.util.Arrays; +import java.util.List; + +/** + * Class that encapsulates the information necessary for quality score quantization for BQSR + * + * @author carneiro + * @since 3/26/12 + */ +public class QuantizationInfo { + private List quantizedQuals; + private List empiricalQualCounts; + private int quantizationLevels; + + private QuantizationInfo(List quantizedQuals, List empiricalQualCounts, int quantizationLevels) { + this.quantizedQuals = quantizedQuals; + this.empiricalQualCounts = empiricalQualCounts; + this.quantizationLevels = quantizationLevels; + } + + public QuantizationInfo(List quantizedQuals, List empiricalQualCounts) { + this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals)); + } + + public QuantizationInfo(final RecalibrationTables recalibrationTables, final int quantizationLevels) { + final Long [] qualHistogram = new Long[QualityUtils.MAX_SAM_QUAL_SCORE +1]; // create a histogram with the empirical quality distribution + for (int i = 0; i < qualHistogram.length; i++) + qualHistogram[i] = 0L; + + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); // get the quality score table + + for (final RecalDatum value : qualTable.getAllValues()) { + final RecalDatum datum = value; + final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) + qualHistogram[empiricalQual] += (long) datum.getNumObservations(); // add the number of observations for every key + } + empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities + 
quantizeQualityScores(quantizationLevels); + + this.quantizationLevels = quantizationLevels; + } + + + public void quantizeQualityScores(int nLevels) { + QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels + quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC) + } + + public void noQuantization() { + this.quantizationLevels = QualityUtils.MAX_SAM_QUAL_SCORE; + for (int i = 0; i < this.quantizationLevels; i++) + quantizedQuals.set(i, (byte) i); + } + + public List getQuantizedQuals() { + return quantizedQuals; + } + + public int getQuantizationLevels() { + return quantizationLevels; + } + + public GATKReportTable generateReportTable(boolean sortByCols) { + GATKReportTable quantizedTable; + if(sortByCols) { + quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); + } else { + quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3); + } + quantizedTable.addColumn(RecalUtils.QUALITY_SCORE_COLUMN_NAME); + quantizedTable.addColumn(RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); + quantizedTable.addColumn(RecalUtils.QUANTIZED_VALUE_COLUMN_NAME); + + for (int qual = 0; qual <= QualityUtils.MAX_SAM_QUAL_SCORE; qual++) { + quantizedTable.set(qual, RecalUtils.QUALITY_SCORE_COLUMN_NAME, qual); + quantizedTable.set(qual, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual)); + quantizedTable.set(qual, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual)); + } + return quantizedTable; + } + + private static int calculateQuantizationLevels(List quantizedQuals) { + byte lastByte = -1; + int quantizationLevels = 0; + for (byte q : quantizedQuals) { + if (q != lastByte) { + quantizationLevels++; + 
lastByte = q; + } + } + return quantizationLevels; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariates.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariates.java new file mode 100644 index 000000000..c02dd4881 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariates.java @@ -0,0 +1,176 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.LRUCache; +import org.broadinstitute.gatk.utils.recalibration.EventType; + +/** + * The object temporarily held by a read that describes all of it's covariates. + * + * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap + * + * @author Mauricio Carneiro + * @since 2/8/12 + */ +public class ReadCovariates { + private final static Logger logger = Logger.getLogger(ReadCovariates.class); + + /** + * How big should we let the LRU cache grow + */ + private static final int LRU_CACHE_SIZE = 500; + + /** + * Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen. + * The cache allows us to avoid the expense of recreating these arrays for every read. The LRU + * keeps the total number of cached arrays to less than LRU_CACHE_SIZE. 
+ * + * This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE + */ + private final static ThreadLocal> keysCache = new ThreadLocal>() { + @Override protected LRUCache initialValue() { + return new LRUCache(LRU_CACHE_SIZE); + } + }; + + /** + * The keys cache is only valid for a single covariate count. Normally this will remain constant for the analysis. + * If running multiple analyses (or the unit test suite), it's necessary to clear the cache. + */ + public static void clearKeysCache() { + keysCache.remove(); + } + + /** + * Our keys, indexed by event type x read length x covariate + */ + private final int[][][] keys; + + /** + * The index of the current covariate, used by addCovariate + */ + private int currentCovariateIndex = 0; + + public ReadCovariates(final int readLength, final int numberOfCovariates) { + final LRUCache cache = keysCache.get(); + final int[][][] cachedKeys = cache.get(readLength); + if ( cachedKeys == null ) { + // There's no cached value for read length so we need to create a new int[][][] array + if ( logger.isDebugEnabled() ) logger.debug("Keys cache miss for length " + readLength + " cache size " + cache.size()); + keys = new int[EventType.values().length][readLength][numberOfCovariates]; + cache.put(readLength, keys); + } else { + keys = cachedKeys; + } + } + + public void setCovariateIndex(final int index) { + currentCovariateIndex = index; + } + + /** + * Update the keys for mismatch, insertion, and deletion for the current covariate at read offset + * + * NOTE: no checks are performed on the number of covariates, for performance reasons. If the count increases + * after the keysCache has been accessed, this method will throw an ArrayIndexOutOfBoundsException. This currently + * only occurs in the testing harness, and we don't anticipate that it will become a part of normal runs. 
+ * + * @param mismatch the mismatch key value + * @param insertion the insertion key value + * @param deletion the deletion key value + * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates + */ + public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { + keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch; + keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion; + keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion; + } + + /** + * Get the keys for all covariates at read position for error model + * + * @param readPosition + * @param errorModel + * @return + */ + public int[] getKeySet(final int readPosition, final EventType errorModel) { + return keys[errorModel.ordinal()][readPosition]; + } + + public int[][] getKeySet(final EventType errorModel) { + return keys[errorModel.ordinal()]; + } + + // ---------------------------------------------------------------------- + // + // routines for testing + // + // ---------------------------------------------------------------------- + + protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); } + protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); } + protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); } + + protected int[] getMismatchesKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); + } + + protected int[] getInsertionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_INSERTION); + } + + protected int[] getDeletionsKeySet(final int readPosition) { + return getKeySet(readPosition, EventType.BASE_DELETION); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatum.java 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatum.java new file mode 100644 index 000000000..c92ef1773 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatum.java @@ -0,0 +1,434 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import htsjdk.samtools.SAMUtils; +import org.apache.commons.math.optimization.fitting.GaussianFunction; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.QualityUtils; + + +/** + * An individual piece of recalibration data. Each bin counts up the number of observations and the number + * of reference mismatches seen for that combination of covariates. + * + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 3, 2009 + */ +@Invariant({ + "estimatedQReported >= 0.0", + "! Double.isNaN(estimatedQReported)", + "! Double.isInfinite(estimatedQReported)", + "empiricalQuality >= 0.0 || empiricalQuality == UNINITIALIZED", + "! Double.isNaN(empiricalQuality)", + "! Double.isInfinite(empiricalQuality)", + "numObservations >= 0", + "numMismatches >= 0", + "numMismatches <= numObservations" +}) +public class RecalDatum { + public final static byte MAX_RECALIBRATED_Q_SCORE = SAMUtils.MAX_PHRED_SCORE; + private static final double UNINITIALIZED = -1.0; + + /** + * estimated reported quality score based on combined data's individual q-reporteds and number of observations + */ + private double estimatedQReported; + + /** + * the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) + */ + private double empiricalQuality; + + /** + * number of bases seen in total + */ + private long numObservations; + + /** + * number of bases seen that didn't match the reference + */ + private double numMismatches; + + /** + * used when calculating empirical qualities to avoid division by zero + */ + private static final int SMOOTHING_CONSTANT = 1; + + //--------------------------------------------------------------------------------------------------------------- + // + // constructors + // + 
//--------------------------------------------------------------------------------------------------------------- + + /** + * Create a new RecalDatum with given observation and mismatch counts, and an reported quality + * + * @param _numObservations observations + * @param _numMismatches mismatches + * @param reportedQuality Qreported + */ + public RecalDatum(final long _numObservations, final double _numMismatches, final byte reportedQuality) { + if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); + if ( _numMismatches < 0.0 ) throw new IllegalArgumentException("numMismatches < 0"); + if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0"); + + numObservations = _numObservations; + numMismatches = _numMismatches; + estimatedQReported = reportedQuality; + empiricalQuality = UNINITIALIZED; + } + + /** + * Copy copy into this recal datum, overwriting all of this objects data + * @param copy RecalDatum to copy + */ + public RecalDatum(final RecalDatum copy) { + this.numObservations = copy.getNumObservations(); + this.numMismatches = copy.getNumMismatches(); + this.estimatedQReported = copy.estimatedQReported; + this.empiricalQuality = copy.empiricalQuality; + } + + /** + * Add in all of the data from other into this object, updating the reported quality from the expected + * error rate implied by the two reported qualities + * + * @param other RecalDatum to combine + */ + public synchronized void combine(final RecalDatum other) { + final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); + increment(other.getNumObservations(), other.getNumMismatches()); + estimatedQReported = -10 * Math.log10(sumErrors / getNumObservations()); + empiricalQuality = UNINITIALIZED; + } + + public synchronized void setEstimatedQReported(final double estimatedQReported) { + if ( estimatedQReported < 0 ) throw new IllegalArgumentException("estimatedQReported < 0"); + if ( 
Double.isInfinite(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is infinite"); + if ( Double.isNaN(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is NaN"); + + this.estimatedQReported = estimatedQReported; + empiricalQuality = UNINITIALIZED; + } + + public final double getEstimatedQReported() { + return estimatedQReported; + } + public final byte getEstimatedQReportedAsByte() { + return (byte)(int)(Math.round(getEstimatedQReported())); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // Empirical quality score -- derived from the num mismatches and observations + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Returns the error rate (in real space) of this interval, or 0 if there are no observations + * @return the empirical error rate ~= N errors / N obs + */ + @Ensures({"result >= 0.0"}) + public double getEmpiricalErrorRate() { + if ( numObservations == 0 ) + return 0.0; + else { + // cache the value so we don't call log over and over again + final double doubleMismatches = numMismatches + SMOOTHING_CONSTANT; + // smoothing is one error and one non-error observation, for example + final double doubleObservations = numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; + return doubleMismatches / doubleObservations; + } + } + + public synchronized void setEmpiricalQuality(final double empiricalQuality) { + if ( empiricalQuality < 0 ) throw new IllegalArgumentException("empiricalQuality < 0"); + if ( Double.isInfinite(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is infinite"); + if ( Double.isNaN(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is NaN"); + + this.empiricalQuality = empiricalQuality; + } + + public final double getEmpiricalQuality() { + return 
getEmpiricalQuality(getEstimatedQReported()); + } + + public synchronized final double getEmpiricalQuality(final double conditionalPrior) { + if (empiricalQuality == UNINITIALIZED) { + calcEmpiricalQuality(conditionalPrior); + } + return empiricalQuality; + } + + public final byte getEmpiricalQualityAsByte() { + return (byte)(Math.round(getEmpiricalQuality())); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // toString methods + // + //--------------------------------------------------------------------------------------------------------------- + + @Override + public String toString() { + return String.format("%d,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); + } + + public String stringForCSV() { + return String.format("%s,%.2f,%.2f", toString(), getEstimatedQReported(), getEmpiricalQuality() - getEstimatedQReported()); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // increment methods + // + //--------------------------------------------------------------------------------------------------------------- + + public final long getNumObservations() { + return numObservations; + } + + public final synchronized void setNumObservations(final long numObservations) { + if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); + this.numObservations = numObservations; + empiricalQuality = UNINITIALIZED; + } + + public final double getNumMismatches() { + return numMismatches; + } + + @Requires({"numMismatches >= 0"}) + public final synchronized void setNumMismatches(final double numMismatches) { + if ( numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); + this.numMismatches = numMismatches; + empiricalQuality = UNINITIALIZED; + } + + @Requires({"by >= 0"}) + public final synchronized void incrementNumObservations(final long by) 
{ + numObservations += by; + empiricalQuality = UNINITIALIZED; + } + + @Requires({"by >= 0"}) + public final synchronized void incrementNumMismatches(final double by) { + numMismatches += by; + empiricalQuality = UNINITIALIZED; + } + + @Requires({"incObservations >= 0", "incMismatches >= 0"}) + @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"}) + public final synchronized void increment(final long incObservations, final double incMismatches) { + numObservations += incObservations; + numMismatches += incMismatches; + empiricalQuality = UNINITIALIZED; + } + + @Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"}) + public final synchronized void increment(final boolean isError) { + increment(1, isError ? 1.0 : 0.0); + } + + // ------------------------------------------------------------------------------------- + // + // Private implementation helper functions + // + // ------------------------------------------------------------------------------------- + + /** + * calculate the expected number of errors given the estimated Q reported and the number of observations + * in this datum. 
+ * + * @return a positive (potentially fractional) estimate of the number of errors + */ + @Ensures("result >= 0.0") + private double calcExpectedErrors() { + return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); + } + + /** + * Calculate and cache the empirical quality score from mismatches and observations (expensive operation) + */ + @Requires("empiricalQuality == UNINITIALIZED") + @Ensures("empiricalQuality != UNINITIALIZED") + private synchronized void calcEmpiricalQuality(final double conditionalPrior) { + + // smoothing is one error and one non-error observation + final long mismatches = (long)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT; + final long observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; + + final double empiricalQual = RecalDatum.bayesianEstimateOfEmpiricalQuality(observations, mismatches, conditionalPrior); + + // This is the old and busted point estimate approach: + //final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); + + empiricalQuality = Math.min(empiricalQual, (double) MAX_RECALIBRATED_Q_SCORE); + } + + //static final boolean DEBUG = false; + static private final double RESOLUTION_BINS_PER_QUAL = 1.0; + + static public double bayesianEstimateOfEmpiricalQuality(final long nObservations, final long nErrors, final double QReported) { + + final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL; + + final double[] log10Posteriors = new double[numBins]; + + for ( int bin = 0; bin < numBins; bin++ ) { + + final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL; + + log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10QempLikelihood(QEmpOfBin, nObservations, nErrors); + + //if ( DEBUG ) + // System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); + } + + //if ( DEBUG ) + // 
System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors)); + + final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors); + final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors); + + final double Qemp = MLEbin / RESOLUTION_BINS_PER_QUAL; + return Qemp; + } + + /** + * Quals above this value should be capped down to this value (because they are too high) + * in the base quality score recalibrator + */ + public final static byte MAX_GATK_USABLE_Q_SCORE = 40; + static private final double[] log10QempPriorCache = new double[MAX_GATK_USABLE_Q_SCORE + 1]; + static { + // f(x) = a + b*exp(-((x - c)^2 / (2*d^2))) + // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell". + final double GF_a = 0.0; + final double GF_b = 0.9; + final double GF_c = 0.0; + final double GF_d = 0.5; // with these parameters, deltas can shift at most ~20 Q points + + final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d); + for ( int i = 0; i <= MAX_GATK_USABLE_Q_SCORE; i++ ) { + double log10Prior = Math.log10(gaussian.value((double) i)); + if ( Double.isInfinite(log10Prior) ) + log10Prior = -Double.MAX_VALUE; + log10QempPriorCache[i] = log10Prior; + } + } + + static protected double log10QempPrior(final double Qempirical, final double Qreported) { + final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), MAX_GATK_USABLE_Q_SCORE); + //if ( DEBUG ) + // System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); + return log10QempPriorCache[difference]; + } + + static private final long MAX_NUMBER_OF_OBSERVATIONS = Integer.MAX_VALUE - 1; + + static protected double log10QempLikelihood(final double Qempirical, long nObservations, long nErrors) { + if ( nObservations == 0 ) + return 0.0; + + // the binomial code requires ints as 
input (because it does caching). This should theoretically be fine because + // there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow + // before casting down to an int. + if ( nObservations > MAX_NUMBER_OF_OBSERVATIONS ) { + // we need to decrease nErrors by the same fraction that we are decreasing nObservations + final double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations; + nErrors = Math.round((double)nErrors * fraction); + nObservations = MAX_NUMBER_OF_OBSERVATIONS; + } + + // this is just a straight binomial PDF + double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10(Qempirical)); + if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) ) + log10Prob = -Double.MAX_VALUE; + + //if ( DEBUG ) + // System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); + + return log10Prob; + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumNode.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumNode.java new file mode 100644 index 000000000..14b4c762b --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumNode.java @@ -0,0 +1,582 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.commons.math.MathException; +import org.apache.commons.math.stat.inference.ChiSquareTestImpl; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.Collection; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.Set; + +/** + * A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one + * + * @author Mark DePristo + * @since 07/27/12 + */ +public class RecalDatumNode { + private final static double SMALLEST_CHI2_PVALUE = 1e-300; + protected static final Logger logger = Logger.getLogger(RecalDatumNode.class); + + /** + * fixedPenalty is this value if it's considered fixed + */ + private final static double UNINITIALIZED = Double.NEGATIVE_INFINITY; + + private final T recalDatum; + private double fixedPenalty = UNINITIALIZED; + private final Set> subnodes; + + @Requires({"recalDatum != null"}) + public RecalDatumNode(final T recalDatum) { + this(recalDatum, new HashSet>()); + } + + @Override + public String toString() { + return recalDatum.toString(); + } + + @Requires({"recalDatum != null", "subnodes != null"}) + public RecalDatumNode(final T recalDatum, final Set> subnodes) { + this(recalDatum, UNINITIALIZED, subnodes); + } + + @Requires({"recalDatum != null"}) + protected RecalDatumNode(final T recalDatum, final double fixedPenalty) { + this(recalDatum, fixedPenalty, new HashSet>()); + } + + @Requires({"recalDatum != null", "subnodes != null"}) + protected RecalDatumNode(final T recalDatum, final double fixedPenalty, final Set> subnodes) { + this.recalDatum = recalDatum; + this.fixedPenalty = fixedPenalty; + this.subnodes = new HashSet>(subnodes); + } + + /** + * Get the recal data associated with this 
node + * @return + */ + @Ensures("result != null") + public T getRecalDatum() { + return recalDatum; + } + + /** + * The set of all subnodes of this tree. May be modified. + * @return + */ + @Ensures("result != null") + public Set> getSubnodes() { + return subnodes; + } + + /** + * Return the fixed penalty, if set, or else the the calculated penalty for this node + * @return + */ + public double getPenalty() { + if ( fixedPenalty != UNINITIALIZED ) + return fixedPenalty; + else + return calcPenalty(); + } + + /** + * Set the fixed penalty for this node to a fresh calculation from calcPenalty + * + * This is important in the case where you want to compute the penalty from a full + * tree and then chop the tree up afterwards while considering the previous penalties. + * If you don't call this function then manipulating the tree may result in the + * penalty functions changing with changes in the tree. + * + * @param doEntireTree recurse into all subnodes? + * @return the fixed penalty for this node + */ + public double calcAndSetFixedPenalty(final boolean doEntireTree) { + fixedPenalty = calcPenalty(); + if ( doEntireTree ) + for ( final RecalDatumNode sub : subnodes ) + sub.calcAndSetFixedPenalty(doEntireTree); + return fixedPenalty; + } + + /** + * Add node to the set of subnodes of this node + * @param sub + */ + @Requires("sub != null") + public void addSubnode(final RecalDatumNode sub) { + subnodes.add(sub); + } + + /** + * Is this a leaf node (i.e., has no subnodes)? + * @return + */ + public boolean isLeaf() { + return subnodes.isEmpty(); + } + + /** + * Is this node immediately above only leaf nodes? + * + * @return + */ + public boolean isAboveOnlyLeaves() { + for ( final RecalDatumNode sub : subnodes ) + if ( ! sub.isLeaf() ) + return false; + return true; + } + + /** + * What's the immediate number of subnodes from this node? 
+ * @return + */ + @Ensures("result >= 0") + public int getNumSubnodes() { + return subnodes.size(); + } + + /** + * Total penalty is the sum of leaf node penalties + * + * This algorithm assumes that penalties have been fixed before pruning, as leaf nodes by + * definition have 0 penalty unless they represent a pruned tree with underlying -- but now + * pruned -- subtrees + * + * @return + */ + public double totalPenalty() { + if ( isLeaf() ) + return getPenalty(); + else { + double sum = 0.0; + for ( final RecalDatumNode sub : subnodes ) + sum += sub.totalPenalty(); + return sum; + } + } + + /** + * The maximum penalty among all nodes + * @return + */ + public double maxPenalty(final boolean leafOnly) { + double max = ! leafOnly || isLeaf() ? getPenalty() : Double.MIN_VALUE; + for ( final RecalDatumNode sub : subnodes ) + max = Math.max(max, sub.maxPenalty(leafOnly)); + return max; + } + + /** + * The minimum penalty among all nodes + * @return + */ + public double minPenalty(final boolean leafOnly) { + double min = ! leafOnly || isLeaf() ? getPenalty() : Double.MAX_VALUE; + for ( final RecalDatumNode sub : subnodes ) + min = Math.min(min, sub.minPenalty(leafOnly)); + return min; + } + + /** + * What's the longest branch from this node to any leaf? + * @return + */ + public int maxDepth() { + int subMax = 0; + for ( final RecalDatumNode sub : subnodes ) + subMax = Math.max(subMax, sub.maxDepth()); + return subMax + 1; + } + + /** + * What's the shortest branch from this node to any leaf? 
Includes this node + * @return + */ + @Ensures("result > 0") + public int minDepth() { + if ( isLeaf() ) + return 1; + else { + int subMin = Integer.MAX_VALUE; + for ( final RecalDatumNode sub : subnodes ) + subMin = Math.min(subMin, sub.minDepth()); + return subMin + 1; + } + } + + /** + * Return the number of nodes, including this one, reachable from this node + * @return + */ + @Ensures("result > 0") + public int size() { + int size = 1; + for ( final RecalDatumNode sub : subnodes ) + size += sub.size(); + return size; + } + + /** + * Count the number of leaf nodes reachable from this node + * + * @return + */ + @Ensures("result >= 0") + public int numLeaves() { + if ( isLeaf() ) + return 1; + else { + int size = 0; + for ( final RecalDatumNode sub : subnodes ) + size += sub.numLeaves(); + return size; + } + } + + /** + * Calculate the phred-scaled p-value for a chi^2 test for independent among subnodes of this node. + * + * The chi^2 value indicates the degree of independence of the implied error rates among the + * immediate subnodes + * + * @return the phred-scaled p-value for chi2 penalty, or 0.0 if it cannot be calculated + */ + private double calcPenalty() { + if ( isLeaf() || freeToMerge() ) + return 0.0; + else if ( subnodes.size() == 1 ) + // only one value, so its free to merge away + return 0.0; + else { + final long[][] counts = new long[subnodes.size()][2]; + + int i = 0; + for ( final RecalDatumNode subnode : subnodes ) { + // use the yates correction to help avoid all zeros => NaN + counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L; + counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2L; + i++; + } + + try { + final double chi2PValue = new ChiSquareTestImpl().chiSquareTest(counts); + final double penalty = -10.0 * Math.log10(Math.max(chi2PValue, SMALLEST_CHI2_PVALUE)); + + // make sure things are reasonable and fail early if not + if (Double.isInfinite(penalty) || Double.isNaN(penalty)) + throw new 
ReviewedGATKException("chi2 value is " + chi2PValue + " at " + getRecalDatum()); + + return penalty; + } catch ( MathException e ) { + throw new ReviewedGATKException("Failed in calculating chi2 value", e); + } + } + } + + /** + * Is this node free to merge because its rounded Q score is the same as all nodes below + * @return + */ + private boolean freeToMerge() { + if ( isLeaf() ) // leaves are free to merge + return true; + else { + final byte myQual = getRecalDatum().getEmpiricalQualityAsByte(); + for ( final RecalDatumNode sub : subnodes ) + if ( sub.getRecalDatum().getEmpiricalQualityAsByte() != myQual ) + return false; + return true; + } + } + + /** + * Calculate the penalty of this interval, given the overall error rate for the interval + * + * If the globalErrorRate is e, this value is: + * + * sum_i |log10(e_i) - log10(e)| * nObservations_i + * + * each the index i applies to all leaves of the tree accessible from this interval + * (found recursively from subnodes as necessary) + * + * @param globalErrorRate overall error rate in real space against which we calculate the penalty + * @return the cost of approximating the bins in this interval with the globalErrorRate + */ + @Requires("globalErrorRate >= 0.0") + @Ensures("result >= 0.0") + private double calcPenaltyLog10(final double globalErrorRate) { + if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty + return 0.0; + + if ( isLeaf() ) { + // this is leave node + return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * (double)recalDatum.getNumObservations(); + // TODO -- how we can generalize this calculation? 
+// if ( this.qEnd <= minInterestingQual ) +// // It's free to merge up quality scores below the smallest interesting one +// return 0; +// else { +// return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations(); +// } + } else { + double sum = 0; + for ( final RecalDatumNode hrd : subnodes) + sum += hrd.calcPenaltyLog10(globalErrorRate); + return sum; + } + } + + /** + * Return a freshly allocated tree prunes to have no more than maxDepth from the root to any leaf + * + * @param maxDepth + * @return + */ + public RecalDatumNode pruneToDepth(final int maxDepth) { + if ( maxDepth < 1 ) + throw new IllegalArgumentException("maxDepth < 1"); + else { + final Set> subPruned = new HashSet>(getNumSubnodes()); + if ( maxDepth > 1 ) + for ( final RecalDatumNode sub : subnodes ) + subPruned.add(sub.pruneToDepth(maxDepth - 1)); + return new RecalDatumNode(getRecalDatum(), fixedPenalty, subPruned); + } + } + + /** + * Return a freshly allocated tree with to no more than maxElements in order of penalty + * + * Note that nodes must have fixed penalties to this algorithm will fail. + * + * @param maxElements + * @return + */ + public RecalDatumNode pruneByPenalty(final int maxElements) { + RecalDatumNode root = this; + + while ( root.size() > maxElements ) { + // remove the lowest penalty element, and continue + root = root.removeLowestPenaltyNode(); + } + + // our size is below the target, so we are good, return + return root; + } + + /** + * Return a freshly allocated tree where all mergable nodes with < maxPenalty are merged + * + * Note that nodes must have fixed penalties to this algorithm will fail. + * + * @param maxPenaltyIn the maximum penalty we are allowed to incur for a merge + * @param applyBonferroniCorrection if true, we will adjust penalty by the phred-scaled bonferroni correction + * for the size of the initial tree. 
That is, if there are 10 nodes in the + * tree and maxPenalty is 20 we will actually enforce 10^-2 / 10 = 10^-3 = 30 + * penalty for multiple testing + * @return + */ + public RecalDatumNode pruneToNoMoreThanPenalty(final double maxPenaltyIn, final boolean applyBonferroniCorrection) { + RecalDatumNode root = this; + + final double bonferroniCorrection = 10 * Math.log10(this.size()); + final double maxPenalty = applyBonferroniCorrection ? maxPenaltyIn + bonferroniCorrection : maxPenaltyIn; + + if ( applyBonferroniCorrection ) + logger.info(String.format("Applying Bonferroni correction for %d nodes = %.2f to initial penalty %.2f for total " + + "corrected max penalty of %.2f", this.size(), bonferroniCorrection, maxPenaltyIn, maxPenalty)); + + while ( true ) { + final Pair, Double> minPenaltyNode = root.getMinPenaltyAboveLeafNode(); + + if ( minPenaltyNode == null || minPenaltyNode.getSecond() > maxPenalty ) { + // nothing to merge, or the best candidate is above our max allowed + if ( minPenaltyNode == null ) { + if ( logger.isDebugEnabled() ) logger.debug("Stopping because no candidates could be found"); + } else { + if ( logger.isDebugEnabled() ) logger.debug("Stopping because node " + minPenaltyNode.getFirst() + " has penalty " + minPenaltyNode.getSecond() + " > max " + maxPenalty); + } + break; + } else { + // remove the lowest penalty element, and continue + if ( logger.isDebugEnabled() ) logger.debug("Removing node " + minPenaltyNode.getFirst() + " with penalty " + minPenaltyNode.getSecond()); + root = root.removeLowestPenaltyNode(); + } + } + + // no more candidates exist with penalty < maxPenalty + return root; + } + + + /** + * Find the lowest penalty above leaf node in the tree, and return a tree without it + * + * Note this excludes the current (root) node + * + * @return + */ + private RecalDatumNode removeLowestPenaltyNode() { + final Pair, Double> nodeToRemove = getMinPenaltyAboveLeafNode(); + if ( logger.isDebugEnabled() ) + logger.debug("Removing " + 
nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond()); + + final Pair, Boolean> result = removeNode(nodeToRemove.getFirst()); + + if ( ! result.getSecond() ) + throw new IllegalStateException("Never removed any node!"); + + final RecalDatumNode oneRemoved = result.getFirst(); + if ( oneRemoved == null ) + throw new IllegalStateException("Removed our root node, wow, didn't expect that"); + return oneRemoved; + } + + /** + * Finds in the tree the node with the lowest penalty whose subnodes are all leaves + * + * @return the node and its penalty, or null if no such node exists + */ + private Pair, Double> getMinPenaltyAboveLeafNode() { + if ( isLeaf() ) + // not allowed to remove leafs directly + return null; + if ( isAboveOnlyLeaves() ) + // we only consider removing nodes above all leaves + return new Pair, Double>(this, getPenalty()); + else { + // just recurse, taking the result with the min penalty of all subnodes + Pair, Double> minNode = null; + for ( final RecalDatumNode sub : subnodes ) { + final Pair, Double> subFind = sub.getMinPenaltyAboveLeafNode(); + if ( subFind != null && (minNode == null || subFind.getSecond() < minNode.getSecond()) ) { + minNode = subFind; + } + } + return minNode; + } + } + + /** + * Return a freshly allocated tree without the node nodeToRemove + * + * @param nodeToRemove + * @return + */ + private Pair, Boolean> removeNode(final RecalDatumNode nodeToRemove) { + if ( this == nodeToRemove ) { + if ( isLeaf() ) + throw new IllegalStateException("Trying to remove a leaf node from the tree! " + this + " " + nodeToRemove); + // node is the thing we are going to remove, but without any subnodes + final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty); + return new Pair, Boolean>(node, true); + } else { + // did we remove something in a sub branch? 
+ boolean removedSomething = false; + + // our sub nodes with the penalty node removed + final Set> sub = new HashSet>(getNumSubnodes()); + + for ( final RecalDatumNode sub1 : subnodes ) { + if ( removedSomething ) { + // already removed something, just add sub1 back to sub + sub.add(sub1); + } else { + // haven't removed anything yet, so try + final Pair, Boolean> maybeRemoved = sub1.removeNode(nodeToRemove); + removedSomething = maybeRemoved.getSecond(); + sub.add(maybeRemoved.getFirst()); + } + } + + final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty, sub); + return new Pair, Boolean>(node, removedSomething); + } + } + + /** + * Return a collection of all of the data in the leaf nodes of this tree + * + * @return + */ + public Collection getAllLeaves() { + final LinkedList list = new LinkedList(); + getAllLeavesRec(list); + return list; + } + + /** + * Helpful recursive function for getAllLeaves() + * + * @param list the destination for the list of leaves + */ + private void getAllLeavesRec(final LinkedList list) { + if ( isLeaf() ) + list.add(getRecalDatum()); + else { + for ( final RecalDatumNode sub : subnodes ) + sub.getAllLeavesRec(list); + } + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalUtils.java new file mode 100644 index 000000000..f2f33ee59 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalUtils.java @@ -0,0 +1,1097 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.recalibration.covariates.*; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.classloader.JVMUtils; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.R.RScriptExecutor; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.classloader.PluginManager; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.io.Resource; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; + +import java.io.*; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 6, 2009 + * + * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. + * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias. + * This class holds the parsing methods that are shared between BaseRecalibrator and PrintReads. 
+ */ + +public class RecalUtils { + public final static String ARGUMENT_REPORT_TABLE_TITLE = "Arguments"; + public final static String QUANTIZED_REPORT_TABLE_TITLE = "Quantized"; + public final static String READGROUP_REPORT_TABLE_TITLE = "RecalTable0"; + public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1"; + public final static String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2"; + + public final static String ARGUMENT_COLUMN_NAME = "Argument"; + public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value"; + public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore"; + public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count"; + public final static String READGROUP_COLUMN_NAME = "ReadGroup"; + public final static String EVENT_TYPE_COLUMN_NAME = "EventType"; + public final static String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality"; + public final static String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported"; + public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore"; + public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue"; + public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName"; + public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations"; + public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors"; + + private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams + private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color + private static boolean warnUserNullPlatform = false; + + private static final String SCRIPT_FILE = "BQSR.R"; + + private static final Pair covariateValue = new Pair(RecalUtils.COVARIATE_VALUE_COLUMN_NAME, "%s"); + private static final Pair covariateName = new Pair(RecalUtils.COVARIATE_NAME_COLUMN_NAME, "%s"); + 
private static final Pair eventType = new Pair(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s"); + private static final Pair empiricalQuality = new Pair(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); + private static final Pair estimatedQReported = new Pair(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); + private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); + private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f"); + + /** + * Generates two lists : required covariates and optional covariates based on the user's requests. + * + * Performs the following tasks in order: + * 1. Adds all requierd covariates in order + * 2. Check if the user asked to use the standard covariates and adds them all if that's the case + * 3. Adds all covariates requested by the user that were not already added by the two previous steps + * + * @param argumentCollection the argument collection object for the recalibration walker + * @return a pair of ordered lists : required covariates (first) and optional covariates (second) + */ + public static Pair, ArrayList> initializeCovariates(RecalibrationArgumentCollection argumentCollection) { + final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); + final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); + final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); + + final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates + ArrayList optionalCovariates = new ArrayList(); + if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES) + optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user + + // parse the -cov arguments that were provided, skipping over the ones already specified + if (argumentCollection.COVARIATES != null) { + for (String 
requestedCovariateString : argumentCollection.COVARIATES) { + // help the transition from BQSR v1 to BQSR v2 + if ( requestedCovariateString.equals("DinucCovariate") ) + throw new UserException.CommandLineException("DinucCovariate has been retired. Please use its successor covariate " + + "ContextCovariate instead, which includes the 2 bp (dinuc) substitution model of the retired DinucCovariate " + + "as well as an indel context to model the indel error rates"); + + boolean foundClass = false; + for (Class covClass : covariateClasses) { + if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class + foundClass = true; + if (!requiredClasses.contains(covClass) && + (argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { + try { + final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it + optionalCovariates.add(covariate); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + } + } + + if (!foundClass) { + throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates."); + } + } + } + return new Pair, ArrayList>(requiredCovariates, optionalCovariates); + } + + /** + * Adds the required covariates to a covariate list + * + * Note: this method really only checks if the classes object has the expected number of required covariates, then add them by hand. 
+ * + * @param classes list of classes to add to the covariate list + * @return the covariate list + */ + private static ArrayList addRequiredCovariatesToList(List> classes) { + ArrayList dest = new ArrayList(classes.size()); + if (classes.size() != 2) + throw new ReviewedGATKException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected"); + + dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. + dest.add(new QualityScoreCovariate()); + return dest; + } + + /** + * Adds the standard covariates to a covariate list + * + * @param classes list of classes to add to the covariate list + * @return the covariate list + */ + private static ArrayList addStandardCovariatesToList(List> classes) { + ArrayList dest = new ArrayList(classes.size()); + for (Class covClass : classes) { + try { + final Covariate covariate = (Covariate) covClass.newInstance(); + dest.add(covariate); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + return dest; + } + + /** + * Print a list of all available covariates to logger as info + * + * @param logger + */ + public static void listAvailableCovariates(final Logger logger) { + logger.info("Available covariates:"); + for (final Class covClass : new PluginManager(Covariate.class).getPlugins()) { + logger.info(String.format("\t%30s\t%s", covClass.getSimpleName(), JVMUtils.classInterfaces(covClass))); + } + } + + /** + * Component used to print out csv representation of the reports that can be use to perform analysis in + * external tools. E.g. generate plots using R scripts. + *

+ * A header is always printed into the output stream (or file) when the printer is created. Then you only need + * to call {@link #print(RecalibrationReport,String) print} for each report you want to include in the csv file. + * Once finished, you close the printer calling {@link #close() close} + * + */ + private static class CsvPrinter { + + private final PrintStream ps; + private final Covariate[] covariates; + + /** + * Constructs a printer redirected to an output file. + * @param out the output file. + * @param c covariates to print out. + * @throws FileNotFoundException if the file could not be created anew. + */ + protected CsvPrinter(final File out, final Covariate ... c) + throws FileNotFoundException { + this(new FileOutputStream(out), c); + } + + /** + * Constructs a printer redirected to an output stream + * @param os the output. + * @param c covariates to print out. + */ + protected CsvPrinter(final OutputStream os, final Covariate ... c) { + covariates = c == null ? new Covariate[0] : c.clone(); + ps = new PrintStream(os); + printHeader(); + } + + /** + * Prints the header out. + *

+ * Should only be invoked at creation. + */ + protected void printHeader() { + RecalUtils.printHeader(ps); + } + + /** + * Prints out a report into the csv file. + * + * + * @param report the report to print out. + * @param mode the report associated mode. (typically ORIGINAL, RECALIBRATED + */ + public void print(final RecalibrationReport report, final String mode) { + RecalUtils.writeCSV(ps,report.getRecalibrationTables(),mode,covariates,false); + } + + /** + * Close the csv printer. + * + * No further output will be allowed or take place after calling this method. + */ + public void close() { + ps.close(); + } + + } + + /** + * Returns a csv output printer. + * + * @param out the output file. It will be overridden + * @param c list of covariates to print out. + * + * @throws FileNotFoundException if out could not be created anew. + * + * @return never null + */ + protected static CsvPrinter csvPrinter(final File out, final Covariate ... c) + throws FileNotFoundException + { + if (c == null) { + throw new IllegalArgumentException("the input covariate array cannot be null"); + } + return new CsvPrinter(out,c); + } + + /** + * Prints out a collection of reports into a file in Csv format in a way + * that can be used by R scripts (such as the plot generator script). + *

+ * The set of covariates is take as the minimum common set from all reports. + * + * @param out the output file. It will be overridden. + * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) + * of each report and the corresponding value the report itself. + * @throws FileNotFoundException if out could not be created anew. + */ + public static void generateCsv(final File out, final Map reports) + throws FileNotFoundException { + if (reports.size() == 0) { + writeCsv(out, reports, new Covariate[0]); + } else { + final Iterator rit = reports.values().iterator(); + final RecalibrationReport first = rit.next(); + final Covariate[] firstCovariates = first.getRequestedCovariates(); + final Set covariates = new LinkedHashSet<>(); + Utils.addAll(covariates,firstCovariates); + while (rit.hasNext() && covariates.size() > 0) { + final Covariate[] nextCovariates = rit.next().getRequestedCovariates(); + final Set nextCovariateNames = new LinkedHashSet(nextCovariates.length); + for (final Covariate nc : nextCovariates) { + nextCovariateNames.add(nc.getClass().getSimpleName()); + } + final Iterator cit = covariates.iterator(); + while (cit.hasNext()) { + if (!nextCovariateNames.contains(cit.next().getClass().getSimpleName())) { + cit.remove(); + } + } + } + writeCsv(out, reports, covariates.toArray(new Covariate[covariates.size()])); + } + } + + /** + * Print out a collection of reports into a file in Csv format in a way + * that can be used by R scripts (such as the plot generator script). + * + * @param out + * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) + * of each report and the corresponding value the report itself. + * @param c the covariates to print out. + * @throws FileNotFoundException if out could not be created anew. 
+ */ + private static void writeCsv(final File out, + final Map reports, final Covariate[] c) + throws FileNotFoundException { + final CsvPrinter p = csvPrinter(out,c); + for (Map.Entry e : reports.entrySet()) { + p.print(e.getValue(),e.getKey()); + } + p.close(); + } + + public enum SOLID_RECAL_MODE { + /** + * Treat reference inserted bases as reference matching bases. Very unsafe! + */ + DO_NOTHING, + /** + * Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. + */ + SET_Q_ZERO, + /** + * In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. + */ + SET_Q_ZERO_BASE_N, + /** + * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. + */ + REMOVE_REF_BIAS; + + public static SOLID_RECAL_MODE recalModeFromString(String recalMode) { + if (recalMode.equals("DO_NOTHING")) + return SOLID_RECAL_MODE.DO_NOTHING; + if (recalMode.equals("SET_Q_ZERO")) + return SOLID_RECAL_MODE.SET_Q_ZERO; + if (recalMode.equals("SET_Q_ZERO_BASE_N")) + return SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N; + if (recalMode.equals("REMOVE_REF_BIAS")) + return SOLID_RECAL_MODE.REMOVE_REF_BIAS; + + throw new UserException.BadArgumentValue(recalMode, "is not a valid SOLID_RECAL_MODE value"); + } + } + + public enum SOLID_NOCALL_STRATEGY { + /** + * When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. + */ + THROW_EXCEPTION, + /** + * Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. + */ + LEAVE_READ_UNRECALIBRATED, + /** + * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. 
+ */ + PURGE_READ; + + public static SOLID_NOCALL_STRATEGY nocallStrategyFromString(String nocallStrategy) { + if (nocallStrategy.equals("THROW_EXCEPTION")) + return SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; + if (nocallStrategy.equals("LEAVE_READ_UNRECALIBRATED")) + return SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED; + if (nocallStrategy.equals("PURGE_READ")) + return SOLID_NOCALL_STRATEGY.PURGE_READ; + + throw new UserException.BadArgumentValue(nocallStrategy, "is not a valid SOLID_NOCALL_STRATEGY value"); + } + } + + private static List generateReportTables(final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { + List result = new LinkedList(); + int reportTableIndex = 0; + int rowIndex = 0; + final Map covariateNameMap = new HashMap(requestedCovariates.length); + for (final Covariate covariate : requestedCovariates) + covariateNameMap.put(covariate, parseCovariateName(covariate)); + + for (int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++) { + + final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future + if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future + if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { + columnNames.add(covariateValue); + columnNames.add(covariateName); + } + } + + columnNames.add(eventType); // the order of these column names is important here + columnNames.add(empiricalQuality); + if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) + columnNames.add(estimatedQReported); // only the read group table needs the estimated Q 
reported + columnNames.add(nObservations); + columnNames.add(nErrors); + + final GATKReportTable reportTable; + if (tableIndex <= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { + if(sortByCols) { + reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.SORT_BY_COLUMN); + } else { + reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.DO_NOT_SORT); + } + for (final Pair columnName : columnNames) + reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); + rowIndex = 0; // reset the row index since we're starting with a new table + } else { + reportTable = result.get(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()); + } + + final NestedIntegerArray table = recalibrationTables.getTable(tableIndex); + for (final NestedIntegerArray.Leaf row : table.getAllLeaves()) { + final RecalDatum datum = (RecalDatum)row.value; + final int[] keys = row.keys; + + int columnIndex = 0; + int keyIndex = 0; + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[0].formatKey(keys[keyIndex++])); + if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[1].formatKey(keys[keyIndex++])); + if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { + final Covariate covariate = requestedCovariates[tableIndex]; + + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariate.formatKey(keys[keyIndex++])); + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariateNameMap.get(covariate)); + } + } + + final EventType event = EventType.eventFrom(keys[keyIndex]); + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), event.toString()); + + 
reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); + if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getNumObservations()); + reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.getNumMismatches()); + + rowIndex++; + } + result.add(reportTable); + } + + return result; + } + + private static String parseCovariateName(final Covariate covariate) { + return covariate.getClass().getSimpleName().split("Covariate")[0]; + } + + /** + * Return a human-readable string representing the used covariates + * + * @param requestedCovariates a vector of covariates + * @return a non-null comma-separated string + */ + public static String covariateNames(final Covariate[] requestedCovariates) { + final List names = new ArrayList(requestedCovariates.length); + for ( final Covariate cov : requestedCovariates ) + names.add(cov.getClass().getSimpleName()); + return Utils.join(",", names); + } + + /** + * Outputs the GATK report to RAC.RECAL_TABLE. 
+ * + * @param RAC The list of shared command line arguments + * @param quantizationInfo Quantization info + * @param recalibrationTables Recalibration tables + * @param requestedCovariates The list of requested covariates + * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT + */ + public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { + final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols)); + report.print(RAC.RECAL_TABLE); + } + + /** + * Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream). + * + * @param argumentTable Argument table + * @param quantizationInfo Quantization info + * @param recalibrationTables Recalibration tables + * @param requestedCovariates The list of requested covariates + * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT + * @return GATK report + */ + public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final boolean sortByCols) { + return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols)); + } + + /** + * Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream). 
+ * + * @param argumentTable Argument table + * @param quantizationTable Quantization Table + * @param recalTables Other recal tables + * @return GATK report + */ + private static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables) { + final GATKReport report = new GATKReport(); + report.addTable(argumentTable); + report.addTable(quantizationTable); + report.addTables(recalTables); + return report; + } + + /** s + * Write recalibration plots into a file + * + * @param csvFile location of the intermediary file + * @param exampleReportFile where the report arguments are collected from. + * @param output result plot file name. + */ + public static void generatePlots(final File csvFile, final File exampleReportFile, final File output) { + final RScriptExecutor executor = new RScriptExecutor(); + executor.setExceptOnError(true); + executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); + executor.addArgs(csvFile.getAbsolutePath()); + executor.addArgs(exampleReportFile.getAbsolutePath()); + executor.addArgs(output.getAbsolutePath()); + Logger.getLogger(RecalUtils.class).debug("R command line: " + executor.getApproximateCommandLine()); + executor.exec(); + } + + private static void outputRecalibrationPlot(final File csvFile, final RecalibrationArgumentCollection RAC) { + + final RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); + executor.addArgs(csvFile.getAbsolutePath()); + executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath()); + executor.exec(); + } + + /** + * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. 
+ * + * @deprecated + */ + @Deprecated + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final Covariate[] requestedCovariates) { + generateRecalibrationPlot(RAC, original, null, requestedCovariates); + } + + /** + * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. + * + * @deprecated + */ + @Deprecated + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates) { + final PrintStream csvStream; + final File csvTempFile = null; + try { + File csvTmpFile = File.createTempFile("BQSR",".csv"); + csvTmpFile.deleteOnExit(); + csvStream = new PrintStream(csvTmpFile); + } catch (IOException e) { + throw new UserException("Could not create temporary csv file", e); + } + + if ( recalibrated != null ) + writeCSV(csvStream, recalibrated, "RECALIBRATED", requestedCovariates, true); + writeCSV(csvStream, original, "ORIGINAL", requestedCovariates, recalibrated == null); + csvStream.close(); + outputRecalibrationPlot(csvTempFile, RAC); + csvTempFile.delete(); + } + + private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { + + final NestedIntegerArray deltaTable = createDeltaTable(recalibrationTables, requestedCovariates.length); + + // add the quality score table to the delta table + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table + final int[] newCovs = new int[4]; + newCovs[0] = leaf.keys[0]; + newCovs[1] = requestedCovariates.length; // 
replace the covariate name with an arbitrary (unused) index for QualityScore + newCovs[2] = leaf.keys[1]; + newCovs[3] = leaf.keys[2]; + addToDeltaTable(deltaTable, newCovs, (RecalDatum)leaf.value); // add this covariate to the delta table + } + + // add the optional covariates to the delta table + for (int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < requestedCovariates.length; i++) { + final NestedIntegerArray covTable = recalibrationTables.getTable(i); + for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) { + final int[] covs = new int[4]; + covs[0] = leaf.keys[0]; + covs[1] = i; // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) + covs[2] = leaf.keys[2]; + covs[3] = leaf.keys[3]; + addToDeltaTable(deltaTable, covs, (RecalDatum) leaf.value); // add this covariate to the delta table + } + } + + // output the csv file + if (printHeader) { + printHeader(deltaTableFile); + } + + final Map covariateNameMap = new HashMap(requestedCovariates.length); + for (final Covariate covariate : requestedCovariates) + covariateNameMap.put(covariate, parseCovariateName(covariate)); + + // print each data line + for (final NestedIntegerArray.Leaf leaf : deltaTable.getAllLeaves()) { + final List deltaKeys = generateValuesFromKeys(leaf.keys, requestedCovariates, covariateNameMap); + final RecalDatum deltaDatum = leaf.value; + deltaTableFile.print(Utils.join(",", deltaKeys)); + deltaTableFile.print("," + deltaDatum.stringForCSV()); + deltaTableFile.println("," + recalibrationMode); + } + } + + private static void printHeader(PrintStream out) { + final List header = new LinkedList(); + header.add("ReadGroup"); + header.add("CovariateValue"); + header.add("CovariateName"); + header.add("EventType"); + header.add("Observations"); + header.add("Errors"); + header.add("EmpiricalQuality"); + header.add("AverageReportedQuality"); + header.add("Accuracy"); + header.add("Recalibration"); + 
out.println(Utils.join(",", header)); + } + + /* + * Return an initialized nested integer array with appropriate dimensions for use with the delta tables + * + * @param recalibrationTables the recal tables + * @param numCovariates the total number of covariates being used + * @return a non-null nested integer array + */ + @Requires("recalibrationTables != null && numCovariates > 0") + @Ensures("result != null") + private static NestedIntegerArray createDeltaTable(final RecalibrationTables recalibrationTables, final int numCovariates) { + + final int[] dimensionsForDeltaTable = new int[4]; + + // initialize the dimensions with those of the qual table to start with + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + final int[] dimensionsOfQualTable = qualTable.getDimensions(); + dimensionsForDeltaTable[0] = dimensionsOfQualTable[0]; // num read groups + dimensionsForDeltaTable[1] = numCovariates + 1; // num covariates + dimensionsForDeltaTable[2] = dimensionsOfQualTable[1]; + dimensionsForDeltaTable[3] = dimensionsOfQualTable[2]; + + // now, update the dimensions based on the optional covariate tables as needed + for ( int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < numCovariates; i++ ) { + final NestedIntegerArray covTable = recalibrationTables.getTable(i); + final int[] dimensionsOfCovTable = covTable.getDimensions(); + dimensionsForDeltaTable[2] = Math.max(dimensionsForDeltaTable[2], dimensionsOfCovTable[2]); + dimensionsForDeltaTable[3] = Math.max(dimensionsForDeltaTable[3], dimensionsOfCovTable[3]); + } + + return new NestedIntegerArray(dimensionsForDeltaTable); + } + + protected static List generateValuesFromKeys(final int[] keys, final Covariate[] covariates, final Map covariateNameMap) { + final List values = new ArrayList(4); + values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()].formatKey(keys[0])); + + final int covariateIndex = keys[1]; + final int 
covariateKey = keys[2]; + final Covariate covariate = covariateIndex == covariates.length ? covariates[RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal()] : covariates[covariateIndex]; + values.add(covariate.formatKey(covariateKey)); + values.add(covariateNameMap.get(covariate)); + values.add(EventType.eventFrom(keys[3]).prettyPrint()); + + return values; + } + + /** + * Updates the current RecalDatum element in the delta table. + * + * If it doesn't have an element yet, it creates an RecalDatum element and adds it to the delta table. + * + * @param deltaTable the delta table + * @param deltaKey the key to the table + * @param recalDatum the recal datum to combine with the accuracyDatum element in the table + */ + private static void addToDeltaTable(final NestedIntegerArray deltaTable, final int[] deltaKey, final RecalDatum recalDatum) { + final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key + if (deltaDatum == null) + // if we don't have a key yet, create a new one with the same values as the current datum + deltaTable.put(new RecalDatum(recalDatum), deltaKey); + else + // if we do have a datum, combine it with this one + deltaDatum.combine(recalDatum); + } + + /** + * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string + * + * @param read The read to adjust + * @param RAC The list of shared command line arguments + */ + public static void parsePlatformForRead(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { + GATKSAMReadGroupRecord readGroup = read.getReadGroup(); + + if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { + readGroup.setPlatform(RAC.FORCE_PLATFORM); + } + + if (readGroup.getPlatform() == null) { + if (RAC.DEFAULT_PLATFORM != null) { + if (!warnUserNullPlatform) { + 
Utils.warnUser("The input .bam file contains reads with no platform information. " + + "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + + "First observed at read with name = " + read.getReadName()); + warnUserNullPlatform = true; + } + readGroup.setPlatform(RAC.DEFAULT_PLATFORM); + } + else { + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName()); + } + } + } + + /** + * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are + * inconsistent with the color space. If there is a no call in the color space, this method returns false meaning + * this read should be skipped + * + * @param strategy the strategy used for SOLID no calls + * @param read The SAMRecord to parse + * @return true if this read is consistent or false if this read should be skipped + */ + public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { + if (!ReadUtils.isSOLiDRead(read)) // If this is a SOLID read then we have to check if the color space is inconsistent. 
This is our only sign that SOLID has inserted the reference base + return true; + + // Haven't calculated the inconsistency array yet for this read + if (read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG) == null) { + final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG); + if (attr != null) { + byte[] colorSpace; + if (attr instanceof String) + colorSpace = ((String) attr).getBytes(); + else + throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + + final boolean badColor = hasNoCallInColorSpace(colorSpace); + if (badColor) { + if (strategy == SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) { + return false; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them + } + else if (strategy == SOLID_NOCALL_STRATEGY.PURGE_READ) { + read.setReadFailsVendorQualityCheckFlag(true); + return false; + } + } + + byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read + if (read.getReadNegativeStrandFlag()) + readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); + + final byte[] inconsistency = new byte[readBases.length]; + int i; + byte prevBase = colorSpace[0]; // The sentinel + for (i = 0; i < readBases.length; i++) { + final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[i + 1]); + inconsistency[i] = (byte) (thisBase == readBases[i] ? 0 : 1); + prevBase = readBases[i]; + } + read.setAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); + } + else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it + throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. 
First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); + + else + return false; // otherwise, just skip the read + } + + return true; + } + + private static boolean hasNoCallInColorSpace(final byte[] colorSpace) { + final int length = colorSpace.length; + for (int i = 1; i < length; i++) { // skip the sentinal + final byte color = colorSpace[i]; + if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { + return true; // There is a bad color in this SOLiD read + } + } + + return false; // There aren't any color no calls in this SOLiD read + } + + /** + * Given the base and the color calculate the next base in the sequence + * + * @param read the read + * @param prevBase The base + * @param color The color + * @return The next base in the sequence + */ + private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) { + switch (color) { + case '0': + return prevBase; + case '1': + return performColorOne(prevBase); + case '2': + return performColorTwo(prevBase); + case '3': + return performColorThree(prevBase); + default: + throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color + + " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); + } + } + + /** + * Check if this base is inconsistent with its color space. 
If it is then SOLID inserted the reference here and we should reduce the quality + * + * @param read The read which contains the color space to check against + * @param offset The offset in the read at which to check + * @return Returns true if the base was inconsistent with the color space + */ + public static boolean isColorSpaceConsistent(final GATKSAMRecord read, final int offset) { + final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG); + if (attr != null) { + final byte[] inconsistency = (byte[]) attr; + // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! + if (read.getReadNegativeStrandFlag()) { // Negative direction + return inconsistency[inconsistency.length - offset - 1] == (byte) 0; + } + else { // Forward direction + return inconsistency[offset] == (byte) 0; + } + + // This block of code is for if you want to check both the offset and the next base for color space inconsistency + //if( read.getReadNegativeStrandFlag() ) { // Negative direction + // if( offset == 0 ) { + // return inconsistency[0] != 0; + // } else { + // return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0); + // } + //} else { // Forward direction + // if( offset == inconsistency.length - 1 ) { + // return inconsistency[inconsistency.length - 1] != 0; + // } else { + // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0); + // } + //} + + } + else { // No inconsistency array, so nothing is inconsistent + return true; + } + } + + /** + * Computes all requested covariates for every offset in the given read + * by calling covariate.getValues(..). + * + * It populates an array of covariate values where result[i][j] is the covariate + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. + * + * @param read The read for which to compute covariate values. 
+ * @param requestedCovariates The list of requested covariates. + * @return a matrix with all the covariates calculated for every base in the read + */ + public static ReadCovariates computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates) { + final ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), requestedCovariates.length); + computeCovariates(read, requestedCovariates, readCovariates); + return readCovariates; + } + + /** + * Computes all requested covariates for every offset in the given read + * by calling covariate.getValues(..). + * + * It populates an array of covariate values where result[i][j] is the covariate + * value for the ith position in the read and the jth covariate in + * reqeustedCovariates list. + * + * @param read The read for which to compute covariate values. + * @param requestedCovariates The list of requested covariates. + * @param resultsStorage The object to store the covariate values + */ + public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates resultsStorage) { + // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read + for (int i = 0; i < requestedCovariates.length; i++) { + resultsStorage.setCovariateIndex(i); + requestedCovariates[i].recordValues(read, resultsStorage); + } + } + + /** + * Perform a certain transversion (A <-> C or G <-> T) on the base. + * + * @param base the base [AaCcGgTt] + * @return the transversion of the base, or the input base if it's not one of the understood ones + */ + private static byte performColorOne(byte base) { + switch (base) { + case 'A': + case 'a': + return 'C'; + case 'C': + case 'c': + return 'A'; + case 'G': + case 'g': + return 'T'; + case 'T': + case 't': + return 'G'; + default: + return base; + } + } + + /** + * Perform a transition (A <-> G or C <-> T) on the base. 
+ * + * @param base the base [AaCcGgTt] + * @return the transition of the base, or the input base if it's not one of the understood ones + */ + private static byte performColorTwo(byte base) { + switch (base) { + case 'A': + case 'a': + return 'G'; + case 'C': + case 'c': + return 'T'; + case 'G': + case 'g': + return 'A'; + case 'T': + case 't': + return 'C'; + default: + return base; + } + } + + /** + * Return the complement (A <-> T or C <-> G) of a base. + * + * @param base the base [AaCcGgTt] + * @return the complementary base, or the input base if it's not one of the understood ones + */ + private static byte performColorThree(byte base) { + switch (base) { + case 'A': + case 'a': + return 'T'; + case 'C': + case 'c': + return 'G'; + case 'G': + case 'g': + return 'C'; + case 'T': + case 't': + return 'A'; + default: + return base; + } + } + + /** + * Combines the recalibration data for table1 and table2 into table1 + * + * Note that table1 is the destination, so it is modified + * + * @param table1 the destination table to merge table2 into + * @param table2 the source table to merge into table1 + */ + public static void combineTables(final NestedIntegerArray table1, final NestedIntegerArray table2) { + if ( table1 == null ) throw new IllegalArgumentException("table1 cannot be null"); + if ( table2 == null ) throw new IllegalArgumentException("table2 cannot be null"); + if ( ! Arrays.equals(table1.getDimensions(), table2.getDimensions())) + throw new IllegalArgumentException("Table1 " + Utils.join(",", table1.getDimensions()) + " not equal to " + Utils.join(",", table2.getDimensions())); + + for (final NestedIntegerArray.Leaf row : table2.getAllLeaves()) { + final RecalDatum myDatum = table1.get(row.keys); + + if (myDatum == null) + table1.put(row.value, row.keys); + else + myDatum.combine(row.value); + } + } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. 
+ * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + public static void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, + final byte qual, + final double isError, + final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(1L, isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(1L, isError); + } + } + + /** + * creates a datum object with one observation and one or zero error + * + * @param reportedQual the quality score reported by the instrument for this base + * @param isError whether or not the observation is an error + * @return a new RecalDatum object with the observation and the error + */ + private static RecalDatum createDatumObject(final byte reportedQual, final double isError) { + return new RecalDatum(1, isError, reportedQual); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java new file mode 100644 index 000000000..1b2129f3d --- /dev/null +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationArgumentCollection.java @@ -0,0 +1,419 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. 
Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Requires; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.GATKException; + +import java.io.File; +import java.io.PrintStream; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 27, 2009 + * + * A collection of the arguments that are used for BQSR. Used to be common to both CovariateCounterWalker and TableRecalibrationWalker. + * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated. + */ + +public class RecalibrationArgumentCollection implements Cloneable { + + /** + * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference, + * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.) + * for use as this database. 
For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites. + * Please note however that the statistics reported by the tool will not accurately reflect those sites skipped by the -XL argument. + */ + @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false) + public List<RodBinding<Feature>> knownSites = Collections.emptyList(); + + /** + * After the header, data records occur one per line until the end of the file. The first several items on a line are the + * values of the individual covariates and will change depending on which covariates were specified at runtime. The last + * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out. + */ + @Gather(BQSRGatherer.class) + @Output(doc = "The output recalibration table file to create", required = true) + public File RECAL_TABLE_FILE = null; + public PrintStream RECAL_TABLE; + + /** + * Note that the --list argument requires a fully resolved and correct command-line to work. + */ + @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false) + public boolean LIST_ONLY = false; + + /** + * Note that the ReadGroup and QualityScore covariates are required and do not need to be specified. + * Also, unless --no_standard_covs is specified, the Cycle and Context covariates are standard and are included by default. + * Use the --list argument to see the available covariates. + */ + @Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. 
Can be specified multiple times", required = false) + public String[] COVARIATES = null; + + /* + * The Cycle and Context covariates are standard and are included by default unless this argument is provided. + * Note that the ReadGroup and QualityScore covariates are required and cannot be excluded. + */ + @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false) + public boolean DO_NOT_USE_STANDARD_COVARIATES = false; + + /** + * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option. + */ + @Advanced + @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") + public boolean RUN_WITHOUT_DBSNP = false; + + /** + * BaseRecalibrator accepts a --solid_recal_mode flag which governs how the recalibrator handles the + * reads which have had the reference inserted because of color space inconsistencies. + */ + @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") + public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO; + + /** + * BaseRecalibrator accepts a --solid_nocall_strategy flag which governs how the recalibrator handles + * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in + * their color space tag can not be recalibrated. 
+ */ + @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false) + public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; + + /** + * The context covariate will use a context of this size to calculate its covariate value for base mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size. + */ + @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false) + public int MISMATCHES_CONTEXT_SIZE = 2; + + /** + * The context covariate will use a context of this size to calculate its covariate value for base insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size. + */ + @Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false) + public int INDELS_CONTEXT_SIZE = 3; + + /** + * The cycle covariate will generate an error if it encounters a cycle greater than this value. + * This argument is ignored if the Cycle covariate is not used. + */ + @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false) + public int MAXIMUM_CYCLE_VALUE = 500; + + /** + * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. 
[default is off] + */ + @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false) + public byte MISMATCHES_DEFAULT_QUALITY = -1; + + /** + * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on] + */ + @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false) + public byte INSERTIONS_DEFAULT_QUALITY = 45; + + /** + * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is on] + */ + @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) + public byte DELETIONS_DEFAULT_QUALITY = 45; + + /** + * Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality + */ + @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false) + public byte LOW_QUAL_TAIL = 2; + + /** + * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options. + * This parameter tells BQSR the number of levels of quantization to use to build the quantization table. 
+ */ + @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") + public int QUANTIZING_LEVELS = 16; + + /** + * The tag name for the binary tag covariate (if using it) + */ + @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") + public String BINARY_TAG_NAME = null; + + /* + * whether GATK report tables should have rows in sorted order, starting from leftmost column + */ + @Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false) + public Boolean SORT_BY_ALL_COLUMNS = false; + + ///////////////////////////// + // Debugging-only Arguments + ///////////////////////////// + + @Hidden + @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") + public String DEFAULT_PLATFORM = null; + + @Hidden + @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") + public String FORCE_PLATFORM = null; + + @Hidden + @Argument(fullName = "force_readgroup", shortName = "fRG", required = false, doc = "If provided, the read group of EVERY read will be forced to be the provided String.") + public String FORCE_READGROUP = null; + + @Hidden + @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. 
For debugging/testing purposes only", defaultToStdout = false) + public PrintStream RECAL_TABLE_UPDATE_LOG = null; + + /** + * The repeat covariate will use a context of this size to calculate it's covariate value for base insertions and deletions + */ + @Hidden + @Argument(fullName = "max_str_unit_length", shortName = "maxstr", doc = "Max size of the k-mer context to be used for repeat covariates", required = false) + public int MAX_STR_UNIT_LENGTH = 8; + + @Hidden + @Argument(fullName = "max_repeat_length", shortName = "maxrep", doc = "Max number of repetitions to be used for repeat covariates", required = false) + public int MAX_REPEAT_LENGTH = 20; + + + public File existingRecalibrationReport = null; + + public GATKReportTable generateReportTable(final String covariateNames) { + GATKReportTable argumentsTable; + if(SORT_BY_ALL_COLUMNS) { + argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); + } else { + argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2); + } + argumentsTable.addColumn("Argument"); + argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); + argumentsTable.addRowID("covariate", true); + argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames); + argumentsTable.addRowID("no_standard_covs", true); + argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES); + argumentsTable.addRowID("run_without_dbsnp", true); + argumentsTable.set("run_without_dbsnp", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP); + argumentsTable.addRowID("solid_recal_mode", true); + argumentsTable.set("solid_recal_mode", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE); + argumentsTable.addRowID("solid_nocall_strategy", true); + argumentsTable.set("solid_nocall_strategy", 
RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY); + argumentsTable.addRowID("mismatches_context_size", true); + argumentsTable.set("mismatches_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE); + argumentsTable.addRowID("indels_context_size", true); + argumentsTable.set("indels_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE); + argumentsTable.addRowID("mismatches_default_quality", true); + argumentsTable.set("mismatches_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY); + argumentsTable.addRowID("deletions_default_quality", true); + argumentsTable.set("deletions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_DEFAULT_QUALITY); + argumentsTable.addRowID("insertions_default_quality", true); + argumentsTable.set("insertions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY); + argumentsTable.addRowID("maximum_cycle_value", true); + argumentsTable.set("maximum_cycle_value", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MAXIMUM_CYCLE_VALUE); + argumentsTable.addRowID("low_quality_tail", true); + argumentsTable.set("low_quality_tail", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL); + argumentsTable.addRowID("default_platform", true); + argumentsTable.set("default_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM); + argumentsTable.addRowID("force_platform", true); + argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); + argumentsTable.addRowID("quantizing_levels", true); + argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); + argumentsTable.addRowID("recalibration_report", true); + argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? 
"null" : existingRecalibrationReport.getAbsolutePath()); + argumentsTable.addRowID("binary_tag_name", true); + argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME); + return argumentsTable; + } + + /** + * Returns a map with the arguments that differ between this an + * another {@link RecalibrationArgumentCollection} instance. + *

+ * The key is the name of that argument in the report file. The value is a message + * that explains the difference to the end user. + *

+ * Thus, an empty map indicates that there are no differences between both argument collections that + * are relevant to report comparison. + *

+ * This method should not throw any exception. + * + * @param other the argument-collection to compare against. + * @param thisRole the name used to refer to this RAC report that makes sense to the end user. + * @param otherRole the name used to refer to the other RAC report that makes sense to the end user. + * + * @return never null, but a zero-size collection if there are no differences. + */ + @Requires("other != null && thisRole != null && otherRole != null && !thisRole.equalsIgnoreCase(otherRole)") + public Map<String,? extends CharSequence> compareReportArguments(final RecalibrationArgumentCollection other,final String thisRole, final String otherRole) { + final Map<String,String> result = new LinkedHashMap<>(15); + compareRequestedCovariates(result, other, thisRole, otherRole); + compareSimpleReportArgument(result,"no_standard_covs", DO_NOT_USE_STANDARD_COVARIATES, other.DO_NOT_USE_STANDARD_COVARIATES, thisRole, otherRole); + compareSimpleReportArgument(result,"run_without_dbsnp",RUN_WITHOUT_DBSNP,other.RUN_WITHOUT_DBSNP,thisRole,otherRole); + compareSimpleReportArgument(result,"solid_recal_mode", SOLID_RECAL_MODE, other.SOLID_RECAL_MODE,thisRole,otherRole); + compareSimpleReportArgument(result,"solid_nocall_strategy", SOLID_NOCALL_STRATEGY, other.SOLID_NOCALL_STRATEGY,thisRole,otherRole); + compareSimpleReportArgument(result,"mismatches_context_size", MISMATCHES_CONTEXT_SIZE,other.MISMATCHES_CONTEXT_SIZE,thisRole,otherRole); + compareSimpleReportArgument(result,"mismatches_default_quality", MISMATCHES_DEFAULT_QUALITY, other.MISMATCHES_DEFAULT_QUALITY,thisRole,otherRole); + compareSimpleReportArgument(result,"deletions_default_quality", DELETIONS_DEFAULT_QUALITY, other.DELETIONS_DEFAULT_QUALITY,thisRole,otherRole); + compareSimpleReportArgument(result,"insertions_default_quality", INSERTIONS_DEFAULT_QUALITY, other.INSERTIONS_DEFAULT_QUALITY,thisRole,otherRole); + compareSimpleReportArgument(result,"maximum_cycle_value", MAXIMUM_CYCLE_VALUE, other.MAXIMUM_CYCLE_VALUE,thisRole,otherRole); + 
compareSimpleReportArgument(result,"low_quality_tail", LOW_QUAL_TAIL, other.LOW_QUAL_TAIL,thisRole,otherRole); + compareSimpleReportArgument(result,"default_platform", DEFAULT_PLATFORM, other.DEFAULT_PLATFORM,thisRole,otherRole); + compareSimpleReportArgument(result,"force_platform", FORCE_PLATFORM, other.FORCE_PLATFORM,thisRole,otherRole); + compareSimpleReportArgument(result,"quantizing_levels", QUANTIZING_LEVELS, other.QUANTIZING_LEVELS,thisRole,otherRole); + compareSimpleReportArgument(result,"binary_tag_name", BINARY_TAG_NAME, other.BINARY_TAG_NAME,thisRole,otherRole); + return result; + } + + + /** + * Compares the covariate report lists. + * + * @param diffs map where to annotate the difference. + * @param other the argument collection to compare against. + * @param thisRole the name for this argument collection that makes sense to the user. + * @param otherRole the name for the other argument collection that makes sense to the end user. + * + * @return true if a difference was found. + */ + @Requires("diffs != null && other != null && thisRole != null && otherRole != null") + private boolean compareRequestedCovariates(final Map<String,String> diffs, + final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) { + + final Set<String> beforeNames = new HashSet<>(this.COVARIATES.length); + final Set<String> afterNames = new HashSet<>(other.COVARIATES.length); + Utils.addAll(beforeNames, this.COVARIATES); + Utils.addAll(afterNames,other.COVARIATES); + final Set<String> intersect = new HashSet<>(Math.min(beforeNames.size(),afterNames.size())); + intersect.addAll(beforeNames); + intersect.retainAll(afterNames); + + String diffMessage = null; + if (intersect.size() == 0) { // In practice this is not possible due to required covariates but... + diffMessage = String.format("There are no common covariates between '%s' and '%s'" + + " recalibrator reports. Covariates in '%s': {%s}. 
Covariates in '%s': {%s}.",thisRole,otherRole, + thisRole,Utils.join(", ",this.COVARIATES), + otherRole,Utils.join(",",other.COVARIATES)); + } else if (intersect.size() != beforeNames.size() || intersect.size() != afterNames.size()) { + beforeNames.removeAll(intersect); + afterNames.removeAll(intersect); + diffMessage = String.format("There are differences in the set of covariates requested in the" + + " '%s' and '%s' recalibrator reports. " + + " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.",thisRole,otherRole, + thisRole,Utils.join(", ",beforeNames), + otherRole,Utils.join(", ",afterNames)); + } + if (diffMessage != null) { + diffs.put("covariate",diffMessage); + return true; + } else { + return false; + } + } + + /** + * Annotates a map with any difference encountered in a simple value report argument that differs between this an + * another {@link RecalibrationArgumentCollection} instance. + *

+ * The key of the new entry would be the name of that argument in the report file. The value is a message + * that explains the difference to the end user. + *

+ * + *

+ * This method should not throw any exception. + * + * @param diffs where to annotate the differences. + * @param name the name of the report argument to compare. + * @param thisValue this argument collection value for that argument. + * @param otherValue the other collection value for that argument. + * @param thisRole the name used to refer to this RAC report that makes sense to the end user. + * @param otherRole the name used to refer to the other RAC report that makes sense to the end user. + * + * @type T the argument Object value type. + * + * @return true if a difference has been spotted, thus diff has been modified. + */ + private <T> boolean compareSimpleReportArgument(final Map<String,String> diffs, + final String name, final T thisValue, final T otherValue, final String thisRole, final String otherRole) { + if (thisValue == null && otherValue == null) { + return false; + } else if (thisValue != null && thisValue.equals(otherValue)) { + return false; + } else { + diffs.put(name, + String.format("differences between '%s' {%s} and '%s' {%s}.", + thisRole,thisValue == null ? "" : thisValue, + otherRole,otherValue == null ? "" : otherValue)); + return true; + } + + } + + /** + * Create a shallow copy of this argument collection. + * + * @return never null. 
+ */ + @Override + public RecalibrationArgumentCollection clone() { + try { + return (RecalibrationArgumentCollection) super.clone(); + } catch (CloneNotSupportedException e) { + throw new GATKException("Unreachable code clone not supported thrown when the class " + + this.getClass().getName() + " is cloneable ",e); + } + } + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReport.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReport.java new file mode 100644 index 000000000..a9b401c2b --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReport.java @@ -0,0 +1,425 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. 
+* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; + +import java.io.*; +import java.util.*; + +/** + * This class has all the static functionality for reading a recalibration report file into memory. 
+ * + * @author carneiro + * @since 3/26/12 + */ +public class RecalibrationReport { + private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) + private final RecalibrationTables recalibrationTables; // quick access reference to the tables + private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation + private final HashMap optionalCovariateIndexes; + + private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes + private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter + + private final int[] tempRGarray = new int[2]; + private final int[] tempQUALarray = new int[3]; + private final int[] tempCOVarray = new int[4]; + + public RecalibrationReport(final File recalFile) { + this(recalFile, getReadGroups(recalFile)); + } + + public RecalibrationReport(final File recalFile, final SortedSet allReadGroups) { + final GATKReport report = new GATKReport(recalFile); + + argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE); + RAC = initializeArgumentCollectionTable(argumentTable); + + GATKReportTable quantizedTable = report.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE); + quantizationInfo = initializeQuantizationTable(quantizedTable); + + Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates + ArrayList requiredCovariates = covariates.getFirst(); + ArrayList optionalCovariates = covariates.getSecond(); + requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; + optionalCovariateIndexes = new HashMap(optionalCovariates.size()); + int covariateIndex = 0; + for (final Covariate covariate : requiredCovariates) + requestedCovariates[covariateIndex++] = covariate; + for (final Covariate covariate : optionalCovariates) { + 
requestedCovariates[covariateIndex] = covariate; + final String covariateName = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport + optionalCovariateIndexes.put(covariateName, covariateIndex-2); + covariateIndex++; + } + + for (Covariate cov : requestedCovariates) + cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection + + recalibrationTables = new RecalibrationTables(requestedCovariates, allReadGroups.size()); + + initializeReadGroupCovariates(allReadGroups); + + parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable()); + + parseQualityScoreTable(report.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE), recalibrationTables.getQualityScoreTable()); + + parseAllCovariatesTable(report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE), recalibrationTables); + + } + + /** + * Gets the unique read groups in the recal file + * + * @param recalFile the recal file as a GATK Report + * @return the unique read groups + */ + public static SortedSet getReadGroups(final File recalFile) { + return getReadGroups(new GATKReport(recalFile)); + } + + /** + * Gets the unique read groups in the table + * + * @param report the GATKReport containing the table with RecalUtils.READGROUP_REPORT_TABLE_TITLE + * @return the unique read groups + */ + private static SortedSet getReadGroups(final GATKReport report) { + final GATKReportTable reportTable = report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE); + final SortedSet readGroups = new TreeSet(); + for ( int i = 0; i < reportTable.getNumRows(); i++ ) + readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString()); + return readGroups; + } + + /** + * Combines two recalibration reports by adding all observations and errors + * + * Note: This method DOES NOT recalculate the empirical 
qualities and quantized qualities. You have to recalculate + * them after combining. The reason for not calculating it is because this function is intended for combining a + * series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized + * qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate, + * makes this method faster + * + * Note2: The empirical quality reported, however, is recalculated given its simplicity. + * + * @param other the recalibration report to combine with this one + */ + public void combine(final RecalibrationReport other) { + for ( int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++ ) { + final NestedIntegerArray myTable = recalibrationTables.getTable(tableIndex); + final NestedIntegerArray otherTable = other.recalibrationTables.getTable(tableIndex); + RecalUtils.combineTables(myTable, otherTable); + } + } + + public QuantizationInfo getQuantizationInfo() { + return quantizationInfo; + } + + public RecalibrationTables getRecalibrationTables() { + return recalibrationTables; + } + + public Covariate[] getRequestedCovariates() { + return requestedCovariates; + } + + /** + * Initialize read group keys using the shared list of all the read groups. + * + * By using the same sorted set of read groups across all recalibration reports, even if + * one report is missing a read group, all the reports use the same read group keys. 
+ * + * @param allReadGroups The list of all possible read groups + */ + private void initializeReadGroupCovariates(final SortedSet allReadGroups) { + for (String readGroup: allReadGroups) { + requestedCovariates[0].keyFromValue(readGroup); + } + } + + /** + * Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table + * + * @param reportTable the GATKReport table containing data for this table + * @param recalibrationTables the recalibration tables +\ */ + private void parseAllCovariatesTable(final GATKReportTable reportTable, final RecalibrationTables recalibrationTables) { + for ( int i = 0; i < reportTable.getNumRows(); i++ ) { + final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); + tempCOVarray[0] = requestedCovariates[0].keyFromValue(rg); + final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); + tempCOVarray[1] = requestedCovariates[1].keyFromValue(qual); + + final String covName = (String)reportTable.get(i, RecalUtils.COVARIATE_NAME_COLUMN_NAME); + final int covIndex = optionalCovariateIndexes.get(covName); + final Object covValue = reportTable.get(i, RecalUtils.COVARIATE_VALUE_COLUMN_NAME); + tempCOVarray[2] = requestedCovariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex].keyFromValue(covValue); + + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); + tempCOVarray[3] = event.ordinal(); + + recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex).put(getRecalDatum(reportTable, i, false), tempCOVarray); + } + } + + /** + * + * Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table + * @param reportTable the GATKReport table containing data for this table + * @param qualTable the map representing this table + */ + private void 
parseQualityScoreTable(final GATKReportTable reportTable, final NestedIntegerArray qualTable) { + for ( int i = 0; i < reportTable.getNumRows(); i++ ) { + final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); + tempQUALarray[0] = requestedCovariates[0].keyFromValue(rg); + final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); + tempQUALarray[1] = requestedCovariates[1].keyFromValue(qual); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); + tempQUALarray[2] = event.ordinal(); + + qualTable.put(getRecalDatum(reportTable, i, false), tempQUALarray); + } + } + + /** + * Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table + * + * @param reportTable the GATKReport table containing data for this table + * @param rgTable the map representing this table + */ + private void parseReadGroupTable(final GATKReportTable reportTable, final NestedIntegerArray rgTable) { + for ( int i = 0; i < reportTable.getNumRows(); i++ ) { + final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); + tempRGarray[0] = requestedCovariates[0].keyFromValue(rg); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); + tempRGarray[1] = event.ordinal(); + + rgTable.put(getRecalDatum(reportTable, i, true), tempRGarray); + } + } + + private double asDouble(final Object o) { + if ( o instanceof Double ) + return (Double)o; + else if ( o instanceof Integer ) + return (Integer)o; + else if ( o instanceof Long ) + return (Long)o; + else + throw new ReviewedGATKException("Object " + o + " is expected to be either a double, long or integer but it's not either: " + o.getClass()); + } + + private long asLong(final Object o) { + if ( o instanceof Long ) + return (Long)o; + else if ( o instanceof Integer ) + return ((Integer)o).longValue(); + else if ( o instanceof Double ) + return 
((Double)o).longValue(); + else + throw new ReviewedGATKException("Object " + o + " is expected to be a long but it's not: " + o.getClass()); + } + + private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { + final long nObservations = asLong(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); + final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)); + //final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); + + // the estimatedQreported column only exists in the ReadGroup table + final double estimatedQReported = hasEstimatedQReportedColumn ? + (Double) reportTable.get(row, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table + Byte.parseByte((String) reportTable.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table + + final RecalDatum datum = new RecalDatum(nObservations, nErrors, (byte)1); + datum.setEstimatedQReported(estimatedQReported); + //datum.setEmpiricalQuality(empiricalQuality); // don't set the value here because we will want to recompute with a different conditional Q score prior value + return datum; + } + + /** + * Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores + * + * @param table the GATKReportTable containing the quantization mappings + * @return an ArrayList with the quantization mappings from 0 to MAX_SAM_QUAL_SCORE + */ + private QuantizationInfo initializeQuantizationTable(GATKReportTable table) { + final Byte[] quals = new Byte[QualityUtils.MAX_SAM_QUAL_SCORE + 1]; + final Long[] counts = new Long[QualityUtils.MAX_SAM_QUAL_SCORE + 1]; + for ( int i = 0; i < table.getNumRows(); i++ ) { + final byte originalQual = (byte)i; + final Object quantizedObject = table.get(i, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME); 
+ final Object countObject = table.get(i, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); + final byte quantizedQual = Byte.parseByte(quantizedObject.toString()); + final long quantizedCount = Long.parseLong(countObject.toString()); + quals[originalQual] = quantizedQual; + counts[originalQual] = quantizedCount; + } + return new QuantizationInfo(Arrays.asList(quals), Arrays.asList(counts)); + } + + /** + * Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values + * + * @param table the GATKReportTable containing the arguments and its corresponding values + * @return a RAC object properly initialized with all the objects in the table + */ + private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) { + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + + for ( int i = 0; i < table.getNumRows(); i++ ) { + final String argument = table.get(i, "Argument").toString(); + Object value = table.get(i, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); + if (value.equals("null")) + value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport + + if (argument.equals("covariate") && value != null) + RAC.COVARIATES = value.toString().split(","); + + else if (argument.equals("standard_covs")) + RAC.DO_NOT_USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value); + + else if (argument.equals("solid_recal_mode")) + RAC.SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.recalModeFromString((String) value); + + else if (argument.equals("solid_nocall_strategy")) + RAC.SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value); + + else if (argument.equals("mismatches_context_size")) + RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value); + + else if (argument.equals("indels_context_size")) + RAC.INDELS_CONTEXT_SIZE = Integer.parseInt((String) value); + + 
else if (argument.equals("mismatches_default_quality")) + RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value); + + else if (argument.equals("insertions_default_quality")) + RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value); + + else if (argument.equals("deletions_default_quality")) + RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value); + + else if (argument.equals("maximum_cycle_value")) + RAC.MAXIMUM_CYCLE_VALUE = Integer.parseInt((String) value); + + else if (argument.equals("low_quality_tail")) + RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value); + + else if (argument.equals("default_platform")) + RAC.DEFAULT_PLATFORM = (String) value; + + else if (argument.equals("force_platform")) + RAC.FORCE_PLATFORM = (String) value; + + else if (argument.equals("quantizing_levels")) + RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value); + + else if (argument.equals("recalibration_report")) + RAC.existingRecalibrationReport = (value == null) ? null : new File((String) value); + + else if (argument.equals("binary_tag_name")) + RAC.BINARY_TAG_NAME = (value == null) ? null : (String) value; + + else if (argument.equals("sort_by_all_columns")) + RAC.SORT_BY_ALL_COLUMNS = Boolean.parseBoolean((String) value); + } + + return RAC; + } + + /** + * this functionality avoids recalculating the empirical qualities, estimated reported quality + * and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer. + */ + public void calculateQuantizedQualities() { + quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS); + } + + /** + * Creates the recalibration report. Report can then be written to a stream via GATKReport.print(PrintStream). 
+ * + * @return newly created recalibration report + */ + public GATKReport createGATKReport() { + return RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); + } + + public RecalibrationArgumentCollection getRAC() { + return RAC; + } + + /** + * + * @deprecated use {@link #getRequestedCovariates()} instead. + */ + @Deprecated + public Covariate[] getCovariates() { + return requestedCovariates; + } + + /** + * @return true if the report has no data + */ + public boolean isEmpty() { + return recalibrationTables.isEmpty(); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTables.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTables.java new file mode 100644 index 000000000..ad227f9bd --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTables.java @@ -0,0 +1,169 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. 
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Ensures; +import org.broadinstitute.gatk.utils.collections.LoggingNestedIntegerArray; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; + +import java.io.PrintStream; +import java.util.ArrayList; + +/** + * Utility class to facilitate on-the-fly base quality score recalibration. + * + * User: ebanks + * Date: 6/20/12 + */ + +public final class RecalibrationTables { + public enum TableType { + READ_GROUP_TABLE, + QUALITY_SCORE_TABLE, + OPTIONAL_COVARIATE_TABLES_START; + } + + private final ArrayList> tables; + private final int qualDimension; + private final int eventDimension = EventType.values().length; + private final int numReadGroups; + private final PrintStream log; + + public RecalibrationTables(final Covariate[] covariates) { + this(covariates, covariates[TableType.READ_GROUP_TABLE.ordinal()].maximumKeyValue() + 1, null); + } + + public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) { + this(covariates, numReadGroups, null); + } + + public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) { + tables = new ArrayList>(covariates.length); + for ( int i = 0; i < covariates.length; i++ ) + tables.add(i, null); // initialize so we can set below + + qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.ordinal()].maximumKeyValue() + 1; + this.numReadGroups = numReadGroups; + this.log = log; + + tables.set(TableType.READ_GROUP_TABLE.ordinal(), + log == null ? 
new NestedIntegerArray(numReadGroups, eventDimension) : + new LoggingNestedIntegerArray(log, "READ_GROUP_TABLE", numReadGroups, eventDimension)); + + tables.set(TableType.QUALITY_SCORE_TABLE.ordinal(), makeQualityScoreTable()); + + for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < covariates.length; i++) + tables.set(i, + log == null ? new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) : + new LoggingNestedIntegerArray(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + 1), + numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension)); + } + + @Ensures("result != null") + public NestedIntegerArray getReadGroupTable() { + return getTable(TableType.READ_GROUP_TABLE.ordinal()); + } + + @Ensures("result != null") + public NestedIntegerArray getQualityScoreTable() { + return getTable(TableType.QUALITY_SCORE_TABLE.ordinal()); + } + + @Ensures("result != null") + public NestedIntegerArray getTable(final int index) { + return tables.get(index); + } + + @Ensures("result >= 0") + public int numTables() { + return tables.size(); + } + + /** + * @return true if all the tables contain no RecalDatums + */ + public boolean isEmpty() { + for( final NestedIntegerArray table : tables ) { + if( !table.getAllValues().isEmpty() ) { return false; } + } + return true; + } + + /** + * Allocate a new quality score table, based on requested parameters + * in this set of tables, without any data in it. The return result + * of this table is suitable for acting as a thread-local cache + * for quality score values + * @return a newly allocated, empty read group x quality score table + */ + public NestedIntegerArray makeQualityScoreTable() { + return log == null + ? 
new NestedIntegerArray(numReadGroups, qualDimension, eventDimension) + : new LoggingNestedIntegerArray(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension); + } + + /** + * Merge all of the tables from toMerge into into this set of tables + */ + public void combine(final RecalibrationTables toMerge) { + if ( numTables() != toMerge.numTables() ) + throw new IllegalArgumentException("Attempting to merge RecalibrationTables with different sizes"); + + for ( int i = 0; i < numTables(); i++ ) { + final NestedIntegerArray myTable = this.getTable(i); + final NestedIntegerArray otherTable = toMerge.getTable(i); + RecalUtils.combineTables(myTable, otherTable); + } + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ContextCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ContextCovariate.java new file mode 100644 index 000000000..f1ef944dc --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ContextCovariate.java @@ -0,0 +1,304 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. 
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.clipping.ClippingRepresentation; +import org.broadinstitute.gatk.utils.clipping.ReadClipper; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: 9/26/11 + */ + +public class ContextCovariate implements StandardCovariate { + private final static Logger logger = Logger.getLogger(ContextCovariate.class); + + + + private int mismatchesContextSize; + private int indelsContextSize; + + private int mismatchesKeyMask; + private int indelsKeyMask; + + private static final int LENGTH_BITS = 4; + private static final int LENGTH_MASK = 15; + + // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are + // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base. 
+ static final private int MAX_DNA_CONTEXT = 13; + private byte LOW_QUAL_TAIL; + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + mismatchesContextSize = RAC.MISMATCHES_CONTEXT_SIZE; + indelsContextSize = RAC.INDELS_CONTEXT_SIZE; + + logger.info("\t\tContext sizes: base substitution model " + mismatchesContextSize + ", indel substitution model " + indelsContextSize); + + if (mismatchesContextSize > MAX_DNA_CONTEXT) + throw new UserException.BadArgumentValue("mismatches_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, mismatchesContextSize)); + if (indelsContextSize > MAX_DNA_CONTEXT) + throw new UserException.BadArgumentValue("indels_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, indelsContextSize)); + + LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL; + + if (mismatchesContextSize <= 0 || indelsContextSize <= 0) + throw new UserException(String.format("Context size must be positive, if you don't want to use the context covariate, just turn it off instead. 
Mismatches: %d Indels: %d", mismatchesContextSize, indelsContextSize)); + + mismatchesKeyMask = createMask(mismatchesContextSize); + indelsKeyMask = createMask(indelsContextSize); + } + + @Override + public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { + + // store the original bases and then write Ns over low quality ones + final byte[] originalBases = read.getReadBases().clone(); + // Write N's over the low quality tail of the reads to avoid adding them into the context + final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); + + final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag(); + byte[] bases = clippedRead.getReadBases(); + if (negativeStrand) + bases = BaseUtils.simpleReverseComplement(bases); + + final ArrayList mismatchKeys = contextWith(bases, mismatchesContextSize, mismatchesKeyMask); + final ArrayList indelKeys = contextWith(bases, indelsContextSize, indelsKeyMask); + + final int readLength = bases.length; + + // this is necessary to ensure that we don't keep historical data in the ReadCovariates values + // since the context covariate may not span the entire set of values in read covariates + // due to the clipping of the low quality bases + if ( readLength != originalBases.length ) { + // don't both zeroing out if we are going to overwrite the whole array + for ( int i = 0; i < originalBases.length; i++ ) + // this base has been clipped off, so zero out the covariate values here + values.addCovariate(0, 0, 0, i); + } + + for (int i = 0; i < readLength; i++) { + final int readOffset = (negativeStrand ? 
readLength - i - 1 : i); + final int indelKey = indelKeys.get(i); + values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, readOffset); + } + + // put the original bases back in + read.setReadBases(originalBases); + } + + // Used to get the covariate's value from input csv file during on-the-fly recalibration + @Override + public final Object getValue(final String str) { + return str; + } + + @Override + public String formatKey(final int key) { + if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file + return null; + + return contextFromKey(key); + } + + @Override + public int keyFromValue(final Object value) { + return keyFromContext((String) value); + } + + private static int createMask(final int contextSize) { + int mask = 0; + // create 2*contextSize worth of bits + for (int i = 0; i < contextSize; i++) + mask = (mask << 2) | 3; + // shift 4 bits to mask out the bits used to encode the length + return mask << LENGTH_BITS; + } + + /** + * calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion) + * + * @param bases the bases in the read to build the context from + * @param contextSize context size to use building the context + * @param mask mask for pulling out just the context bits + */ + private static ArrayList contextWith(final byte[] bases, final int contextSize, final int mask) { + + final int readLength = bases.length; + final ArrayList keys = new ArrayList(readLength); + + // the first contextSize-1 bases will not have enough previous context + for (int i = 1; i < contextSize && i <= readLength; i++) + keys.add(-1); + + if (readLength < contextSize) + return keys; + + final int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS; + + // get (and add) the key for the context starting at the first base + int currentKey = keyFromContext(bases, 0, contextSize); + keys.add(currentKey); + + // if the first key was -1 then there was an N in the context; 
figure out how many more consecutive contexts it affects + int currentNPenalty = 0; + if (currentKey == -1) { + currentKey = 0; + currentNPenalty = contextSize - 1; + int offset = newBaseOffset; + while (bases[currentNPenalty] != 'N') { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentNPenalty]); + currentKey |= (baseIndex << offset); + offset -= 2; + currentNPenalty--; + } + } + + for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentIndex]); + if (baseIndex == -1) { // ignore non-ACGT bases + currentNPenalty = contextSize; + currentKey = 0; // reset the key + } else { + // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length in + currentKey = (currentKey >> 2) & mask; + currentKey |= (baseIndex << newBaseOffset); + currentKey |= contextSize; + } + + if (currentNPenalty == 0) { + keys.add(currentKey); + } else { + currentNPenalty--; + keys.add(-1); + } + } + + return keys; + } + + public static int keyFromContext(final String dna) { + return keyFromContext(dna.getBytes(), 0, dna.length()); + } + + /** + * Creates a int representation of a given dna string. + * + * @param dna the dna sequence + * @param start the start position in the byte array (inclusive) + * @param end the end position in the array (exclusive) + * @return the key representing the dna sequence + */ + private static int keyFromContext(final byte[] dna, final int start, final int end) { + + int key = end - start; + int bitOffset = LENGTH_BITS; + for (int i = start; i < end; i++) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(dna[i]); + if (baseIndex == -1) // ignore non-ACGT bases + return -1; + key |= (baseIndex << bitOffset); + bitOffset += 2; + } + return key; + } + + /** + * Converts a key into the dna string representation. 
+ * + * @param key the key representing the dna sequence + * @return the dna sequence represented by the key + */ + public static String contextFromKey(final int key) { + if (key < 0) + throw new ReviewedGATKException("dna conversion cannot handle negative numbers. Possible overflow?"); + + final int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context + int mask = 48; // use the mask to pull out bases + int offset = LENGTH_BITS; + + StringBuilder dna = new StringBuilder(); + for (int i = 0; i < length; i++) { + final int baseIndex = (key & mask) >> offset; + dna.append((char)BaseUtils.baseIndexToSimpleBase(baseIndex)); + mask = mask << 2; // move the mask over to the next 2 bits + offset += 2; + } + + return dna.toString(); + } + + @Override + public int maximumKeyValue() { + // the maximum value is T (11 in binary) for each base in the context + int length = Math.max(mismatchesContextSize, indelsContextSize); // the length of the context + int key = length; + int bitOffset = LENGTH_BITS; + for (int i = 0; i DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS); + private static final EnumSet FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT); + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + this.MAXIMUM_CYCLE_VALUE = RAC.MAXIMUM_CYCLE_VALUE; + + if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM)) + throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform."); + + if (RAC.DEFAULT_PLATFORM != null) + default_platform = RAC.DEFAULT_PLATFORM; + } + + // Used to pick out the covariate's value from attributes of the read + @Override + public void recordValues(final GATKSAMRecord read, final 
ReadCovariates values) { + final int readLength = read.getReadLength(); + final NGSPlatform ngsPlatform = default_platform == null ? read.getNGSPlatform() : NGSPlatform.fromReadGroupPL(default_platform); + + // Discrete cycle platforms + if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) { + final int readOrderFactor = read.getReadPairedFlag() && read.getSecondOfPairFlag() ? -1 : 1; + final int increment; + int cycle; + if (read.getReadNegativeStrandFlag()) { + cycle = readLength * readOrderFactor; + increment = -1 * readOrderFactor; + } + else { + cycle = readOrderFactor; + increment = readOrderFactor; + } + + final int MAX_CYCLE_FOR_INDELS = readLength - CUSHION_FOR_INDELS - 1; + for (int i = 0; i < readLength; i++) { + final int substitutionKey = keyFromCycle(cycle); + final int indelKey = (i < CUSHION_FOR_INDELS || i > MAX_CYCLE_FOR_INDELS) ? -1 : substitutionKey; + values.addCovariate(substitutionKey, indelKey, indelKey, i); + cycle += increment; + } + } + + // Flow cycle platforms + else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) { + + final byte[] bases = read.getReadBases(); + + // Differentiate between first and second of pair. + // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group + // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. + // Therefore the cycle covariate must differentiate between first and second of pair reads. + // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because + // the current sequential model would consider the effects independently instead of jointly. + final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); + + int cycle = multiplyByNegative1 ? -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms. 
+ + // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change + // For example, AAAAAAA was probably read in two flow cycles but here we count it as one + if (!read.getReadNegativeStrandFlag()) { // Forward direction + int iii = 0; + while (iii < readLength) { + while (iii < readLength && bases[iii] == (byte) 'T') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'A') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'C') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii++; + } + while (iii < readLength && bases[iii] == (byte) 'G') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii++; + } + if (iii < readLength) { + if (multiplyByNegative1) + cycle--; + else + cycle++; + } + if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii++; + } + + } + } + else { // Negative direction + int iii = readLength - 1; + while (iii >= 0) { + while (iii >= 0 && bases[iii] == (byte) 'T') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'A') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'C') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii--; + } + while (iii >= 0 && bases[iii] == (byte) 'G') { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii--; + } + if (iii >= 0) { + if (multiplyByNegative1) + cycle--; + else + cycle++; + } + if (iii >= 0 && 
!BaseUtils.isRegularBase(bases[iii])) { + final int key = keyFromCycle(cycle); + values.addCovariate(key, key, key, iii); + iii--; + } + } + } + } + + // Unknown platforms + else { + throw new UserException("The platform (" + read.getReadGroup().getPlatform() + + ") associated with read group " + read.getReadGroup() + + " is not a recognized platform. Allowable options are " + NGSPlatform.knownPlatformsString()); + } + } + + // Used to get the covariate's value from input csv file during on-the-fly recalibration + @Override + public final Object getValue(final String str) { + return Integer.parseInt(str); + } + + @Override + public String formatKey(final int key) { + int cycle = key >> 1; // shift so we can remove the "sign" bit + if ( (key & 1) != 0 ) // is the last bit set? + cycle *= -1; // then the cycle is negative + return String.format("%d", cycle); + } + + @Override + public int keyFromValue(final Object value) { + return (value instanceof String) ? keyFromCycle(Integer.parseInt((String) value)) : keyFromCycle((Integer) value); + } + + @Override + public int maximumKeyValue() { + return (MAXIMUM_CYCLE_VALUE << 1) + 1; + } + + private int keyFromCycle(final int cycle) { + // no negative values because values must fit into the first few bits of the long + int result = Math.abs(cycle); + if ( result > MAXIMUM_CYCLE_VALUE ) + throw new UserException("The maximum allowed value for the cycle is " + MAXIMUM_CYCLE_VALUE + ", but a larger cycle (" + result + ") was detected. 
Please use the --maximum_cycle_value argument to increase this value (at the expense of requiring more memory to run)"); + + result = result << 1; // shift so we can add the "sign" bit + if ( cycle < 0 ) + result++; // negative cycles get the lower-most bit set + return result; + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ExperimentalCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ExperimentalCovariate.java new file mode 100644 index 000000000..c276f43ec --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ExperimentalCovariate.java @@ -0,0 +1,81 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public interface ExperimentalCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/QualityScoreCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/QualityScoreCovariate.java new file mode 100644 index 000000000..889e00b9a --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/QualityScoreCovariate.java @@ -0,0 +1,129 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; +import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Nov 3, 2009 + * + * The Reported Quality Score covariate. + */ + +public class QualityScoreCovariate implements RequiredCovariate { + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) {} + + @Override + public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { + final byte[] baseQualities = read.getBaseQualities(); + final byte[] baseInsertionQualities = read.getBaseInsertionQualities(); + final byte[] baseDeletionQualities = read.getBaseDeletionQualities(); + + for (int i = 0; i < baseQualities.length; i++) { + values.addCovariate((int)baseQualities[i], (int)baseInsertionQualities[i], (int)baseDeletionQualities[i], i); + } + } + + // Used to get the covariate's value from input csv file during on-the-fly recalibration + @Override + public final Object getValue(final String str) { + return Byte.parseByte(str); + } + + @Override + public String formatKey(final int key) { + return String.format("%d", key); + } + + @Override + public int keyFromValue(final Object value) { + return (value instanceof String) ? 
(int)Byte.parseByte((String) value) : (int)(Byte) value; + } + + @Override + public int maximumKeyValue() { + return QualityUtils.MAX_SAM_QUAL_SCORE; + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ReadGroupCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ReadGroupCovariate.java new file mode 100644 index 000000000..9f4c34463 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/ReadGroupCovariate.java @@ -0,0 +1,190 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Created by IntelliJ IDEA. + * User: rpoplin + * Date: Oct 30, 2009 + * + * The Read Group covariate. + */ + +public class ReadGroupCovariate implements RequiredCovariate { + + private final HashMap readGroupLookupTable = new HashMap(); + private final HashMap readGroupReverseLookupTable = new HashMap(); + private int nextId = 0; + private String forceReadGroup; + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + forceReadGroup = RAC.FORCE_READGROUP; + } + + @Override + public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { + final String readGroupId = readGroupValueFromRG(read.getReadGroup()); + final int key = keyForReadGroup(readGroupId); + + final int l = read.getReadLength(); + for (int i = 0; i < l; i++) + values.addCovariate(key, key, key, i); + } + + @Override + public final Object getValue(final String str) { + return str; + } + + @Override + public synchronized String formatKey(final int key) { + // This method is synchronized so that we don't attempt to do a get() + // from the reverse lookup table while that table is being updated + return readGroupReverseLookupTable.get(key); + } + + @Override + public int keyFromValue(final Object value) { + return keyForReadGroup((String) value); + } + + /** + * Get the mapping from read group names to integer key values for all read groups in this covariate + * 
@return a set of mappings from read group names -> integer key values + */ + public Set> getKeyMap() { + return readGroupLookupTable.entrySet(); + } + + private int keyForReadGroup(final String readGroupId) { + // Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR), + // synchronize only the table updates. + + // Before entering the synchronized block, check to see if this read group is not in our tables. + // If it's not, either we will have to insert it, OR another thread will insert it first. + // This preliminary check avoids doing any synchronization most of the time. + if ( ! readGroupLookupTable.containsKey(readGroupId) ) { + + synchronized ( this ) { + + // Now we need to make sure the key is STILL not there, since another thread may have come along + // and inserted it while we were waiting to enter this synchronized block! + if ( ! readGroupLookupTable.containsKey(readGroupId) ) { + readGroupLookupTable.put(readGroupId, nextId); + readGroupReverseLookupTable.put(nextId, readGroupId); + nextId++; + } + } + } + + return readGroupLookupTable.get(readGroupId); + } + + @Override + public synchronized int maximumKeyValue() { + // Synchronized so that we don't query table size while the tables are being updated + return readGroupLookupTable.size() - 1; + } + + /** + * If the sample has a PU tag annotation, return that. If not, return the read group id. + * + * @param rg the read group record + * @return platform unit or readgroup id + */ + private String readGroupValueFromRG(final GATKSAMReadGroupRecord rg) { + if ( forceReadGroup != null ) + return forceReadGroup; + + final String platformUnit = rg.getPlatformUnit(); + return platformUnit == null ? 
rg.getId() : platformUnit; + } + +} + + diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatCovariate.java new file mode 100644 index 000000000..64b32d766 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatCovariate.java @@ -0,0 +1,285 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.gatk.engine.recalibration.RecalibrationArgumentCollection; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.collections.Pair; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +public abstract class RepeatCovariate implements ExperimentalCovariate { + protected int MAX_REPEAT_LENGTH; + protected int MAX_STR_UNIT_LENGTH; + private final HashMap repeatLookupTable = new HashMap(); + private final HashMap repeatReverseLookupTable = new HashMap(); + private int nextId = 0; + + // Initialize any member variables using the command-line arguments passed to the walkers + @Override + public void initialize(final RecalibrationArgumentCollection RAC) { + MAX_STR_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; + MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH; + } + + public void initialize(final int MAX_STR_UNIT_LENGTH, final int 
MAX_REPEAT_LENGTH) { + this.MAX_STR_UNIT_LENGTH = MAX_STR_UNIT_LENGTH; + this.MAX_REPEAT_LENGTH = MAX_REPEAT_LENGTH; + } + + @Override + public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { + // store the original bases and then write Ns over low quality ones + final byte[] originalBases = read.getReadBases().clone(); + + final boolean negativeStrand = read.getReadNegativeStrandFlag(); + byte[] bases = read.getReadBases(); + if (negativeStrand) + bases = BaseUtils.simpleReverseComplement(bases); + + // don't record reads with N's + if (!BaseUtils.isAllRegularBases(bases)) + return; + + for (int i = 0; i < bases.length; i++) { + final Pair res = findTandemRepeatUnits(bases, i); + // to merge repeat unit and repeat length to get covariate value: + final String repeatID = getCovariateValueFromUnitAndLength(res.first, res.second); + final int key = keyForRepeat(repeatID); + + final int readOffset = (negativeStrand ? bases.length - i - 1 : i); + values.addCovariate(key, key, key, readOffset); + } + + // put the original bases back in + read.setReadBases(originalBases); + + } + + public Pair findTandemRepeatUnits(byte[] readBases, int offset) { + int maxBW = 0; + byte[] bestBWRepeatUnit = new byte[]{readBases[offset]}; + for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) { + // fix repeat unit length + //edge case: if candidate tandem repeat unit falls beyond edge of read, skip + if (offset+1-str < 0) + break; + + // get backward repeat unit and # repeats + byte[] backwardRepeatUnit = Arrays.copyOfRange(readBases, offset - str + 1, offset + 1); + maxBW = GATKVariantContextUtils.findNumberOfRepetitions(backwardRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); + if (maxBW > 1) { + bestBWRepeatUnit = backwardRepeatUnit.clone(); + break; + } + } + byte[] bestRepeatUnit = bestBWRepeatUnit; + int maxRL = maxBW; + + if (offset < readBases.length-1) { + byte[] bestFWRepeatUnit = new byte[]{readBases[offset+1]}; + int maxFW = 0; + 
for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) { + // fix repeat unit length + //edge case: if candidate tandem repeat unit falls beyond edge of read, skip + if (offset+str+1 > readBases.length) + break; + + // get forward repeat unit and # repeats + byte[] forwardRepeatUnit = Arrays.copyOfRange(readBases, offset +1, offset+str+1); + maxFW = GATKVariantContextUtils.findNumberOfRepetitions(forwardRepeatUnit, Arrays.copyOfRange(readBases, offset + 1, readBases.length), true); + if (maxFW > 1) { + bestFWRepeatUnit = forwardRepeatUnit.clone(); + break; + } + } + // if FW repeat unit = BW repeat unit it means we're in the middle of a tandem repeat - add FW and BW components + if (Arrays.equals(bestFWRepeatUnit, bestBWRepeatUnit)) { + maxRL = maxBW + maxFW; + bestRepeatUnit = bestFWRepeatUnit; // arbitrary + } + else { + // tandem repeat starting forward from current offset. + // It could be the case that best BW unit was different from FW unit, but that BW still contains FW unit. + // For example, TTCTT(C) CCC - at (C) place, best BW unit is (TTC)2, best FW unit is (C)3. + // but correct representation at that place might be (C)4. 
+ // Hence, if the FW and BW units don't match, check if BW unit can still be a part of FW unit and add + // representations to total + maxBW = GATKVariantContextUtils.findNumberOfRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); + maxRL = maxFW + maxBW; + bestRepeatUnit = bestFWRepeatUnit; + + } + + } + + + + if(maxRL > MAX_REPEAT_LENGTH) { maxRL = MAX_REPEAT_LENGTH; } + return new Pair(bestRepeatUnit, maxRL); + + } + @Override + public final Object getValue(final String str) { + return str; + } + + @Override + public synchronized String formatKey(final int key) { + // This method is synchronized so that we don't attempt to do a get() + // from the reverse lookup table while that table is being updated + return repeatReverseLookupTable.get(key); + } + + @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) + @Ensures("result != null") + protected abstract String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength); + + + @Override + public int keyFromValue(final Object value) { + return keyForRepeat((String) value); + } + + /** + * Get the mapping from read group names to integer key values for all read groups in this covariate + * @return a set of mappings from read group names -> integer key values + */ + public Set> getKeyMap() { + return repeatLookupTable.entrySet(); + } + + private int keyForRepeat(final String repeatID) { + // Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR), + // synchronize only the table updates. + + // Before entering the synchronized block, check to see if this read group is not in our tables. + // If it's not, either we will have to insert it, OR another thread will insert it first. + // This preliminary check avoids doing any synchronization most of the time. + if ( ! 
repeatLookupTable.containsKey(repeatID) ) { + + synchronized ( this ) { + + // Now we need to make sure the key is STILL not there, since another thread may have come along + // and inserted it while we were waiting to enter this synchronized block! + if ( ! repeatLookupTable.containsKey(repeatID) ) { + repeatLookupTable.put(repeatID, nextId); + repeatReverseLookupTable.put(nextId, repeatID); + nextId++; + } + } + } + + return repeatLookupTable.get(repeatID); + } + + + /** + * Splits repeat unit and num repetitions from covariate value. + * For example, if value is "ATG4" it returns (ATG,4) + * @param value Covariate value + * @return Split pair + */ + @Requires("value != null") + @Ensures({"result.first != null","result.second>=0"}) + public static Pair getRUandNRfromCovariate(final String value) { + + int k = 0; + for ( k=0; k < value.length(); k++ ) { + if (!BaseUtils.isRegularBase(value.getBytes()[k])) + break; + } + Integer nr = Integer.valueOf(value.substring(k,value.length())); // will throw NumberFormatException if format illegal + if (k == value.length() || nr <= 0) + throw new IllegalStateException("Covariate is not of form (Repeat Unit) + Integer"); + + return new Pair(value.substring(0,k), nr); + } + + /** + * Gets bases from tandem repeat representation (Repeat Unit),(Number of Repeats). 
+ * For example, (AGC),3 returns AGCAGCAGC + * @param repeatUnit Tandem repeat unit + * @param numRepeats Number of repeats + * @return Expanded String + */ + @Requires({"numRepeats > 0","repeatUnit != null"}) + @Ensures("result != null") + public static String getBasesFromRUandNR(final String repeatUnit, final int numRepeats) { + final StringBuilder sb = new StringBuilder(); + + for (int i=0; i < numRepeats; i++) + sb.append(repeatUnit); + + return sb.toString(); + } + + // version given covariate key + public static String getBasesFromRUandNR(final String covariateValue) { + Pair pair = getRUandNRfromCovariate(covariateValue); + return getBasesFromRUandNR(pair.getFirst(), pair.getSecond()); + } + + @Override + public abstract int maximumKeyValue(); + + + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatLengthCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatLengthCovariate.java new file mode 100644 index 000000000..fb6aeaf85 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatLengthCovariate.java @@ -0,0 +1,74 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +public class RepeatLengthCovariate extends RepeatCovariate { + + @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) + @Ensures("result != null") + protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { + return String.format("%d",repeatLength); + } + + @Override + public synchronized int maximumKeyValue() { + // Synchronized so that we don't query table size while the tables are being updated + //return repeatLookupTable.size() - 1; + // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, + // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values + return (1+MAX_REPEAT_LENGTH); + } + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitAndLengthCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitAndLengthCovariate.java new file mode 100644 index 000000000..10a7f6672 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitAndLengthCovariate.java @@ -0,0 +1,75 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + + +public class RepeatUnitAndLengthCovariate extends RepeatCovariate { + + @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) + @Ensures("result != null") + protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { + return new String(repeatFromUnitAndLength) + String.format("%d",repeatLength); + } + + @Override + public synchronized int maximumKeyValue() { + // Synchronized so that we don't query table size while the tables are being updated + //return repeatLookupTable.size() - 1; + // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, + // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values + return (1<<(2*MAX_STR_UNIT_LENGTH)) * MAX_REPEAT_LENGTH +1; + } + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitCovariate.java new file mode 100644 index 000000000..d961b1460 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RepeatUnitCovariate.java @@ -0,0 +1,78 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +/** + * Created with IntelliJ IDEA. 
+ * User: rpoplin + * Date: 11/3/12 + */ + +public class RepeatUnitCovariate extends RepeatCovariate { + + protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { + return new String(repeatFromUnitAndLength); + + } + + + @Override + public synchronized int maximumKeyValue() { + // Synchronized so that we don't query table size while the tables are being updated + //return repeatLookupTable.size() - 1; + // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, + // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values + return (1<<(2*MAX_STR_UNIT_LENGTH)) +1; + } + + +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RequiredCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RequiredCovariate.java new file mode 100644 index 000000000..8f2155ff2 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/RequiredCovariate.java @@ -0,0 +1,81 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. 
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +/** + * [Short one sentence description of this walker] + *

+ *
+ * Marker (tagging) interface for recalibration covariates. It declares no
+ * methods of its own beyond {@link Covariate}; implementing it flags a
+ * covariate as required — presumably always enabled by the recalibration
+ * engine rather than user-selectable (NOTE(review): confirm against the
+ * engine's covariate discovery logic).
+ *
+ * + * @author Your Name + * @since Date created + */ +public interface RequiredCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/StandardCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/StandardCovariate.java new file mode 100644 index 000000000..82e2bd199 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/engine/recalibration/covariates/StandardCovariate.java @@ -0,0 +1,81 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration.covariates; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public interface StandardCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java index 949b61ec1..d8e30983e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleCountBySample.java @@ -58,9 +58,9 @@ import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineCount; import htsjdk.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeAlleleCounts; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java index e1ffbb0f3..6f0bba28e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java @@ -52,9 +52,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; @@ -65,6 +65,7 @@ import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextUtils; +import org.broadinstitute.gatk.utils.variant.ChromosomeCountConstants; import java.util.*; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java index fad666f80..8a0777245 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/Coverage.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import 
org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerAlleleBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerAlleleBySample.java index 6fd39555e..f1848260d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerAlleleBySample.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java index 
d71d58853..3cfa0a94a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java index 0972038d6..1d051d5da 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java @@ -52,23 +52,15 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import cern.jet.math.Arithmetic; -import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypesContext; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import 
org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.QualityUtils; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.gatk.utils.pileup.PileupElement; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import java.util.*; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java index 54535e32c..6d3dfd6d8 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GCContent.java @@ -51,19 +51,15 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import 
org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.help.HelpConstants; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.VariantContext; import java.util.Arrays; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java index 2460e45be..f3a95e6ae 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/GenotypeSummaries.java @@ -51,15 +51,14 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import 
htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeaderLineType; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java index 9b5778c1d..68da5b951 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HaplotypeScore.java @@ -51,10 +51,10 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; @@ -70,7 +70,6 @@ import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java index b511a1b90..07bcd5079 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HardyWeinberg.java @@ -52,13 +52,12 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.tribble.util.popgen.HardyWeinbergCalculation; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.WorkInProgressAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.QualityUtils; import htsjdk.variant.vcf.VCFHeaderLineType; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HomopolymerRun.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HomopolymerRun.java index 4c77ffff4..a0148eac2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HomopolymerRun.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HomopolymerRun.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java index e44ff0635..a89d2ea84 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java @@ -52,9 +52,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Walker; import 
org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java index a30924187..185b5b59c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MVLikelihoodRatio.java @@ -51,16 +51,16 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.samples.Trio; import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.MendelianViolation; +import org.broadinstitute.gatk.engine.samples.MendelianViolation; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.gatk.utils.exceptions.UserException; diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java index 49bc74161..57f42ef4b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZero.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/PossibleDeNovo.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/PossibleDeNovo.java index 1f677c6d0..408217af8 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/PossibleDeNovo.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/PossibleDeNovo.java @@ -51,10 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import 
org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.samples.Sample; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.samples.Trio; import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; @@ -62,7 +61,7 @@ import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAn import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.MendelianViolation; +import org.broadinstitute.gatk.engine.samples.MendelianViolation; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.gatk.utils.exceptions.UserException; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java index 004e5d18f..49db309f3 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java @@ -51,10 +51,10 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import 
org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; @@ -189,7 +189,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( QD < MAX_QD_BEFORE_FIXING ) { return QD; } else { - return IDEAL_HIGH_QD + GenomeAnalysisEngine.getRandomGenerator().nextGaussian() * JITTER_SIGMA; + return IDEAL_HIGH_QD + Utils.getRandomGenerator().nextGaussian() * JITTER_SIGMA; } } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java index fbeea3331..038545cf4 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import 
org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java index eb70a19a2..c257a05ff 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java @@ -52,9 +52,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java index b0f298048..e82b485f3 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SampleList.java @@ -51,9 +51,9 @@ package 
org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java index 6b7b21b30..f996dd991 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SpanningDeletions.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java index 143b16edd..0da2932f5 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java index 3791b7912..4db1fbaa7 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java @@ -55,12 +55,11 @@ import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeaderLine; -import 
htsjdk.variant.vcf.VCFInfoHeaderLine; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import htsjdk.variant.variantcontext.Genotype; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java index 41f34b5f8..a8714aca2 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java @@ -51,12 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypesContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; import 
org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TandemRepeatAnnotator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TandemRepeatAnnotator.java index 4163f7bb7..81d5ee9d0 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TandemRepeatAnnotator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TandemRepeatAnnotator.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java index 430c71597..932eced35 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/TransmissionDisequilibriumTest.java @@ -51,12 +51,11 @@ package org.broadinstitute.gatk.tools.walkers.annotator; 
-import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.samples.Sample; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantType.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantType.java index 49adb5161..5b4bab87e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantType.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantType.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import 
org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java index 45ab38542..ca0dcb8db 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/AnalyzeCovariates.java @@ -56,17 +56,17 @@ import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.recalibration.RecalUtils; -import org.broadinstitute.gatk.utils.recalibration.RecalibrationReport; -import org.broadinstitute.gatk.utils.recalibration.BaseRecalibration; +import org.broadinstitute.gatk.engine.recalibration.RecalUtils; +import org.broadinstitute.gatk.engine.recalibration.RecalibrationReport; +import 
org.broadinstitute.gatk.engine.recalibration.BaseRecalibration; import java.io.File; import java.io.FileNotFoundException; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGatherer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGatherer.java deleted file mode 100644 index d9f59d856..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGatherer.java +++ /dev/null @@ -1,139 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.tools.walkers.bqsr; - -import org.apache.commons.collections.CollectionUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.commandline.Gatherer; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.recalibration.RecalibrationReport; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.*; - -/** - * User: carneiro - * Date: 3/29/11 - */ - - -public class BQSRGatherer extends Gatherer { - - private static final Logger logger = Logger.getLogger(BQSRGatherer.class); - private static final String EMPTY_INPUT_LIST = "list of inputs files is empty or there is no usable data in any input file"; - private static final String MISSING_OUTPUT_FILE = "missing output file name"; - private static final String MISSING_READ_GROUPS = "Missing read group(s)"; - - @Override - public void gather(final List inputs, final File output) { - final PrintStream outputFile; - try { - outputFile = new PrintStream(output); - } catch(FileNotFoundException e) { - throw new UserException.MissingArgument("output", 
MISSING_OUTPUT_FILE); - } - final GATKReport report = gatherReport(inputs); - report.print(outputFile); - } - - /** - * Gathers the input recalibration reports into a single report. - * - * @param inputs Input recalibration GATK reports - * @return gathered recalibration GATK report - */ - public static GATKReport gatherReport(final List inputs) { - final SortedSet allReadGroups = new TreeSet(); - final LinkedHashMap> inputReadGroups = new LinkedHashMap>(); - - // Get the read groups from each input report - for (final File input : inputs) { - final Set readGroups = RecalibrationReport.getReadGroups(input); - inputReadGroups.put(input, readGroups); - allReadGroups.addAll(readGroups); - } - - // Log the read groups that are missing from specific inputs - for (Map.Entry> entry: inputReadGroups.entrySet()) { - final File input = entry.getKey(); - final Set readGroups = entry.getValue(); - if (allReadGroups.size() != readGroups.size()) { - // Since this is not completely unexpected, more than debug, but less than a proper warning. 
- logger.info(MISSING_READ_GROUPS + ": " + input.getAbsolutePath()); - for (final Object readGroup: CollectionUtils.subtract(allReadGroups, readGroups)) { - logger.info(" " + readGroup); - } - } - } - - RecalibrationReport generalReport = null; - for (File input : inputs) { - final RecalibrationReport inputReport = new RecalibrationReport(input, allReadGroups); - if( inputReport.isEmpty() ) { continue; } - - if (generalReport == null) - generalReport = inputReport; - else - generalReport.combine(inputReport); - } - if (generalReport == null) - throw new ReviewedGATKException(EMPTY_INPUT_LIST); - - generalReport.calculateQuantizedQualities(); - - return generalReport.createGATKReport(); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java index fd87c7f31..3aa4ba5dd 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/BaseRecalibrator.java @@ -55,15 +55,16 @@ import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.CigarElement; import htsjdk.samtools.SAMFileHeader; import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.engine.recalibration.*; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.utils.commandline.Advanced; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.filters.*; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import 
org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.baq.BAQ; @@ -74,7 +75,7 @@ import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.recalibration.*; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.sam.ReadUtils; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfo.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfo.java index 53d26c176..c3914216d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfo.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfo.java @@ -55,7 +55,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.recalibration.EventType; -import org.broadinstitute.gatk.utils.recalibration.ReadCovariates; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; /** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationArgumentCollection.java deleted file mode 
100644 index 4bc4af2e4..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationArgumentCollection.java +++ /dev/null @@ -1,420 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. 
LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. 
Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.tools.walkers.bqsr; - -import com.google.java.contract.Requires; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.GATKException; -import org.broadinstitute.gatk.utils.recalibration.RecalUtils; - -import java.io.File; -import java.io.PrintStream; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 27, 2009 - * - * A collection of the arguments that are used for BQSR. Used to be common to both CovariateCounterWalker and TableRecalibrationWalker. - * This set of arguments will also be passed to the constructor of every Covariate when it is instantiated. - */ - -public class RecalibrationArgumentCollection implements Cloneable { - - /** - * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference, - * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.) - * for use as this database. 
For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites. - * Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument. - */ - @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false) - public List> knownSites = Collections.emptyList(); - - /** - * After the header, data records occur one per line until the end of the file. The first several items on a line are the - * values of the individual covariates and will change depending on which covariates were specified at runtime. The last - * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, - * and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out. - */ - @Gather(BQSRGatherer.class) - @Output(doc = "The output recalibration table file to create", required = true) - public File RECAL_TABLE_FILE = null; - public PrintStream RECAL_TABLE; - - /** - * Note that the --list argument requires a fully resolved and correct command-line to work. - */ - @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false) - public boolean LIST_ONLY = false; - - /** - * Note that the ReadGroup and QualityScore covariates are required and do not need to be specified. - * Also, unless --no_standard_covs is specified, the Cycle and Context covariates are standard and are included by default. - * Use the --list argument to see the available covariates. - */ - @Argument(fullName = "covariate", shortName = "cov", doc = "One or more covariates to be used in the recalibration. 
Can be specified multiple times", required = false) - public String[] COVARIATES = null; - - /* - * The Cycle and Context covariates are standard and are included by default unless this argument is provided. - * Note that the ReadGroup and QualityScore covariates are required and cannot be excluded. - */ - @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false) - public boolean DO_NOT_USE_STANDARD_COVARIATES = false; - - /** - * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option. - */ - @Advanced - @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") - public boolean RUN_WITHOUT_DBSNP = false; - - /** - * BaseRecalibrator accepts a --solid_recal_mode flag which governs how the recalibrator handles the - * reads which have had the reference inserted because of color space inconsistencies. - */ - @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") - public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO; - - /** - * BaseRecalibrator accepts a --solid_nocall_strategy flag which governs how the recalibrator handles - * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in - * their color space tag can not be recalibrated. 
- */ - @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false) - public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; - - /** - * The context covariate will use a context of this size to calculate its covariate value for base mismatches. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size. - */ - @Argument(fullName = "mismatches_context_size", shortName = "mcs", doc = "Size of the k-mer context to be used for base mismatches", required = false) - public int MISMATCHES_CONTEXT_SIZE = 2; - - /** - * The context covariate will use a context of this size to calculate its covariate value for base insertions and deletions. Must be between 1 and 13 (inclusive). Note that higher values will increase runtime and required java heap size. - */ - @Argument(fullName = "indels_context_size", shortName = "ics", doc = "Size of the k-mer context to be used for base insertions and deletions", required = false) - public int INDELS_CONTEXT_SIZE = 3; - - /** - * The cycle covariate will generate an error if it encounters a cycle greater than this value. - * This argument is ignored if the Cycle covariate is not used. - */ - @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "The maximum cycle value permitted for the Cycle covariate", required = false) - public int MAXIMUM_CYCLE_VALUE = 500; - - /** - * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. 
[default is off] - */ - @Argument(fullName = "mismatches_default_quality", shortName = "mdq", doc = "default quality for the base mismatches covariate", required = false) - public byte MISMATCHES_DEFAULT_QUALITY = -1; - - /** - * A default base qualities to use as a prior (reported quality) in the insertion covariate model. This parameter is used for all reads without insertion quality scores for each base. [default is on] - */ - @Argument(fullName = "insertions_default_quality", shortName = "idq", doc = "default quality for the base insertions covariate", required = false) - public byte INSERTIONS_DEFAULT_QUALITY = 45; - - /** - * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off. [default is on] - */ - @Argument(fullName = "deletions_default_quality", shortName = "ddq", doc = "default quality for the base deletions covariate", required = false) - public byte DELETIONS_DEFAULT_QUALITY = 45; - - /** - * Reads with low quality bases on either tail (beginning or end) will not be considered in the context. This parameter defines the quality below which (inclusive) a tail is considered low quality - */ - @Argument(fullName = "low_quality_tail", shortName = "lqt", doc = "minimum quality for the bases in the tail of the reads to be considered", required = false) - public byte LOW_QUAL_TAIL = 2; - - /** - * BQSR generates a quantization table for quick quantization later by subsequent tools. BQSR does not quantize the base qualities, this is done by the engine with the -qq or -BQSR options. - * This parameter tells BQSR the number of levels of quantization to use to build the quantization table. 
- */ - @Argument(fullName = "quantizing_levels", shortName = "ql", required = false, doc = "number of distinct quality scores in the quantized output") - public int QUANTIZING_LEVELS = 16; - - /** - * The tag name for the binary tag covariate (if using it) - */ - @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") - public String BINARY_TAG_NAME = null; - - /* - * whether GATK report tables should have rows in sorted order, starting from leftmost column - */ - @Argument(fullName = "sort_by_all_columns", shortName = "sortAllCols", doc = "Sort the rows in the tables of reports", required = false) - public Boolean SORT_BY_ALL_COLUMNS = false; - - ///////////////////////////// - // Debugging-only Arguments - ///////////////////////////// - - @Hidden - @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") - public String DEFAULT_PLATFORM = null; - - @Hidden - @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") - public String FORCE_PLATFORM = null; - - @Hidden - @Argument(fullName = "force_readgroup", shortName = "fRG", required = false, doc = "If provided, the read group of EVERY read will be forced to be the provided String.") - public String FORCE_READGROUP = null; - - @Hidden - @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. 
For debugging/testing purposes only", defaultToStdout = false) - public PrintStream RECAL_TABLE_UPDATE_LOG = null; - - /** - * The repeat covariate will use a context of this size to calculate it's covariate value for base insertions and deletions - */ - @Hidden - @Argument(fullName = "max_str_unit_length", shortName = "maxstr", doc = "Max size of the k-mer context to be used for repeat covariates", required = false) - public int MAX_STR_UNIT_LENGTH = 8; - - @Hidden - @Argument(fullName = "max_repeat_length", shortName = "maxrep", doc = "Max number of repetitions to be used for repeat covariates", required = false) - public int MAX_REPEAT_LENGTH = 20; - - - public File existingRecalibrationReport = null; - - public GATKReportTable generateReportTable(final String covariateNames) { - GATKReportTable argumentsTable; - if(SORT_BY_ALL_COLUMNS) { - argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); - } else { - argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2); - } - argumentsTable.addColumn("Argument"); - argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); - argumentsTable.addRowID("covariate", true); - argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, covariateNames); - argumentsTable.addRowID("no_standard_covs", true); - argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES); - argumentsTable.addRowID("run_without_dbsnp", true); - argumentsTable.set("run_without_dbsnp", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP); - argumentsTable.addRowID("solid_recal_mode", true); - argumentsTable.set("solid_recal_mode", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE); - argumentsTable.addRowID("solid_nocall_strategy", true); - argumentsTable.set("solid_nocall_strategy", 
RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY); - argumentsTable.addRowID("mismatches_context_size", true); - argumentsTable.set("mismatches_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE); - argumentsTable.addRowID("indels_context_size", true); - argumentsTable.set("indels_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE); - argumentsTable.addRowID("mismatches_default_quality", true); - argumentsTable.set("mismatches_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY); - argumentsTable.addRowID("deletions_default_quality", true); - argumentsTable.set("deletions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DELETIONS_DEFAULT_QUALITY); - argumentsTable.addRowID("insertions_default_quality", true); - argumentsTable.set("insertions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY); - argumentsTable.addRowID("maximum_cycle_value", true); - argumentsTable.set("maximum_cycle_value", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MAXIMUM_CYCLE_VALUE); - argumentsTable.addRowID("low_quality_tail", true); - argumentsTable.set("low_quality_tail", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL); - argumentsTable.addRowID("default_platform", true); - argumentsTable.set("default_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM); - argumentsTable.addRowID("force_platform", true); - argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); - argumentsTable.addRowID("quantizing_levels", true); - argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); - argumentsTable.addRowID("recalibration_report", true); - argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? 
"null" : existingRecalibrationReport.getAbsolutePath()); - argumentsTable.addRowID("binary_tag_name", true); - argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME); - return argumentsTable; - } - - /** - * Returns a map with the arguments that differ between this an - * another {@link RecalibrationArgumentCollection} instance. - *

- * The key is the name of that argument in the report file. The value is a message - * that explains the difference to the end user. - *

- * Thus, a empty map indicates that there is no differences between both argument collection that - * is relevant to report comparison. - *

- * This method should not throw any exception. - * - * @param other the argument-collection to compare against. - * @param thisRole the name used to refer to this RAC report that makes sense to the end user. - * @param otherRole the name used to refer to the other RAC report that makes sense to the end user. - * - * @return never null, but a zero-size collection if there are no differences. - */ - @Requires("other != null && thisRole != null && otherRole != null && !thisRole.equalsIgnoreCase(otherRole)") - Map compareReportArguments(final RecalibrationArgumentCollection other,final String thisRole, final String otherRole) { - final Map result = new LinkedHashMap<>(15); - compareRequestedCovariates(result, other, thisRole, otherRole); - compareSimpleReportArgument(result,"no_standard_covs", DO_NOT_USE_STANDARD_COVARIATES, other.DO_NOT_USE_STANDARD_COVARIATES, thisRole, otherRole); - compareSimpleReportArgument(result,"run_without_dbsnp",RUN_WITHOUT_DBSNP,other.RUN_WITHOUT_DBSNP,thisRole,otherRole); - compareSimpleReportArgument(result,"solid_recal_mode", SOLID_RECAL_MODE, other.SOLID_RECAL_MODE,thisRole,otherRole); - compareSimpleReportArgument(result,"solid_nocall_strategy", SOLID_NOCALL_STRATEGY, other.SOLID_NOCALL_STRATEGY,thisRole,otherRole); - compareSimpleReportArgument(result,"mismatches_context_size", MISMATCHES_CONTEXT_SIZE,other.MISMATCHES_CONTEXT_SIZE,thisRole,otherRole); - compareSimpleReportArgument(result,"mismatches_default_quality", MISMATCHES_DEFAULT_QUALITY, other.MISMATCHES_DEFAULT_QUALITY,thisRole,otherRole); - compareSimpleReportArgument(result,"deletions_default_quality", DELETIONS_DEFAULT_QUALITY, other.DELETIONS_DEFAULT_QUALITY,thisRole,otherRole); - compareSimpleReportArgument(result,"insertions_default_quality", INSERTIONS_DEFAULT_QUALITY, other.INSERTIONS_DEFAULT_QUALITY,thisRole,otherRole); - compareSimpleReportArgument(result,"maximum_cycle_value", MAXIMUM_CYCLE_VALUE, other.MAXIMUM_CYCLE_VALUE,thisRole,otherRole); - 
compareSimpleReportArgument(result,"low_quality_tail", LOW_QUAL_TAIL, other.LOW_QUAL_TAIL,thisRole,otherRole); - compareSimpleReportArgument(result,"default_platform", DEFAULT_PLATFORM, other.DEFAULT_PLATFORM,thisRole,otherRole); - compareSimpleReportArgument(result,"force_platform", FORCE_PLATFORM, other.FORCE_PLATFORM,thisRole,otherRole); - compareSimpleReportArgument(result,"quantizing_levels", QUANTIZING_LEVELS, other.QUANTIZING_LEVELS,thisRole,otherRole); - compareSimpleReportArgument(result,"binary_tag_name", BINARY_TAG_NAME, other.BINARY_TAG_NAME,thisRole,otherRole); - return result; - } - - - /** - * Compares the covariate report lists. - * - * @param diffs map where to annotate the difference. - * @param other the argument collection to compare against. - * @param thisRole the name for this argument collection that makes sense to the user. - * @param otherRole the name for the other argument collection that makes sense to the end user. - * - * @return true if a difference was found. - */ - @Requires("diffs != null && other != null && thisRole != null && otherRole != null") - private boolean compareRequestedCovariates(final Map diffs, - final RecalibrationArgumentCollection other, final String thisRole, final String otherRole) { - - final Set beforeNames = new HashSet<>(this.COVARIATES.length); - final Set afterNames = new HashSet<>(other.COVARIATES.length); - Utils.addAll(beforeNames, this.COVARIATES); - Utils.addAll(afterNames,other.COVARIATES); - final Set intersect = new HashSet<>(Math.min(beforeNames.size(),afterNames.size())); - intersect.addAll(beforeNames); - intersect.retainAll(afterNames); - - String diffMessage = null; - if (intersect.size() == 0) { // In practice this is not possible due to required covariates but... - diffMessage = String.format("There are no common covariates between '%s' and '%s'" - + " recalibrator reports. Covariates in '%s': {%s}. 
Covariates in '%s': {%s}.",thisRole,otherRole, - thisRole,Utils.join(", ",this.COVARIATES), - otherRole,Utils.join(",",other.COVARIATES)); - } else if (intersect.size() != beforeNames.size() || intersect.size() != afterNames.size()) { - beforeNames.removeAll(intersect); - afterNames.removeAll(intersect); - diffMessage = String.format("There are differences in the set of covariates requested in the" - + " '%s' and '%s' recalibrator reports. " - + " Exclusive to '%s': {%s}. Exclusive to '%s': {%s}.",thisRole,otherRole, - thisRole,Utils.join(", ",beforeNames), - otherRole,Utils.join(", ",afterNames)); - } - if (diffMessage != null) { - diffs.put("covariate",diffMessage); - return true; - } else { - return false; - } - } - - /** - * Annotates a map with any difference encountered in a simple value report argument that differs between this an - * another {@link RecalibrationArgumentCollection} instance. - *

- * The key of the new entry would be the name of that argument in the report file. The value is a message - * that explains the difference to the end user. - *

- * - *

- * This method should not return any exception. - * - * @param diffs where to annotate the differences. - * @param name the name of the report argument to compare. - * @param thisValue this argument collection value for that argument. - * @param otherValue the other collection value for that argument. - * @param thisRole the name used to refer to this RAC report that makes sense to the end user. - * @param otherRole the name used to refer to the other RAC report that makes sense to the end user. - * - * @type T the argument Object value type. - * - * @return true if a difference has been spotted, thus diff has been modified. - */ - private boolean compareSimpleReportArgument(final Map diffs, - final String name, final T thisValue, final T otherValue, final String thisRole, final String otherRole) { - if (thisValue == null && otherValue == null) { - return false; - } else if (thisValue != null && thisValue.equals(otherValue)) { - return false; - } else { - diffs.put(name, - String.format("differences between '%s' {%s} and '%s' {%s}.", - thisRole,thisValue == null ? "" : thisValue, - otherRole,otherValue == null ? "" : otherValue)); - return true; - } - - } - - /** - * Create a shallow copy of this argument collection. - * - * @return never null. 
- */ - @Override - public RecalibrationArgumentCollection clone() { - try { - return (RecalibrationArgumentCollection) super.clone(); - } catch (CloneNotSupportedException e) { - throw new GATKException("Unreachable code clone not supported thrown when the class " - + this.getClass().getName() + " is cloneable ",e); - } - } - -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationEngine.java index 52a34aa54..aa20c9656 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/bqsr/RecalibrationEngine.java @@ -52,9 +52,13 @@ package org.broadinstitute.gatk.tools.walkers.bqsr; import com.google.java.contract.Requires; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; +import org.broadinstitute.gatk.engine.recalibration.RecalDatum; +import org.broadinstitute.gatk.engine.recalibration.RecalUtils; +import org.broadinstitute.gatk.engine.recalibration.RecalibrationTables; import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; import org.broadinstitute.gatk.utils.recalibration.*; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.io.PrintStream; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java index e4735505b..d5683504c 100644 --- 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/BaseCoverageDistribution.java @@ -54,10 +54,10 @@ package org.broadinstitute.gatk.tools.walkers.diagnostics; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.report.GATKReport; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.report.GATKReport; import org.broadinstitute.gatk.engine.walkers.LocusWalker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java index 18fe381a3..855d5446c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/FindCoveredIntervals.java @@ -54,9 +54,9 @@ package org.broadinstitute.gatk.tools.walkers.diagnostics; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import 
org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.ActiveRegionTraversalParameters; import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; import org.broadinstitute.gatk.engine.walkers.PartitionBy; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java index 9240c56e3..d2cb4439b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/DiagnoseTargets.java @@ -56,12 +56,11 @@ import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import 
org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.SampleUtils; import org.broadinstitute.gatk.utils.classloader.PluginManager; import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.gatk.utils.exceptions.UserException; @@ -70,6 +69,7 @@ import org.broadinstitute.gatk.utils.help.HelpConstants; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; +import org.broadinstitute.gatk.utils.sam.ReadUtils; import java.io.PrintStream; import java.util.*; @@ -154,7 +154,7 @@ public class DiagnoseTargets extends LocusWalker { intervalListIterator = new PeekableIterator(getToolkit().getIntervals().iterator()); // get all of the unique sample names for the VCF Header - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + samples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); vcfWriter.writeHeader(new VCFHeader(getHeaderInfo(), samples)); // pre load all the statistics classes because it is costly to operate on the JVM and we only want to do it once. 
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/IntervalStratification.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/IntervalStratification.java index 715ee5b05..4261eee4c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/IntervalStratification.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/diagnosetargets/IntervalStratification.java @@ -51,8 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.diagnostics.diagnosetargets; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java index ef83c71e3..d2bca0222 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/missing/QualifyMissingIntervals.java @@ -56,11 +56,11 @@ import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Gather; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import 
org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportGatherer; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportGatherer; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.GenomeLocSortedSet; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleLikelihoodMatrixMapper.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleLikelihoodMatrixMapper.java index 4993d5614..54ea7da0a 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleLikelihoodMatrixMapper.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleLikelihoodMatrixMapper.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.AlleleListPermutation; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ConsensusAlleleCounter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ConsensusAlleleCounter.java index 53523472a..96f432dc1 100644 --- 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ConsensusAlleleCounter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ConsensusAlleleCounter.java @@ -52,11 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.clipping.ReadClipper; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.broadinstitute.gatk.utils.collections.Pair; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ErrorModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ErrorModel.java index 60799caca..71587fe3e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ErrorModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/ErrorModel.java @@ -52,7 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import com.google.java.contract.Requires; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.MathUtils; diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java index 03b3e3374..8d07268d3 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java @@ -52,10 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.MathUtils; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java index 0964a74ab..3a65a3a9e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -51,7 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.ExactACset; import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.gatk.utils.haplotype.Haplotype; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index 693982b2f..c0e2ea95e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -52,10 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.gatk.utils.*; import 
org.broadinstitute.gatk.utils.haplotype.Haplotype; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java index ecb66cdf9..93b7524db 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java @@ -77,10 +77,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index c3208ce8b..b32f291f9 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -52,10 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingData.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingData.java index 1f73c7140..c73690a84 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingData.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingData.java @@ -52,7 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; /** * Encapsulates the data use to make the genotype calls. 
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index 60254fdc8..63caf8a14 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -58,11 +58,12 @@ import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.tools.walkers.genotyper.StandardCallerArgumentCollection; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculator; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculationResult; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java index 873318532..f06a40b73 100644 
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingLikelihoods.java @@ -53,6 +53,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import java.util.List; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingModel.java index f2413f122..4dfb8d312 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingModel.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; /** * Common interface for genotyping models. 
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModel.java index bc7691e46..dee370eec 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModel.java @@ -51,6 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; +import org.broadinstitute.gatk.utils.genotyper.SampleList; + /** * {@link PloidyModel} implementation tailored to work with a homogeneous constant ploidy * across samples and positions. diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 10ffeffd2..7ee4a9aca 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -52,10 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import 
org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.GenomeLoc; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModel.java index 320ccebe7..1589e8374 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModel.java @@ -53,6 +53,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeLikelihoods; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.AlleleListPermutation; +import org.broadinstitute.gatk.utils.genotyper.AlleleListUtils; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import java.util.ArrayList; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/PloidyModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/PloidyModel.java index e21494985..1ad1a2241 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/PloidyModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/PloidyModel.java @@ -51,6 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; +import org.broadinstitute.gatk.utils.genotyper.SampleList; + /** 
* Information about the number of chromosome per sample at a given location. * diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 39a37b642..66be00a08 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -52,10 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollection.java new file mode 100644 index 000000000..b2bd306fc --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollection.java @@ -0,0 +1,231 @@ +/* +* By downloading 
the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. 
Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.genotyper; + +import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; +import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorImplementation; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.collections.DefaultHashMap; +import htsjdk.variant.variantcontext.VariantContext; + +import java.io.File; +import java.lang.reflect.Field; +import java.lang.reflect.Method; +import java.lang.reflect.Modifier; +import java.util.Collections; +import java.util.Map; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 8/20/12 + * A collection of arguments that are common to the various callers. + * This is pulled out so that every caller isn't exposed to the arguments from every other caller. 
+ */ + +public class StandardCallerArgumentCollection implements Cloneable { + + @ArgumentCollection + public GenotypeCalculationArgumentCollection genotypeArgs = new GenotypeCalculationArgumentCollection(); + + @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false) + public GenotypingOutputMode genotypingOutputMode = GenotypingOutputMode.DISCOVERY; + + /** + * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding + */ + @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false) + public RodBinding alleles; + + /** + * If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads. + * Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we + * will try to remove (N * contamination fraction) bases for each alternate allele. + */ + @Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false) + public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION; + public static final double DEFAULT_CONTAMINATION_FRACTION = 0.0; + + /** + * This argument specifies a file with two columns "sample" and "contamination" specifying the contamination level for those samples. + * Samples that do not appear in this file will be processed with CONTAMINATION_FRACTION. 
+ **/ + @Advanced + @Argument(fullName = "contamination_fraction_per_sample_file", shortName = "contaminationFile", doc = "Tab-separated File containing fraction of contamination in sequencing data (per sample) to aggressively remove. Format should be \"\" (Contamination is double) per line; No header.", required = false) + public File CONTAMINATION_FRACTION_FILE = null; + + /** + * Indicates whether there is some sample contamination present. + */ + private boolean sampleContaminationWasLoaded = false; + + /** + * + * @return an _Immutable_ copy of the Sample-Contamination Map, defaulting to CONTAMINATION_FRACTION so that if the sample isn't in the map map(sample)==CONTAMINATION_FRACTION + */ + public Map getSampleContamination(){ + //make sure that the default value is set up right + sampleContamination.setDefaultValue(CONTAMINATION_FRACTION); + if (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) + sampleContaminationWasLoaded = true; + return Collections.unmodifiableMap(sampleContamination); + } + + public void setSampleContamination(DefaultHashMap sampleContamination) { + this.sampleContamination.clear(); + this.sampleContaminationWasLoaded = !Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0; + if (!sampleContaminationWasLoaded) + for (final Double d : sampleContamination.values()) + if (!Double.isNaN(d) && d > 0.0) { + sampleContaminationWasLoaded = true; + break; + } + this.sampleContamination.putAll(sampleContamination); + this.sampleContamination.setDefaultValue(CONTAMINATION_FRACTION); + } + + /** + * Returns true if there is some sample contamination present, false otherwise. 
+ * @return {@code true} iff there is some sample contamination + */ + public boolean isSampleContaminationPresent() { + return (!Double.isNaN(CONTAMINATION_FRACTION) && CONTAMINATION_FRACTION > 0.0) || sampleContaminationWasLoaded; + } + + //Needs to be here because it uses CONTAMINATION_FRACTION + private DefaultHashMap sampleContamination = new DefaultHashMap(CONTAMINATION_FRACTION); + + /** + * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. + */ + @Hidden + @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) + public AFCalculatorImplementation requestedAlleleFrequencyCalculationModel; + + @Hidden + @Argument(shortName = "logExactCalls", doc="x", required=false) + public File exactCallsLog = null; + + @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false) + public OutputMode outputMode = OutputMode.EMIT_VARIANTS_ONLY; + + /** + * Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites. + * This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any). + * WARNINGS: + * - This feature will inflate VCF file size considerably. + * - All SNP ALT alleles will be emitted with corresponding 10 PL values. + * - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used + */ + @Advanced + @Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false) + public boolean annotateAllSitesWithPLs = false; + + /** + * Creates a Standard caller argument collection with default values. 
+ */ + public StandardCallerArgumentCollection() { } + + /** + * "Casts" a caller argument collection into another type. + * + *

Common fields values are copied across

+ * @param clazz the class of the result. + * @param result argument collection class. + * @return never {@code null}. + */ + public T cloneTo(final Class clazz) { + // short cut: just use regular clone if it happens to be the same class. + if (clazz == getClass()) + return (T) clone(); + try { + final T result = clazz.newInstance(); + for (final Field field : getClass().getFields()) { + // just copy common fields. + if (!field.getDeclaringClass().isAssignableFrom(clazz)) + continue; + final int fieldModifiers = field.getModifiers(); + if ((fieldModifiers & UNCOPYABLE_MODIFIER_MASK) != 0) continue; + //Use the clone() method if appropriate + if (Cloneable.class.isAssignableFrom(field.getType())) { + Method clone = field.getType().getMethod("clone"); + field.set(result, clone.invoke(field.get(this))); + } else + field.set(result,field.get(this)); + } + return result; + } catch (final Exception ex) { + throw new IllegalStateException(ex); + } + } + + /** + * Creates a copy of this configuration. + * @return never {@code null}. + */ + @Override + public StandardCallerArgumentCollection clone() { + try { + StandardCallerArgumentCollection cloned = (StandardCallerArgumentCollection) super.clone(); + cloned.genotypeArgs = genotypeArgs.clone(); + return cloned; + } catch (CloneNotSupportedException e) { + throw new IllegalStateException("unreachable code"); + } + } + + /** + * Holds a modifiers mask that identifies those fields that cannot be copied between + * StandardCallerArgumentCollections. 
+ */ + private final int UNCOPYABLE_MODIFIER_MASK = Modifier.PRIVATE | Modifier.STATIC | Modifier.FINAL; +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedArgumentCollection.java index cc3ba9353..08e13da1b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedArgumentCollection.java @@ -52,7 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.gatk.tools.walkers.genotyper.StandardCallerArgumentCollection; import org.broadinstitute.gatk.utils.pairhmm.PairHMM; import htsjdk.variant.variantcontext.VariantContext; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java index 21f445f7e..2f23f4852 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyper.java @@ -58,25 +58,27 @@ import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import 
org.broadinstitute.gatk.engine.downsampling.AlleleBiasedDownsamplingUtils; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; import org.broadinstitute.gatk.engine.filters.BadMateFilter; import org.broadinstitute.gatk.engine.filters.MappingQualityUnavailableFilter; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.FixedAFCalculatorProvider; -import org.broadinstitute.gatk.utils.SampleUtils; import org.broadinstitute.gatk.utils.baq.BAQ; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; +import org.broadinstitute.gatk.utils.sam.ReadUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import java.io.PrintStream; @@ -267,7 +269,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif sampleNameSet = Collections.singleton(GenotypeLikelihoodsCalculationModel.DUMMY_SAMPLE_NAME); } else { // get all of the unique sample names - sampleNameSet = 
SampleUtils.getSAMFileSamples(toolkit.getSAMFileHeader()); + sampleNameSet = ReadUtils.getSAMFileSamples(toolkit.getSAMFileHeader()); if ( UAC.referenceSampleName != null ) sampleNameSet.remove(UAC.referenceSampleName); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java index 6b6b66062..1753bb294 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java @@ -56,10 +56,11 @@ import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculationResult; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.gatk.utils.BaseUtils; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorPerformanceTest.java 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorPerformanceTest.java index 69175a29c..e97b9ca79 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorPerformanceTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorPerformanceTest.java @@ -54,8 +54,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; import org.apache.log4j.ConsoleAppender; import org.apache.log4j.Logger; import org.apache.log4j.TTCCLayout; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.SimpleTimer; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java index 132a1b7cb..53bfbcbe6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProvider.java @@ -56,7 +56,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; +import 
org.broadinstitute.gatk.tools.walkers.genotyper.StandardCallerArgumentCollection; /** * A single fixed instance AF calculator provider. diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java index 31dab29e7..b70765402 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java @@ -52,7 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.SeqGraph; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java index 3b01d036e..91120f43d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngineInstance.java @@ -53,9 +53,9 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.variant.variantcontext.Allele; 
import org.apache.log4j.Logger; -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedAlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.MultiSampleEdge; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.Path; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.graphs.Route; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index f4622fa30..1b9b9e106 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -59,16 +59,18 @@ import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.downsampling.AlleleBiasedDownsamplingUtils; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingUtils; +import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; 
+import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingUtils; import org.broadinstitute.gatk.engine.filters.BadMateFilter; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.genotyper.*; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.tools.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; @@ -88,12 +90,9 @@ import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.fragments.FragmentCollection; import org.broadinstitute.gatk.utils.fragments.FragmentUtils; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.gga.GenotypingGivenAllelesUtils; import org.broadinstitute.gatk.utils.gvcf.GVCFWriter; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.haplotype.LDMerger; -import org.broadinstitute.gatk.utils.haplotype.MergeVariantsAcrossHaplotypes; import org.broadinstitute.gatk.utils.haplotypeBAMWriter.HaplotypeBAMWriter; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; @@ -1167,7 +1166,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In } catch ( final Exception e ) { // Capture any 
exception that might be thrown, and write out the assembly failure BAM if requested if ( captureAssemblyFailureBAM ) { - final SAMFileWriter writer = ReadUtils.createSAMFileWriter("assemblyFailure.bam", getToolkit()); + final SAMFileWriter writer = SAMFileWriterStub.createSAMFileWriter("assemblyFailure.bam", getToolkit()); for ( final GATKSAMRecord read : activeRegion.getReads() ) { writer.addAlignment(read); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java index c5d0073f2..c68245e7f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java @@ -51,7 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.gatk.tools.walkers.genotyper.StandardCallerArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Advanced; import org.broadinstitute.gatk.utils.commandline.Argument; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java index 5481cfdeb..2f9dfa288 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java @@ -54,7 +54,10 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import htsjdk.variant.variantcontext.*; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.genotyper.*; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.AFCalculatorProvider; import org.broadinstitute.gatk.utils.GenomeLoc; @@ -64,7 +67,6 @@ import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.EventMap; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.haplotype.MergeVariantsAcrossHaplotypes; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculator.java new file mode 100644 index 000000000..cf202bf5f --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculator.java @@ -0,0 +1,204 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made 
between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import com.google.java.contract.Requires; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.AlleleListUtils; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; + +import java.util.*; + +/** + * Computes the likelihood based probability that haplotypes for first and second variant contexts + * only appear in their fully linked form (x11 and x22) given a set of haplotypes where they might occur + * and read likelihoods per sample + * + * User: depristo + * Date: 3/29/13 + * Time: 9:23 AM + */ +public class HaplotypeLDCalculator { + private final List haplotypes; + private final ReadLikelihoods readLikelihoods; + private List> haplotypeLikelihoodsPerSample = null; + + // linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] + private final double[] table = new double[4]; + + /** + * For testing + */ + @SuppressWarnings("unchecked") + protected HaplotypeLDCalculator() { + haplotypes = Collections.emptyList(); + final AlleleList alleleList = AlleleListUtils.emptyList(); + readLikelihoods = new ReadLikelihoods<>(SampleListUtils.emptyList(), + alleleList, Collections.EMPTY_MAP); + } + + public HaplotypeLDCalculator(final List haplotypes, final ReadLikelihoods haplotypeReadMap) { + this.haplotypes = haplotypes; + this.readLikelihoods = haplotypeReadMap; + } + + /** + * Construct the cached list of summed haplotype likelihoods per sample if it + * hasn't already been computed. 
This data structure is lazy created but only + * needs to be made once when we make 1 merge decision as the data doesn't change + * no matter how many calls to computeProbOfBeingPhased + */ + private void buildHaplotypeLikelihoodsPerSampleIfNecessary() { + if ( haplotypeLikelihoodsPerSample == null ) { + // do the lazy computation + final Set samples = new LinkedHashSet<>(readLikelihoods.samples()); + haplotypeLikelihoodsPerSample = new LinkedList<>(); + for( final String sample : samples ) { + final Map map = new HashMap<>(haplotypes.size()); + for( final Haplotype h : haplotypes ) { + // count up the co-occurrences of the events for the R^2 calculation + final double haplotypeLikelihood = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, readLikelihoods, Collections.singletonList(h), false)[0][0]; + map.put(h, haplotypeLikelihood); + } + haplotypeLikelihoodsPerSample.add(map); + } + } + } + + /** + * Compute the likelihood based probability that that haplotypes for first and second are only x11 and x22 + * + * As opposed to the hypothesis that all four haplotypes (x11, x12, x21, and x22) exist in the population + * + * @param first a non-null VariantContext + * @param second a non-null VariantContext + * @return the probability that only x11 and x22 exist among the samples + */ + protected double computeProbOfBeingPhased(final VariantContext first, final VariantContext second) { + buildHaplotypeLikelihoodsPerSampleIfNecessary(); + + Arrays.fill(table, Double.NEGATIVE_INFINITY); + + for ( final Map entry : haplotypeLikelihoodsPerSample ) { + for ( final Map.Entry haplotypeLikelihood : entry.entrySet() ) { + final Haplotype h = haplotypeLikelihood.getKey(); + // count up the co-occurrences of the events for the R^2 calculation + final VariantContext thisHapVC = h.getEventMap().get(first.getStart()); + final VariantContext nextHapVC = h.getEventMap().get(second.getStart()); // TODO -- add function to take a VC + final int i = thisHapVC == 
null ? 0 : 1; + final int j = nextHapVC == null ? 0 : 1; + final int index = 2 * i + j; + table[index] = MathUtils.approximateLog10SumLog10(table[index], haplotypeLikelihood.getValue()); + } + } + + return pPhased(table); + } + + /** + * Compute probability that two variants are in phase with each other and that no + * compound hets exist in the population. + * + * Implemented as a likelihood ratio test of the hypothesis: + * + * x11 and x22 are the only haplotypes in the populations + * + * vs. + * + * all four haplotype combinations (x11, x12, x21, and x22) all exist in the population. + * + * Now, since we have to have both variants in the population, we exclude the x11 & x11 state. So the + * p of having just x11 and x22 is P(x11 & x22) + p(x22 & x22). + * + * Alternatively, we might have any configuration that gives us both 1 and 2 alts, which are: + * + * - P(x11 & x12 & x21) -- we have hom-ref and both hets + * - P(x22 & x12 & x21) -- we have hom-alt and both hets + * - P(x22 & x12) -- one haplotype is 22 and the other is het 12 + * - P(x22 & x21) -- one haplotype is 22 and the other is het 21 + * + * The probability is just p11_22 / (p11_22 + p hets) + * + * @param table linear contingency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] + * doesn't have to be normalized as this function does the normalization internally + * @return the real space probability that the data is phased + */ + @Requires("table.length == 4") + protected double pPhased( double[] table ) { + final double[] normTable = MathUtils.normalizeFromLog10(table, true); + + final double x11 = normTable[0], x12 = normTable[1], x21 = normTable[2], x22 = normTable[3]; + + // probability that we are only x11 && x22 + final double p11_22 = MathUtils.approximateLog10SumLog10(x11 + x22, x22 + x22); + + // probability of having any of the other pairs + final double p11_12_21 = MathUtils.approximateLog10SumLog10(x11 + x12, x11 + x21, x12 + x21); + final double 
p22_12_21 = MathUtils.approximateLog10SumLog10(x22 + x12, x22 + x21, x12 + x21); + final double p22_12 = x22 + x12; + final double p22_21 = x22 + x21; + final double pOthers = MathUtils.approximateLog10SumLog10(new double[]{p11_12_21, p22_12_21, p22_12, p22_21}); + + // probability of being phased is the ratio of p11_22 / pOthers which in log space is just a subtraction + final double log10phased = p11_22 - (MathUtils.approximateLog10SumLog10(p11_22, pOthers)); + + return Math.pow(10.0, log10phased); + } + + protected double pPhasedTest( final double x11, final double x12, final double x21, final double x22 ) { + return pPhased(new double[]{x11, x12, x21, x22}); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java index db80ac196..3dae25427 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeResolver.java @@ -56,9 +56,9 @@ import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import 
org.broadinstitute.gatk.engine.walkers.Window; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMerger.java new file mode 100644 index 000000000..7d17e7502 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMerger.java @@ -0,0 +1,314 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.GenomeLoc; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; + +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.TreeSet; + +/** + * Merges VariantContexts in a series of haplotypes according to their pairwise LD + * + * User: depristo + * Date: 3/28/13 + * Time: 6:17 PM + */ +public class LDMerger extends MergeVariantsAcrossHaplotypes { + private final static Logger logger = Logger.getLogger(LDMerger.class); + + private final boolean DEBUG; + private final int minSamplesToMergeSNPs; + private final int minSamplesToMergeOtherEvents; + + public LDMerger(boolean DEBUG, int minSamplesToMergeSNPs, int minSamplesToMergeOtherEvents) { + super(); + this.DEBUG = DEBUG; + this.minSamplesToMergeSNPs = minSamplesToMergeSNPs; + this.minSamplesToMergeOtherEvents = minSamplesToMergeOtherEvents; + } + + protected LDMerger() { + 
this(false, 1, 1); + } + + // TODO -- should be class arguments and static variables in HC + protected final static int MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE = 6; + protected final static int MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE = 25; + + /** + * We require 99% confidence that only the phased haplotypes exist in the population to merge the records + */ + protected final static double MERGE_EVENTS_PROB_PHASED_THRESHOLD = 0.99; + + /** + * Merge as many events among the haplotypes as possible based on pairwise LD among variants + * + * @param haplotypes a list of haplotypes whose events we want to merge + * @param readLikelihoods map from sample name -> read likelihoods for each haplotype + * @param startPosKeySet a set of starting positions of all events among the haplotypes + * @param ref the reference bases + * @param refLoc the span of the reference bases + */ + @Override + public boolean merge( final List haplotypes, + final ReadLikelihoods readLikelihoods, + final TreeSet startPosKeySet, + final byte[] ref, + final GenomeLoc refLoc ) { + if ( haplotypes == null ) throw new IllegalArgumentException("haplotypes cannot be null"); + if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be null"); + if ( startPosKeySet == null ) throw new IllegalArgumentException("startPosKeySet cannot be null"); + if ( ref == null ) throw new IllegalArgumentException("ref cannot be null"); + if ( refLoc == null ) throw new IllegalArgumentException("refLoc cannot be null"); + if ( refLoc.size() != ref.length ) throw new IllegalArgumentException("refLoc size " + refLoc.size() + " != ref.length " + ref.length + " at " + refLoc); + + if( startPosKeySet.size() <= 1 ) { return false; } + + final int nSamples = readLikelihoods.sampleCount(); + final HaplotypeLDCalculator r2Calculator = new HaplotypeLDCalculator(haplotypes, readLikelihoods); + boolean somethingWasMerged = false; + boolean mapWasUpdated = true; + while( mapWasUpdated ) { + 
mapWasUpdated = mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calculator, nSamples, startPosKeySet, ref, refLoc); + somethingWasMerged |= mapWasUpdated; + } + return somethingWasMerged; + } + + /** + * Merge the next pair of events, if possible + * + * @param haplotypes a list of haplotypes whose events we want to merge + * @param ldCalculator calculates R^2 for pairs of events on demand + * @param startPosKeySet a set of starting positions of all events among the haplotypes + * @param ref the reference bases + * @param refLoc the span of the reference bases + * @return true if something was merged, false otherwise + */ + protected boolean mergeConsecutiveEventsBasedOnLDOnce( final List haplotypes, + final HaplotypeLDCalculator ldCalculator, + final int nSamples, + final TreeSet startPosKeySet, + final byte[] ref, + final GenomeLoc refLoc ) { + // loop over the set of start locations and consider pairs that start near each other + final Iterator iter = startPosKeySet.iterator(); + int thisStart = iter.next(); + while( iter.hasNext() ) { + final int nextStart = iter.next(); + final LDMergeData toMerge = getPairOfEventsToMerge(haplotypes, thisStart, nextStart); + + if ( toMerge.canBeMerged(nSamples) ) { + final double pPhased = ldCalculator.computeProbOfBeingPhased(toMerge.firstVC, toMerge.secondVC); + + if( DEBUG ) { + logger.info("Found consecutive biallelic events with R^2 = " + String.format("%.4f", pPhased)); + logger.info("-- " + toMerge.firstVC); + logger.info("-- " + toMerge.secondVC); + } + + if( pPhased > MERGE_EVENTS_PROB_PHASED_THRESHOLD) { + final VariantContext mergedVC = createMergedVariantContext(toMerge.firstVC, toMerge.secondVC, ref, refLoc); + // if for some reason the merging resulting in a bad allele, mergedVC will be null, and we will just remove first and second + replaceVariantContextsInMap(haplotypes, startPosKeySet, mergedVC, toMerge.firstVC, toMerge.secondVC); + return true; // break out of tree set iteration since it was just updated, 
start over from the beginning and keep merging events + } + } + + thisStart = nextStart; + } + + return false; + } + + /** + * Info about potential LD merge of two variant contexts + */ + private class LDMergeData { + VariantContext firstVC = null, secondVC = null; + boolean canBeMerged = true; + + /** Tell this object that it cant be merged for some reason */ + public LDMergeData cantBeMerged() { + canBeMerged = false; + return this; + } + + /** + * Can these two events be merged + * @param nSamples the number of samples we're considering + * @return true if we can merge our two variant contexts + */ + public boolean canBeMerged(final int nSamples) { + if ( ! canBeMerged || firstVC == null || secondVC == null ) + return false; + + final int distance = secondVC.getStart() - firstVC.getEnd(); + if ( firstVC.isSNP() && secondVC.isSNP() ) { + return nSamples >= minSamplesToMergeSNPs && distance <= MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE; + } else { + return nSamples >= minSamplesToMergeOtherEvents && distance <= MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE; + } + } + } + + /** + * Get the information about the potential merge of two events starting at thisStart and nextStart + * @param haplotypes our haplotypes + * @param thisStart the starting position of the first event to merge + * @param nextStart the starting position of the next event to merge + * @return never {@code null}. 
+ */ + private LDMergeData getPairOfEventsToMerge(final List haplotypes, final int thisStart, final int nextStart) { + final LDMergeData mergeData = new LDMergeData(); + + for( final Haplotype h : haplotypes ) { + // only make complex substitutions out of consecutive biallelic sites + final VariantContext thisHapVC = h.getEventMap().get(thisStart); + if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype + if( mergeData.firstVC == null ) { + mergeData.firstVC = thisHapVC; + } else if( !thisHapVC.hasSameAllelesAs( mergeData.firstVC) ) { + return mergeData.cantBeMerged(); + } + } + final VariantContext nextHapVC = h.getEventMap().get(nextStart); + if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype + if( mergeData.secondVC == null ) { + mergeData.secondVC = nextHapVC; + } else if( !nextHapVC.hasSameAllelesAs( mergeData.secondVC) ) { + return mergeData.cantBeMerged(); + } + } + } + + // don't try to merge overlapping events + if ( mergeData.firstVC != null && mergeData.secondVC != null && mergeData.firstVC.getEnd() >= mergeData.secondVC.getStart() ) + return mergeData.cantBeMerged(); + + return mergeData; + } + + // BUGBUG: make this merge function more general + protected VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) { + final int thisStart = thisVC.getStart(); + final int nextStart = nextVC.getStart(); + byte[] refBases = new byte[]{}; + byte[] altBases = new byte[]{}; + refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases()); + altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases()); + int locus; + for( locus = thisStart + refBases.length; locus < nextStart; locus++ ) { + final byte refByte = ref[locus - refLoc.getStart()]; + refBases = ArrayUtils.add(refBases, refByte); + altBases = ArrayUtils.add(altBases, 
refByte); + } + refBases = ArrayUtils.addAll(refBases, ArrayUtils.subarray(nextVC.getReference().getBases(), locus > nextStart ? 1 : 0, nextVC.getReference().getBases().length)); // special case of deletion including the padding base of consecutive indel + altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases()); + + int iii = 0; + if( refBases.length == altBases.length ) { // insertion + deletion of same length creates an MNP --> trim common prefix bases off the beginning of the allele + while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; } + if ( iii == refBases.length ) { + // we've become a null allele, such as with CA/C + A/AA -> CA/CA => after trimming there's nothing left + // so return a null variant context so we can eliminate the variants from consideration + return null; + } + } + + + final Allele refAllele = Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true ); + final Allele altAllele = Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false ); + return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), Arrays.asList(refAllele, altAllele)).make(); + } + + /** + * Update the event maps in all haplotypes to replace a replacement of update1 and 2 with replacement + * + * @param haplotypes the haplotypes whose event maps we need to update + * @param startPosKeySet a sorted set of start positions that we must update + * @param replacement a VariantContext to replace update1 and update2 with. 
Can be null, indicating that we just want to remove update1 and update2 + * @param update1 the first VC we want to update + * @param update2 the second VC we want to update + */ + private void replaceVariantContextsInMap(final List haplotypes, + final TreeSet startPosKeySet, + final VariantContext replacement, + final VariantContext update1, final VariantContext update2) { + // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event + for( final Haplotype h : haplotypes ) { + // if we had both events, add replacement. In some cases the haplotype may not have both + // events but they were still merged because the haplotype isn't a particularly informative + // haplotype in any case. The order of operations here is important because we are modifying the map + final boolean shouldAdd = h.getEventMap().containsKey(update1.getStart()) && h.getEventMap().containsKey(update2.getStart()); + h.getEventMap().remove(update1.getStart()); + h.getEventMap().remove(update2.getStart()); + if ( shouldAdd && replacement != null ) { + h.getEventMap().addVC(replacement, false); // cannot merge we other events at the same position + } + } + + startPosKeySet.remove(update1.getStart()); + startPosKeySet.remove(update2.getStart()); + if ( replacement != null ) startPosKeySet.add(replacement.getStart()); + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/MergeVariantsAcrossHaplotypes.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/MergeVariantsAcrossHaplotypes.java new file mode 100644 index 000000000..7e42a8742 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/MergeVariantsAcrossHaplotypes.java @@ -0,0 +1,84 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR 
ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. 
Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; + +import java.util.List; +import java.util.TreeSet; + +/** + * Baseclass for code that wants to merge variants together in the haplotype caller + * + * This root class is basically a no-op, and can be used to not do any merging + */ +public class MergeVariantsAcrossHaplotypes { + /** + * Merge variants across the haplotypes, updating the haplotype event maps and startPos set as appropriate + * + * @param haplotypes a list of haplotypes whose events we want to merge + * @param readLikelihoods map from sample name -> read likelihoods for each haplotype + * @param startPosKeySet a set of starting positions of all events among the haplotypes + * @param ref the reference bases + * @param refLoc the span of the reference bases + * @return true if anything was merged + */ + public boolean merge( final List haplotypes, + final ReadLikelihoods readLikelihoods, + final TreeSet startPosKeySet, + final byte[] ref, + final GenomeLoc refLoc ) { + return false; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java index 3c8b0c40d..1b602604d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -56,17 +56,17 @@ import com.google.java.contract.Requires; import htsjdk.samtools.SAMUtils; import htsjdk.variant.variantcontext.Allele; import 
org.apache.log4j.Logger; -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedAlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pairhmm.*; -import org.broadinstitute.gatk.utils.recalibration.covariates.RepeatCovariate; -import org.broadinstitute.gatk.utils.recalibration.covariates.RepeatLengthCovariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatCovariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatLengthCovariate; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.io.File; @@ -78,8 +78,6 @@ import java.util.*; public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalculationEngine { private final static Logger logger = Logger.getLogger(PairHMMLikelihoodCalculationEngine.class); - public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual - private final byte constantGCP; private final double log10globalReadMismappingRate; @@ -189,7 +187,7 @@ public class PairHMMLikelihoodCalculationEngine implements ReadLikelihoodCalcula private void capMinimumReadQualities(GATKSAMRecord read, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals) { for( int kkk = 0; kkk < readQuals.length; kkk++ ) { readQuals[kkk] = (byte) Math.min( 0xff & readQuals[kkk], read.getMappingQuality()); // cap base quality 
by mapping quality, as in UG - readQuals[kkk] = ( readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); + readQuals[kkk] = ( readQuals[kkk] < PairHMM.BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] ); readInsQuals[kkk] = ( readInsQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readInsQuals[kkk] ); readDelQuals[kkk] = ( readDelQuals[kkk] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : readDelQuals[kkk] ); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java index 1f75402b0..7a134a380 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java @@ -52,10 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedAlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; @@ -77,7 +77,7 @@ public class 
RandomLikelihoodCalculationEngine implements ReadLikelihoodCalculat final AlleleList haplotypes = new IndexedAlleleList<>(assemblyResultSet.getHaplotypeList()); final ReadLikelihoods result = new ReadLikelihoods(samples, haplotypes, reads); final Map alleles = new HashMap<>(haplotypes.alleleCount()); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final int sampleCount = samples.sampleCount(); final int alleleCount = haplotypes.alleleCount(); for (int i = 0; i < sampleCount; i++) { diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java index 14fc080b5..6dbcd161e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadLikelihoodCalculationEngine.java @@ -51,7 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java index 0599ee880..5320cb52e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -55,14 +55,14 @@ import htsjdk.samtools.*; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFSimpleHeaderLine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.tools.walkers.genotyper.*; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.genotyper.*; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; import org.broadinstitute.gatk.utils.pileup.PileupElement; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinderNode.java similarity index 100% rename from protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java rename to protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinderNode.java diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java index 0652a767c..9806a92e9 100644 --- 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/IndelRealigner.java @@ -58,11 +58,10 @@ import htsjdk.samtools.util.StringUtil; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.BAQMode; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import org.broadinstitute.gatk.utils.BaseUtils; @@ -80,7 +79,7 @@ import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.NWaySAMFileWriter; +import org.broadinstitute.gatk.engine.io.NWaySAMFileWriter; import org.broadinstitute.gatk.utils.sam.ReadUtils; import org.broadinstitute.gatk.utils.text.TextFormattingUtils; import org.broadinstitute.gatk.utils.text.XReadLines; @@ -403,7 +402,7 @@ public class IndelRealigner extends ReadWalker { throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex); } - intervals = intervalsFile.getIntervals(getToolkit()).iterator(); + intervals = intervalsFile.getIntervals(getToolkit().getGenomeLocParser()).iterator(); currentInterval = intervals.hasNext() 
? intervals.next() : null; @@ -991,7 +990,7 @@ public class IndelRealigner extends ReadWalker { else { int readsSeen = 0; while ( readsSeen++ < MAX_READS_FOR_CONSENSUSES && altConsensesToPopulate.size() <= MAX_CONSENSUSES) { - int index = GenomeAnalysisEngine.getRandomGenerator().nextInt(altAlignmentsToTest.size()); + int index = Utils.getRandomGenerator().nextInt(altAlignmentsToTest.size()); AlignedRead aRead = altAlignmentsToTest.remove(index); if ( CHECKEARLY ) createAndAddAlternateConsensus1(aRead, altConsensesToPopulate, reference,leftmostIndex); else createAndAddAlternateConsensus(aRead.getReadBases(), altConsensesToPopulate, reference); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java index ba1ad9db2..e6fd8d13b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/LeftAlignIndels.java @@ -55,9 +55,9 @@ import htsjdk.samtools.Cigar; import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java index 27abc48ad..ba9c985db 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/PairHMMIndelErrorModel.java @@ -53,10 +53,10 @@ package org.broadinstitute.gatk.tools.walkers.indels; import com.google.java.contract.Ensures; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedAlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.clipping.ReadClipper; import org.broadinstitute.gatk.utils.exceptions.UserException; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java index 69ef455d6..7384a70a9 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/indels/RealignerTargetCreator.java @@ -57,11 +57,11 @@ import 
org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.filters.*; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java index f43d8377c..c01bb9cce 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/PhaseByTransmission.java @@ -56,15 +56,15 @@ import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import 
org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.samples.Sample; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java index 0fe20e07d..e8ccaf842 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/phasing/ReadBackedPhasing.java @@ -58,17 +58,17 @@ import org.broadinstitute.gatk.utils.commandline.Hidden; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.filters.MappingQualityZeroFilter; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import 
org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.HasGenomeLocation; -import org.broadinstitute.gatk.utils.SampleUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; @@ -82,7 +82,7 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; import java.io.*; import java.util.*; -import static org.broadinstitute.gatk.utils.variant.GATKVCFUtils.getVCFHeadersFromRods; +import static org.broadinstitute.gatk.engine.GATKVCFUtils.getVCFHeadersFromRods; /** * Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads). @@ -258,7 +258,7 @@ public class ReadBackedPhasing extends RodWalker vcfSamples = new TreeSet(samplesToPhase == null ? rodNameToHeader.get(trackName).getGenotypeSamples() : samplesToPhase); writer.writeHeader(new VCFHeader(hInfo, vcfSamples)); - Set readSamples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); + Set readSamples = ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); readSamples.retainAll(vcfSamples); if (readSamples.isEmpty()) { String noPhaseString = "No common samples in VCF and BAM headers" + (samplesToPhase == null ? 
"" : " (limited to sampleToPhase parameters)") + ", so nothing could possibly be phased!"; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java index 4236a1044..2e549263e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java @@ -52,21 +52,21 @@ package org.broadinstitute.gatk.tools.walkers.rnaseq; import htsjdk.samtools.*; +import org.broadinstitute.gatk.engine.io.NWaySAMFileWriter; import org.broadinstitute.gatk.utils.commandline.Advanced; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Hidden; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; import org.broadinstitute.gatk.engine.iterators.RNAReadTransformer; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.DataSource; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import org.broadinstitute.gatk.engine.walkers.Requires; -import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.clipping.ReadClipper; import org.broadinstitute.gatk.utils.exceptions.UserException; import 
org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; @@ -156,7 +156,7 @@ public class SplitNCigarReads extends ReadWalker { // randomness related variables private static final long RANDOM_SEED = 1252863495; - private static final Random ran = GenomeAnalysisEngine.getRandomGenerator(); + private static final Random ran = Utils.getRandomGenerator(); private Poisson poissonRandom = null; // samples and read groups diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java index ac3dfcdd2..b1c25e44f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/GenotypeAndValidate.java @@ -57,13 +57,13 @@ import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.FixedAFCalculatorProvider; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.genotyper.*; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import 
org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.VariantContext; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java index 1f5001455..6e4de0860 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/KeepAFSpectrumFrequencySelector.java @@ -51,12 +51,12 @@ package org.broadinstitute.gatk.tools.walkers.validation.validationsiteselector; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.MathUtils; import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextUtils; +import org.broadinstitute.gatk.utils.Utils; import java.util.ArrayList; import java.util.Collections; @@ -162,13 +162,13 @@ public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector { // deal with rounding artifacts while (totalSites > numValidationSites) { // take off one from randomly selected bin - int k= GenomeAnalysisEngine.getRandomGenerator().nextInt(NUM_BINS); + int k= Utils.getRandomGenerator().nextInt(NUM_BINS); sitesToChoosePerBin[k]--; totalSites--; } while (totalSites < numValidationSites) { // take off one from randomly selected bin - int k= GenomeAnalysisEngine.getRandomGenerator().nextInt( NUM_BINS); + int k= Utils.getRandomGenerator().nextInt( NUM_BINS); sitesToChoosePerBin[k]++; 
totalSites++; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java index 744dd0623..e31188beb 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/validation/validationsiteselector/ValidationSiteSelector.java @@ -53,17 +53,17 @@ package org.broadinstitute.gatk.tools.walkers.validation.validationsiteselector; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.writer.VariantContextWriter; diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java index 2c037bee1..8ade061c8 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/ApplyRecalibration.java @@ -53,16 +53,16 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.PartitionBy; import org.broadinstitute.gatk.engine.walkers.PartitionType; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/GaussianMixtureModel.java 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/GaussianMixtureModel.java index 08a5865d7..1eb555f2c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/GaussianMixtureModel.java @@ -53,8 +53,8 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; import Jama.Matrix; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.Utils; import java.util.ArrayList; import java.util.Arrays; @@ -101,7 +101,7 @@ public class GaussianMixtureModel { // initialize random Gaussian means // BUGBUG: this is broken up this way to match the order of calls to rand.nextDouble() in the old code for( final MultivariateGaussian gaussian : gaussians ) { - gaussian.initializeRandomMu( GenomeAnalysisEngine.getRandomGenerator() ); + gaussian.initializeRandomMu( Utils.getRandomGenerator() ); } // initialize means using K-means algorithm @@ -112,7 +112,7 @@ public class GaussianMixtureModel { for( final MultivariateGaussian gaussian : gaussians ) { gaussian.pMixtureLog10 = Math.log10( 1.0 / ((double) gaussians.size()) ); gaussian.sumProb = 1.0 / ((double) gaussians.size()); - gaussian.initializeRandomSigma( GenomeAnalysisEngine.getRandomGenerator() ); + gaussian.initializeRandomSigma( Utils.getRandomGenerator() ); gaussian.hyperParameter_a = priorCounts; gaussian.hyperParameter_b = shrinkage; gaussian.hyperParameter_lambda = dirichletParameter; @@ -152,7 +152,7 @@ public class GaussianMixtureModel { if( numAssigned != 0 ) { gaussian.divideEqualsMu( ((double) numAssigned) ); } else { - gaussian.initializeRandomMu( GenomeAnalysisEngine.getRandomGenerator() ); + gaussian.initializeRandomMu( 
Utils.getRandomGenerator() ); } } } @@ -279,7 +279,7 @@ public class GaussianMixtureModel { // if it is missing marginalize over the missing dimension by drawing X random values for the missing annotation and averaging the lod if( datum.isNull[iii] ) { for( int ttt = 0; ttt < numIterPerMissingAnnotation; ttt++ ) { - datum.annotations[iii] = GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution + datum.annotations[iii] = Utils.getRandomGenerator().nextGaussian(); // draw a random sample from the standard normal distribution // evaluate this random data point int gaussianIndex = 0; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java index 023d64f7f..2be4ec01d 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantDataManager.java @@ -53,8 +53,8 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.MathUtils; import htsjdk.variant.vcf.VCFConstants; @@ -116,7 +116,7 @@ public class VariantDataManager { varianceVector[iii] = theSTD; for( final VariantDatum datum : data ) { // Transform each data point via: (x - mean) / standard deviation - datum.annotations[iii] = ( datum.isNull[iii] 
? 0.1 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); + datum.annotations[iii] = ( datum.isNull[iii] ? 0.1 * Utils.getRandomGenerator().nextGaussian() : ( datum.annotations[iii] - theMean ) / theSTD ); } } if( foundZeroVarianceAnnotation ) { @@ -251,7 +251,7 @@ public class VariantDataManager { logger.warn( "WARNING: Training with very few variant sites! Please check the model reporting PDF to ensure the quality of the model is reliable." ); } else if( trainingData.size() > VRAC.MAX_NUM_TRAINING_DATA ) { logger.warn( "WARNING: Very large training set detected. Downsampling to " + VRAC.MAX_NUM_TRAINING_DATA + " training variants." ); - Collections.shuffle(trainingData, GenomeAnalysisEngine.getRandomGenerator()); + Collections.shuffle(trainingData, Utils.getRandomGenerator()); return trainingData.subList(0, VRAC.MAX_NUM_TRAINING_DATA); } return trainingData; @@ -299,13 +299,13 @@ public class VariantDataManager { public List getRandomDataForPlotting( final int numToAdd, final List trainingData, final List antiTrainingData, final List evaluationData ) { final List returnData = new ExpandingArrayList<>(); - Collections.shuffle(trainingData, GenomeAnalysisEngine.getRandomGenerator()); - Collections.shuffle(antiTrainingData, GenomeAnalysisEngine.getRandomGenerator()); - Collections.shuffle(evaluationData, GenomeAnalysisEngine.getRandomGenerator()); + Collections.shuffle(trainingData, Utils.getRandomGenerator()); + Collections.shuffle(antiTrainingData, Utils.getRandomGenerator()); + Collections.shuffle(evaluationData, Utils.getRandomGenerator()); returnData.addAll(trainingData.subList(0, Math.min(numToAdd, trainingData.size()))); returnData.addAll(antiTrainingData.subList(0, Math.min(numToAdd, antiTrainingData.size()))); returnData.addAll(evaluationData.subList(0, Math.min(numToAdd, evaluationData.size()))); - Collections.shuffle(returnData, GenomeAnalysisEngine.getRandomGenerator()); + 
Collections.shuffle(returnData, Utils.getRandomGenerator()); return returnData; } @@ -349,10 +349,10 @@ public class VariantDataManager { try { value = vc.getAttributeAsDouble( annotationKey, Double.NaN ); if( Double.isInfinite(value) ) { value = Double.NaN; } - if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } - if( jitter && annotationKey.equalsIgnoreCase("SOR") && MathUtils.compareDoubles(value, LOG_OF_TWO, 0.01) == 0 ) { value += 0.01 * GenomeAnalysisEngine.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln + if( jitter && annotationKey.equalsIgnoreCase("HaplotypeScore") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("FS") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("InbreedingCoeff") && MathUtils.compareDoubles(value, 0.0, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } + if( jitter && annotationKey.equalsIgnoreCase("SOR") && MathUtils.compareDoubles(value, LOG_OF_TWO, 0.01) == 0 ) { value += 0.01 * Utils.getRandomGenerator().nextGaussian(); } //min SOR is 2.0, then we take ln } catch( Exception e ) { value = Double.NaN; // The VQSR works with missing data by marginalizing over the missing dimension when evaluating the Gaussian mixture model } diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java index 1cdcb5f5f..e5c7c248b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrator.java @@ -53,10 +53,9 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.PartitionBy; import org.broadinstitute.gatk.engine.walkers.PartitionType; import org.broadinstitute.gatk.engine.walkers.RodWalker; @@ -284,7 +283,7 @@ public class VariantRecalibrator extends RodWalker array ) { + return MathUtils.median(array); // right now we take the median but other options could be explored + } + + /** + * Merges VariantContexts from gVCFs into a single hybrid. + * Assumes that none of the input records are filtered. + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. 
don't start at the location in loc); if null, we'll return null in this case + * @param removeNonRefSymbolicAllele if true, remove the allele from the merged VC + * @return new VariantContext representing the merge of all VCs or null if it not relevant + */ + public static VariantContext merge(final List VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele) { + // this can happen if e.g. you are using a dbSNP file that spans a region with no gVCFs + if ( VCs == null || VCs.size() == 0 ) + return null; + + // establish the baseline info (sometimes from the first VC) + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + + // ref allele + final Allele refAllele = determineReferenceAlleleGivenReferenceBase(VCs, loc, refBase); + if ( refAllele == null ) + return null; + + // FinalAlleleSet contains the alleles of the new resulting VC + // Using linked set in order to guarantee a stable order + final LinkedHashSet finalAlleleSet = new LinkedHashSet<>(10); + // Reference goes first + finalAlleleSet.add(refAllele); + + final Map attributes = new LinkedHashMap<>(); + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id + int depth = 0; + final Map> annotationMap = new LinkedHashMap<>(); + final GenotypesContext genotypes = GenotypesContext.create(); + + final int variantContextCount = VCs.size(); + // In this list we hold the mapping of each variant context alleles. + final List>> vcAndNewAllelePairs = new ArrayList<>(variantContextCount); + // cycle through and add info from the other VCs + for ( final VariantContext vc : VCs ) { + + // if this context doesn't start at the current location then it must be a spanning event (deletion or ref block) + final boolean isSpanningEvent = loc.getStart() != vc.getStart(); + + vcAndNewAllelePairs.add(new Pair<>(vc,isSpanningEvent ? 
replaceWithNoCalls(vc.getAlleles()) + : remapAlleles(vc.getAlleles(), refAllele, finalAlleleSet))); + } + + // Add to the end if at all required in in the output. + if (!removeNonRefSymbolicAllele) finalAlleleSet.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + + final List allelesList = new ArrayList<>(finalAlleleSet); + + for ( final Pair> pair : vcAndNewAllelePairs ) { + final VariantContext vc = pair.getFirst(); + final List remappedAlleles = pair.getSecond(); + + mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList); + + // special case DP (add it up) for all events + if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) { + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + } else { // handle the gVCF case from the HaplotypeCaller + for( final Genotype gt : vc.getGenotypes() ) { + depth += (gt.hasExtendedAttribute("MIN_DP") ? Integer.parseInt((String)gt.getAnyAttribute("MIN_DP")) : (gt.hasDP() ? gt.getDP() : 0)); + } + } + + if ( loc.getStart() != vc.getStart() ) + continue; + + // special case ID (just preserve it) + if ( vc.hasID() ) rsIDs.add(vc.getID()); + + // add attributes + addReferenceConfidenceAttributes(vc.getAttributes(), annotationMap); + } + + // when combining annotations use the median value from all input VCs which had annotations provided + for ( final Map.Entry> p : annotationMap.entrySet() ) { + if ( ! p.getValue().isEmpty() ) { + attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); + } + } + + if ( depth > 0 ) + attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); + + // remove stale AC and AF based attributes + removeStaleAttributesAfterMerge(attributes); + + final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); + + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList) + .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart()) + .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to re-genotype later + + return builder.make(); + } + + /** + * Determines the ref allele given the provided reference base at this position + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning + * @return new Allele or null if no reference allele/base is available + */ + private static Allele determineReferenceAlleleGivenReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) { + final Allele refAllele = GATKVariantContextUtils.determineReferenceAllele(VCs, loc); + if ( refAllele == null ) + return ( refBase == null ? 
null : Allele.create(refBase, true) ); + return refAllele; + } + + /** + * Remove the stale attributes from the merged set + * + * @param attributes the attribute map + */ + private static void removeStaleAttributesAfterMerge(final Map attributes) { + attributes.remove(VCFConstants.ALLELE_COUNT_KEY); + attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); + attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); + attributes.remove(VCFConstants.MLE_ALLELE_COUNT_KEY); + attributes.remove(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + attributes.remove(VCFConstants.END_KEY); + } + + /** + * Adds attributes to the global map from the new context in a sophisticated manner + * + * @param myAttributes attributes to add from + * @param annotationMap map of annotations for combining later + */ + private static void addReferenceConfidenceAttributes(final Map myAttributes, + final Map> annotationMap) { + for ( final Map.Entry p : myAttributes.entrySet() ) { + final String key = p.getKey(); + final Object value = p.getValue(); + + // add the annotation values to a list for combining later + List values = annotationMap.get(key); + if( values == null ) { + values = new ArrayList<>(); + annotationMap.put(key, values); + } + try { + final String stringValue = value.toString(); + // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. + if (stringValue.contains(".")) + values.add(Double.parseDouble(stringValue)); + else + values.add(Integer.parseInt(stringValue)); + } catch (final NumberFormatException e) { + // nothing to do + } + } + } + + /** + * This method does a couple of things: + *
  • + * remaps the vc alleles considering the differences between the final reference allele and its own reference,
  • + *
  • + * collects alternative alleles present in variant context and add them to the {@code finalAlleles} set. + *
+ * + * @param vcAlleles the variant context allele list. + * @param refAllele final reference allele. + * @param finalAlleles where to add the final set of non-ref called alleles. + * @return never {@code null} + */ + //TODO as part of a larger refactoring effort {@link #remapAlleles} can be merged with {@link GATKVariantContextUtils#remapAlleles}. + private static List remapAlleles(final List vcAlleles, final Allele refAllele, final LinkedHashSet finalAlleles) { + final Allele vcRef = vcAlleles.get(0); + if (!vcRef.isReference()) throw new IllegalStateException("the first allele of the vc allele list must be reference"); + final byte[] refBases = refAllele.getBases(); + final int extraBaseCount = refBases.length - vcRef.getBases().length; + if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected"); + final List result = new ArrayList<>(vcAlleles.size()); + + for (final Allele a : vcAlleles) { + if (a.isReference()) { + result.add(refAllele); + } else if (a.isSymbolic()) { + result.add(a); + // we always skip when adding to finalAlleles this is done outside if applies. 
+ if (!a.equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)) + finalAlleles.add(a); + } else if (a.isCalled()) { + final Allele newAllele; + if (extraBaseCount > 0) { + final byte[] oldBases = a.getBases(); + final byte[] newBases = Arrays.copyOf(oldBases,oldBases.length + extraBaseCount); + System.arraycopy(refBases,refBases.length - extraBaseCount,newBases,oldBases.length,extraBaseCount); + newAllele = Allele.create(newBases,false); + } else + newAllele = a; + result.add(newAllele); + finalAlleles.add(newAllele); + } else { // NO_CALL and strange miscellanea + result.add(a); + } + } + return result; + } + + /** + * Replaces any alleles in the list with NO CALLS, except for the generic ALT allele + * + * @param alleles list of alleles to replace + * @return non-null list of alleles + */ + private static List replaceWithNoCalls(final List alleles) { + if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null"); + + final List result = new ArrayList<>(alleles.size()); + for ( final Allele allele : alleles ) + result.add(allele.equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ? allele : Allele.NO_CALL); + return result; + } + + /** + * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles. + * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set. 
+ * + * @param mergedGenotypes the genotypes context to add to + * @param VC the Variant Context for the sample + * @param remappedAlleles the list of remapped alleles for the sample + * @param targetAlleles the list of target alleles + */ + private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, + final VariantContext VC, + final List remappedAlleles, + final List targetAlleles) { + final int maximumPloidy = VC.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY); + // the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies) + // we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible. + final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][]; + final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size()); + final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart()); + + for ( final Genotype g : VC.getGenotypes() ) { + final String name = g.getSampleName(); + if ( mergedGenotypes.containsSample(name) ) + continue; + final int ploidy = g.getPloidy(); + final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())); + if (g.hasPL()) { + // lazy initialization of the genotype index map by ploidy. + final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null + ? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(indexesOfRelevantAlleles) + : genotypeIndexMapsByPloidy[ploidy]; + final int[] PLs = generatePL(g, genotypeIndexMapByPloidy); + final int[] AD = g.hasAD() ? generateAD(g.getAD(), indexesOfRelevantAlleles) : null; + genotypeBuilder.PL(PLs).AD(AD).noGQ(); + } + mergedGenotypes.add(genotypeBuilder.make()); + } + } + + /** + * Composes a new likelihood array given the original genotype and the genotype index map. 
+ * + * @param g the original genotype. + * @param genotypeIndexMapByPloidy genotype index map. The ith element indicates what genotype in {@code g} corresponds + * to the ith genotype in the return likelihoods array. + * + * @throws NullPointerException if {@code g} or {@code genotypeIndexMapByPloidy} is {@code null}, or if {@code g} + * does not contain likelihoods. + * @throws IndexOutOfBoundsException if {@code genotypeIndexMapByPloidy} contain non valid + * genotype indices given the likelihood array in {@code g}. + * + * @return never {@code null} but an array of exactly {@code genotypeIndexMapByPloidy.length} positions. + */ + private static int[] generatePL(final Genotype g, final int[] genotypeIndexMapByPloidy) { + final int[] PLs = new int[genotypeIndexMapByPloidy.length]; + final int[] oldPLs = g.getPL(); + for (int i = 0; i < PLs.length; i++) + PLs[i] = oldPLs[genotypeIndexMapByPloidy[i]]; + return PLs; + } + + /** + * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "" as appropriate. + * If the myAlleles set does not contain "" as an allele, it throws an exception. 
+ * + * @param remappedAlleles the list of alleles to evaluate + * @param targetAlleles the target list of alleles + * @param position position to use for error messages + * @return non-null array of ints representing indexes + */ + protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position) { + + if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); + if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); + + if ( !remappedAlleles.contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ) + throw new UserException("The list of input alleles must contain " + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records"); + final int indexOfGenericAlt = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + + final int[] indexMapping = new int[targetAlleles.size()]; + + // the reference alleles always match up (even if they don't appear to) + indexMapping[0] = 0; + + // create the index mapping, using the allele whenever such a mapping doesn't exist + for ( int i = 1; i < targetAlleles.size(); i++ ) { + final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); + indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt : indexOfRemappedAllele; + } + + return indexMapping; + } + + /** + * Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current + * alleles from the original AD. 
+ * + * @param originalAD the original AD to extend + * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles + * @return non-null array of new AD values + */ + protected static int[] generateAD(final int[] originalAD, final int[] indexesOfRelevantAlleles) { + if ( originalAD == null || indexesOfRelevantAlleles == null ) throw new IllegalArgumentException("The list of input AD values and alleles must not be null"); + + final int numADs = indexesOfRelevantAlleles.length; + final int[] newAD = new int[numADs]; + + for ( int i = 0; i < numADs; i++ ) { + final int oldIndex = indexesOfRelevantAlleles[i]; + if ( oldIndex >= originalAD.length ) + newAD[i] = 0; + else + newAD[i] = originalAD[oldIndex]; + } + + return newAD; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java index 136d3d67e..8b94a56a6 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RegenotypeVariants.java @@ -52,24 +52,24 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.FixedAFCalculatorProvider; import 
org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; import org.broadinstitute.gatk.tools.walkers.genotyper.*; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gga/GenotypingGivenAllelesUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gga/GenotypingGivenAllelesUtils.java index 3d501a71c..134842bcc 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gga/GenotypingGivenAllelesUtils.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/gga/GenotypingGivenAllelesUtils.java @@ -53,7 +53,7 @@ package org.broadinstitute.gatk.utils.gga; import org.apache.log4j.Logger; import 
org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java deleted file mode 100644 index 57f823dac..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculator.java +++ /dev/null @@ -1,204 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import com.google.java.contract.Requires; -import htsjdk.variant.variantcontext.VariantContext; -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.AlleleListUtils; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; - -import java.util.*; - -/** - * Computes the likelihood based probability that haplotypes for first and second variant contexts - * only appear in their fully linked form (x11 and x22) given a set of haplotypes where they might occur - * and read likelihoods per sample - * - * User: depristo - * Date: 3/29/13 - * Time: 9:23 AM - */ -public class HaplotypeLDCalculator { - private final List haplotypes; - private final ReadLikelihoods readLikelihoods; - private List> 
haplotypeLikelihoodsPerSample = null; - - // linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] - private final double[] table = new double[4]; - - /** - * For testing - */ - @SuppressWarnings("unchecked") - protected HaplotypeLDCalculator() { - haplotypes = Collections.emptyList(); - final AlleleList alleleList = AlleleListUtils.emptyList(); - readLikelihoods = new ReadLikelihoods<>(SampleListUtils.emptyList(), - alleleList, Collections.EMPTY_MAP); - } - - public HaplotypeLDCalculator(final List haplotypes, final ReadLikelihoods haplotypeReadMap) { - this.haplotypes = haplotypes; - this.readLikelihoods = haplotypeReadMap; - } - - /** - * Construct the cached list of summed haplotype likelihoods per sample if it - * hasn't already been computed. This data structure is lazy created but only - * needs to be made once when we make 1 merge decision as the data doesn't change - * no matter how many calls to computeProbOfBeingPhased - */ - private void buildHaplotypeLikelihoodsPerSampleIfNecessary() { - if ( haplotypeLikelihoodsPerSample == null ) { - // do the lazy computation - final Set samples = new LinkedHashSet<>(readLikelihoods.samples()); - haplotypeLikelihoodsPerSample = new LinkedList<>(); - for( final String sample : samples ) { - final Map map = new HashMap<>(haplotypes.size()); - for( final Haplotype h : haplotypes ) { - // count up the co-occurrences of the events for the R^2 calculation - final double haplotypeLikelihood = PairHMMLikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, readLikelihoods, Collections.singletonList(h), false)[0][0]; - map.put(h, haplotypeLikelihood); - } - haplotypeLikelihoodsPerSample.add(map); - } - } - } - - /** - * Compute the likelihood based probability that that haplotypes for first and second are only x11 and x22 - * - * As opposed to the hypothesis that all four haplotypes (x11, x12, x21, and x22) exist in the population - * - * @param first a 
non-null VariantContext - * @param second a non-null VariantContext - * @return the probability that only x11 and x22 exist among the samples - */ - protected double computeProbOfBeingPhased(final VariantContext first, final VariantContext second) { - buildHaplotypeLikelihoodsPerSampleIfNecessary(); - - Arrays.fill(table, Double.NEGATIVE_INFINITY); - - for ( final Map entry : haplotypeLikelihoodsPerSample ) { - for ( final Map.Entry haplotypeLikelihood : entry.entrySet() ) { - final Haplotype h = haplotypeLikelihood.getKey(); - // count up the co-occurrences of the events for the R^2 calculation - final VariantContext thisHapVC = h.getEventMap().get(first.getStart()); - final VariantContext nextHapVC = h.getEventMap().get(second.getStart()); // TODO -- add function to take a VC - final int i = thisHapVC == null ? 0 : 1; - final int j = nextHapVC == null ? 0 : 1; - final int index = 2 * i + j; - table[index] = MathUtils.approximateLog10SumLog10(table[index], haplotypeLikelihood.getValue()); - } - } - - return pPhased(table); - } - - /** - * Compute probability that two variants are in phase with each other and that no - * compound hets exist in the population. - * - * Implemented as a likelihood ratio test of the hypothesis: - * - * x11 and x22 are the only haplotypes in the populations - * - * vs. - * - * all four haplotype combinations (x11, x12, x21, and x22) all exist in the population. - * - * Now, since we have to have both variants in the population, we exclude the x11 & x11 state. So the - * p of having just x11 and x22 is P(x11 & x22) + p(x22 & x22). 
- * - * Alternatively, we might have any configuration that gives us both 1 and 2 alts, which are: - * - * - P(x11 & x12 & x21) -- we have hom-ref and both hets - * - P(x22 & x12 & x21) -- we have hom-alt and both hets - * - P(x22 & x12) -- one haplotype is 22 and the other is het 12 - * - P(x22 & x21) -- one haplotype is 22 and the other is het 21 - * - * The probability is just p11_22 / (p11_22 + p hets) - * - * @param table linear contigency table with table[0] == [0][0], table[1] = [0][1], table[2] = [1][0], table[3] = [1][1] - * doesn't have to be normalized as this function does the normalization internally - * @return the real space probability that the data is phased - */ - @Requires("table.length == 4") - protected double pPhased( double[] table ) { - final double[] normTable = MathUtils.normalizeFromLog10(table, true); - - final double x11 = normTable[0], x12 = normTable[1], x21 = normTable[2], x22 = normTable[3]; - - // probability that we are only x11 && x22 - final double p11_22 = MathUtils.approximateLog10SumLog10(x11 + x22, x22 + x22); - - // probability of having any of the other pairs - final double p11_12_21 = MathUtils.approximateLog10SumLog10(x11 + x12, x11 + x21, x12 + x21); - final double p22_12_21 = MathUtils.approximateLog10SumLog10(x22 + x12, x22 + x21, x12 + x21); - final double p22_12 = x22 + x12; - final double p22_21 = x22 + x21; - final double pOthers = MathUtils.approximateLog10SumLog10(new double[]{p11_12_21, p22_12_21, p22_12, p22_21}); - - // probability of being phases is the ratio of p11_22 / pOthers which in log space is just a substraction - final double log10phased = p11_22 - (MathUtils.approximateLog10SumLog10(p11_22, pOthers)); - - return Math.pow(10.0, log10phased); - } - - protected double pPhasedTest( final double x11, final double x12, final double x21, final double x22 ) { - return pPhased(new double[]{x11, x12, x21, x22}); - } -} diff --git 
a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/LDMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/LDMerger.java deleted file mode 100644 index 53b4cff58..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/LDMerger.java +++ /dev/null @@ -1,313 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.apache.commons.lang.ArrayUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.GenomeLoc; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; - -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import java.util.TreeSet; - -/** - * Merges VariantContexts in a series of haplotypes according to their pairwise LD - * - * User: depristo - * Date: 3/28/13 - * Time: 6:17 PM - */ -public class LDMerger extends MergeVariantsAcrossHaplotypes { - private final static Logger logger = Logger.getLogger(LDMerger.class); - - private final boolean DEBUG; - private final int minSamplesToMergeSNPs; - private final int minSamplesToMergeOtherEvents; - - public LDMerger(boolean DEBUG, int minSamplesToMergeSNPs, int minSamplesToMergeOtherEvents) { - super(); - this.DEBUG = DEBUG; - this.minSamplesToMergeSNPs = minSamplesToMergeSNPs; - this.minSamplesToMergeOtherEvents = minSamplesToMergeOtherEvents; - } - - protected LDMerger() { - this(false, 1, 1); - } - - // TODO -- should be class arguments and static 
variables in HC - protected final static int MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE = 6; - protected final static int MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE = 25; - - /** - * We require 99% confidence that only the phased haplotypes exist in the population to merge the records - */ - protected final static double MERGE_EVENTS_PROB_PHASED_THRESHOLD = 0.99; - - /** - * Merge as many events among the haplotypes as possible based on pairwise LD among variants - * - * @param haplotypes a list of haplotypes whose events we want to merge - * @param readLikelihoods map from sample name -> read likelihoods for each haplotype - * @param startPosKeySet a set of starting positions of all events among the haplotypes - * @param ref the reference bases - * @param refLoc the span of the reference bases - */ - @Override - public boolean merge( final List haplotypes, - final ReadLikelihoods readLikelihoods, - final TreeSet startPosKeySet, - final byte[] ref, - final GenomeLoc refLoc ) { - if ( haplotypes == null ) throw new IllegalArgumentException("haplotypes cannot be null"); - if ( readLikelihoods == null ) throw new IllegalArgumentException("readLikelihoods cannot be null"); - if ( startPosKeySet == null ) throw new IllegalArgumentException("startPosKeySet cannot be null"); - if ( ref == null ) throw new IllegalArgumentException("ref cannot be null"); - if ( refLoc == null ) throw new IllegalArgumentException("refLoc cannot be null"); - if ( refLoc.size() != ref.length ) throw new IllegalArgumentException("refLoc size " + refLoc.size() + " != ref.length " + ref.length + " at " + refLoc); - - if( startPosKeySet.size() <= 1 ) { return false; } - - final int nSamples = readLikelihoods.sampleCount(); - final HaplotypeLDCalculator r2Calculator = new HaplotypeLDCalculator(haplotypes, readLikelihoods); - boolean somethingWasMerged = false; - boolean mapWasUpdated = true; - while( mapWasUpdated ) { - mapWasUpdated = mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calculator, nSamples, 
startPosKeySet, ref, refLoc); - somethingWasMerged |= mapWasUpdated; - } - return somethingWasMerged; - } - - /** - * Merge the next pair of events, if possible - * - * @param haplotypes a list of haplotypes whose events we want to merge - * @param ldCalculator calculates R^2 for pairs of events on demand - * @param startPosKeySet a set of starting positions of all events among the haplotypes - * @param ref the reference bases - * @param refLoc the span of the reference bases - * @return true if something was merged, false otherwise - */ - protected boolean mergeConsecutiveEventsBasedOnLDOnce( final List haplotypes, - final HaplotypeLDCalculator ldCalculator, - final int nSamples, - final TreeSet startPosKeySet, - final byte[] ref, - final GenomeLoc refLoc ) { - // loop over the set of start locations and consider pairs that start near each other - final Iterator iter = startPosKeySet.iterator(); - int thisStart = iter.next(); - while( iter.hasNext() ) { - final int nextStart = iter.next(); - final LDMergeData toMerge = getPairOfEventsToMerge(haplotypes, thisStart, nextStart); - - if ( toMerge.canBeMerged(nSamples) ) { - final double pPhased = ldCalculator.computeProbOfBeingPhased(toMerge.firstVC, toMerge.secondVC); - - if( DEBUG ) { - logger.info("Found consecutive biallelic events with R^2 = " + String.format("%.4f", pPhased)); - logger.info("-- " + toMerge.firstVC); - logger.info("-- " + toMerge.secondVC); - } - - if( pPhased > MERGE_EVENTS_PROB_PHASED_THRESHOLD) { - final VariantContext mergedVC = createMergedVariantContext(toMerge.firstVC, toMerge.secondVC, ref, refLoc); - // if for some reason the merging resulting in a bad allele, mergedVC will be null, and we will just remove first and second - replaceVariantContextsInMap(haplotypes, startPosKeySet, mergedVC, toMerge.firstVC, toMerge.secondVC); - return true; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events - } - } - - thisStart = nextStart; 
- } - - return false; - } - - /** - * Info about potential LD merge of two variant contexts - */ - private class LDMergeData { - VariantContext firstVC = null, secondVC = null; - boolean canBeMerged = true; - - /** Tell this object that it cant be merged for some reason */ - public LDMergeData cantBeMerged() { - canBeMerged = false; - return this; - } - - /** - * Can these two events be merged - * @param nSamples the number of samples we're considering - * @return true if we can merge our two variant contexts - */ - public boolean canBeMerged(final int nSamples) { - if ( ! canBeMerged || firstVC == null || secondVC == null ) - return false; - - final int distance = secondVC.getStart() - firstVC.getEnd(); - if ( firstVC.isSNP() && secondVC.isSNP() ) { - return nSamples >= minSamplesToMergeSNPs && distance <= MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE; - } else { - return nSamples >= minSamplesToMergeOtherEvents && distance <= MAX_DISTANCE_BETWEEN_OTHER_EVENTS_TO_MERGE; - } - } - } - - /** - * Get the information about the potential merge of two events starting at thisStart and nextStart - * @param haplotypes our haplotypes - * @param thisStart the starting position of the first event to merge - * @param nextStart the starting position of the next event to merge - * @return never {@code null}. 
- */ - private LDMergeData getPairOfEventsToMerge(final List haplotypes, final int thisStart, final int nextStart) { - final LDMergeData mergeData = new LDMergeData(); - - for( final Haplotype h : haplotypes ) { - // only make complex substitutions out of consecutive biallelic sites - final VariantContext thisHapVC = h.getEventMap().get(thisStart); - if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype - if( mergeData.firstVC == null ) { - mergeData.firstVC = thisHapVC; - } else if( !thisHapVC.hasSameAllelesAs( mergeData.firstVC) ) { - return mergeData.cantBeMerged(); - } - } - final VariantContext nextHapVC = h.getEventMap().get(nextStart); - if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype - if( mergeData.secondVC == null ) { - mergeData.secondVC = nextHapVC; - } else if( !nextHapVC.hasSameAllelesAs( mergeData.secondVC) ) { - return mergeData.cantBeMerged(); - } - } - } - - // don't try to merge overlapping events - if ( mergeData.firstVC != null && mergeData.secondVC != null && mergeData.firstVC.getEnd() >= mergeData.secondVC.getStart() ) - return mergeData.cantBeMerged(); - - return mergeData; - } - - // BUGBUG: make this merge function more general - protected VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) { - final int thisStart = thisVC.getStart(); - final int nextStart = nextVC.getStart(); - byte[] refBases = new byte[]{}; - byte[] altBases = new byte[]{}; - refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases()); - altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases()); - int locus; - for( locus = thisStart + refBases.length; locus < nextStart; locus++ ) { - final byte refByte = ref[locus - refLoc.getStart()]; - refBases = ArrayUtils.add(refBases, refByte); - altBases = ArrayUtils.add(altBases, 
refByte); - } - refBases = ArrayUtils.addAll(refBases, ArrayUtils.subarray(nextVC.getReference().getBases(), locus > nextStart ? 1 : 0, nextVC.getReference().getBases().length)); // special case of deletion including the padding base of consecutive indel - altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases()); - - int iii = 0; - if( refBases.length == altBases.length ) { // insertion + deletion of same length creates an MNP --> trim common prefix bases off the beginning of the allele - while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; } - if ( iii == refBases.length ) { - // we've become a null allele, such as with CA/C + A/AA -> CA/CA => after trimming there's nothing left - // so return a null variant context so we can eliminate the variants from consideration - return null; - } - } - - - final Allele refAllele = Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true ); - final Allele altAllele = Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false ); - return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), Arrays.asList(refAllele, altAllele)).make(); - } - - /** - * Update the event maps in all haplotypes to replace a replacement of update1 and 2 with replacement - * - * @param haplotypes the haplotypes whose event maps we need to update - * @param startPosKeySet a sorted set of start positions that we must update - * @param replacement a VariantContext to replace update1 and update2 with. 
Can be null, indicating that we just want to remove update1 and update2 - * @param update1 the first VC we want to update - * @param update2 the second VC we want to update - */ - private void replaceVariantContextsInMap(final List haplotypes, - final TreeSet startPosKeySet, - final VariantContext replacement, - final VariantContext update1, final VariantContext update2) { - // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event - for( final Haplotype h : haplotypes ) { - // if we had both events, add replacement. In some cases the haplotype may not have both - // events but they were still merged because the haplotype isn't a particularly informative - // haplotype in any case. The order of operations here is important because we are modifying the map - final boolean shouldAdd = h.getEventMap().containsKey(update1.getStart()) && h.getEventMap().containsKey(update2.getStart()); - h.getEventMap().remove(update1.getStart()); - h.getEventMap().remove(update2.getStart()); - if ( shouldAdd && replacement != null ) { - h.getEventMap().addVC(replacement, false); // cannot merge we other events at the same position - } - } - - startPosKeySet.remove(update1.getStart()); - startPosKeySet.remove(update2.getStart()); - if ( replacement != null ) startPosKeySet.add(replacement.getStart()); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/MergeVariantsAcrossHaplotypes.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/MergeVariantsAcrossHaplotypes.java deleted file mode 100644 index 403e02988..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotype/MergeVariantsAcrossHaplotypes.java +++ /dev/null @@ -1,83 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH 
PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. 
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; - -import java.util.List; -import java.util.TreeSet; - -/** - * Baseclass for code that wants to merge variants together in the haplotype caller - * - * This root class is basically a no-op, and can be used to not do any merging - */ -public class MergeVariantsAcrossHaplotypes { - /** - * Merge variants across the haplotypes, updating the haplotype event maps and startPos set as appropriate - * - * @param haplotypes a list of haplotypes whose events we want to merge - * @param readLikelihoods map from sample name -> read likelihoods for each haplotype - * @param startPosKeySet a set of starting positions of all events among the haplotypes - * @param ref the reference bases - * @param refLoc the span of the reference bases - * @return true if anything was merged - */ - public boolean merge( final List haplotypes, - final ReadLikelihoods readLikelihoods, - final TreeSet startPosKeySet, - final byte[] ref, - final GenomeLoc refLoc ) { - return false; - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java index b3c096522..4edfd4f0f 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -53,7 +53,7 @@ package org.broadinstitute.gatk.utils.haplotypeBAMWriter; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMTag; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; import org.broadinstitute.gatk.utils.GenomeLoc; import 
org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/ReadDestination.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/ReadDestination.java index ec876bd03..567a3635e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/ReadDestination.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/haplotypeBAMWriter/ReadDestination.java @@ -54,7 +54,7 @@ package org.broadinstitute.gatk.utils.haplotypeBAMWriter; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMReadGroupRecord; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.util.ArrayList; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMM.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMM.java index cf5168f44..28add0f64 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMM.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMM.java @@ -51,7 +51,6 @@ package org.broadinstitute.gatk.utils.pairhmm; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.PairHMMLikelihoodCalculationEngine; import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; @@ -398,7 +397,7 @@ public class FastLoglessPairHMM extends LoglessPairHMM implements FlexibleHMM { for (int kkk = 0; kkk < readQuals.length; kkk++) { readQuals[kkk] = (byte) Math.min(0xff & readQuals[kkk], mq); // 
cap base quality by mapping - readQuals[kkk] = (byte) (readQuals[kkk] < PairHMMLikelihoodCalculationEngine.BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE + readQuals[kkk] = (byte) (readQuals[kkk] < BASE_QUALITY_SCORE_THRESHOLD ? QualityUtils.MIN_USABLE_Q_SCORE : Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readQuals[kkk])); readInsQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readInsQuals[kkk]); readDelQuals[kkk] = (byte) Math.max(QualityUtils.MIN_USABLE_Q_SCORE,readDelQuals[kkk]); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRReadTransformer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRReadTransformer.java deleted file mode 100644 index 5b342c8be..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRReadTransformer.java +++ /dev/null @@ -1,104 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.WalkerManager; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -/** - * A ReadTransformer that applies BQSR on the fly to reads - * - * User: rpoplin - * Date: 2/13/12 - */ -public class BQSRReadTransformer extends ReadTransformer { - private boolean enabled; - private BaseRecalibration bqsr = null; - - @Override - public OrderingConstraint getOrderingConstraint() { return OrderingConstraint.MUST_BE_FIRST; } - - @Override - public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { - this.enabled = engine.hasBQSRArgumentSet(); - if ( enabled ) { - // TODO -- See important note below about applying BQSR to a reduced BAM file: - // If it is important to make sure that BQSR is not applied (as opposed to having the covariates computed) 
against a reduced bam file, - // we need to figure out how to make this work. The problem is that the ReadTransformers are initialized before the ReadDataSource - // inside the GenomeAnalysisEngine, so we generate a NPE when trying to retrieve the SAMFileHeaders. Ultimately, I don't think this is - // a necessary check anyways since we disallow running BaseRecalibrator on reduced bams (so we can't generate the recal tables to use here). - // Although we could add this check to the apply() method below, it's kind of ugly and inefficient. - // The call here would be: RecalUtils.checkForInvalidRecalBams(engine.getSAMFileHeaders(), engine.getArguments().ALLOW_BQSR_ON_REDUCED_BAMS); - final BQSRArgumentSet args = engine.getBQSRArgumentSet(); - this.bqsr = new BaseRecalibration(args.getRecalFile(), args.getQuantizationLevels(), args.shouldDisableIndelQuals(), args.getPreserveQscoresLessThan(), args.shouldEmitOriginalQuals(), args.getGlobalQScorePrior()); - } - final BQSRMode mode = WalkerManager.getWalkerAnnotation(walker, BQSRMode.class); - return mode.ApplicationTime(); - } - - @Override - public boolean enabled() { - return enabled; - } - - /** - * initialize a new BQSRReadTransformer that applies BQSR on the fly to incoming reads. 
- */ - @Override - public GATKSAMRecord apply(GATKSAMRecord read) { - bqsr.recalibrateRead(read); - return read; - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BaseRecalibration.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BaseRecalibration.java deleted file mode 100644 index be64dc4de..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/BaseRecalibration.java +++ /dev/null @@ -1,207 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Ensures; -import htsjdk.samtools.SAMTag; -import htsjdk.samtools.SAMUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; - -/** - * Utility methods to facilitate on-the-fly base quality score recalibration. 
- * - * User: carneiro and rpoplin - * Date: 2/4/12 - */ - -public class BaseRecalibration { - private static Logger logger = Logger.getLogger(BaseRecalibration.class); - private final static boolean TEST_CACHING = false; - - private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) - private final RecalibrationTables recalibrationTables; - private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation - - private final boolean disableIndelQuals; - private final int preserveQLessThan; - private final double globalQScorePrior; - private final boolean emitOriginalQuals; - - /** - * Constructor using a GATK Report file - * - * @param RECAL_FILE a GATK Report file containing the recalibration information - * @param quantizationLevels number of bins to quantize the quality scores - * @param disableIndelQuals if true, do not emit base indel qualities - * @param preserveQLessThan preserve quality scores less than this value - */ - public BaseRecalibration(final File RECAL_FILE, final int quantizationLevels, final boolean disableIndelQuals, final int preserveQLessThan, final boolean emitOriginalQuals, final double globalQScorePrior) { - RecalibrationReport recalibrationReport = new RecalibrationReport(RECAL_FILE); - - recalibrationTables = recalibrationReport.getRecalibrationTables(); - requestedCovariates = recalibrationReport.getRequestedCovariates(); - quantizationInfo = recalibrationReport.getQuantizationInfo(); - if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores - quantizationInfo.noQuantization(); - else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. 
Negative values mean the user did not provide a quantization argument, and just wants to use what's in the report. - quantizationInfo.quantizeQualityScores(quantizationLevels); - - this.disableIndelQuals = disableIndelQuals; - this.preserveQLessThan = preserveQLessThan; - this.globalQScorePrior = globalQScorePrior; - this.emitOriginalQuals = emitOriginalQuals; - } - - /** - * Recalibrates the base qualities of a read - * - * It updates the base qualities of the read with the new recalibrated qualities (for all event types) - * - * Implements a serial recalibration of the reads using the combinational table. - * First, we perform a positional recalibration, and then a subsequent dinuc correction. - * - * Given the full recalibration table, we perform the following preprocessing steps: - * - * - calculate the global quality score shift across all data [DeltaQ] - * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift - * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual - * - The final shift equation is: - * - * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... 
) - * - * @param read the read to recalibrate - */ - public void recalibrateRead(final GATKSAMRecord read) { - if (emitOriginalQuals && read.getAttribute(SAMTag.OQ.name()) == null) { // Save the old qualities if the tag isn't already taken in the read - try { - read.setAttribute(SAMTag.OQ.name(), SAMUtils.phredToFastq(read.getBaseQualities())); - } catch (IllegalArgumentException e) { - throw new UserException.MalformedBAM(read, "illegal base quality encountered; " + e.getMessage()); - } - } - - final ReadCovariates readCovariates = RecalUtils.computeCovariates(read, requestedCovariates); - final int readLength = read.getReadLength(); - - for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings - if (disableIndelQuals && errorModel != EventType.BASE_SUBSTITUTION) { - read.setBaseQualities(null, errorModel); - continue; - } - - final byte[] quals = read.getBaseQualities(errorModel); - - // get the keyset for this base using the error model - final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel); - - // the rg key is constant over the whole read, the global deltaQ is too - final int rgKey = fullReadKeySet[0][0]; - final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal()); - - if( empiricalQualRG != null ) { - final double epsilon = ( globalQScorePrior > 0.0 && errorModel.equals(EventType.BASE_SUBSTITUTION) ? 
globalQScorePrior : empiricalQualRG.getEstimatedQReported() ); - - for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read - final byte origQual = quals[offset]; - - // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) - if ( origQual >= preserveQLessThan ) { - // get the keyset for this base using the error model - final int[] keySet = fullReadKeySet[offset]; - final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(keySet[0], keySet[1], errorModel.ordinal()); - final List empiricalQualCovs = new ArrayList(); - for (int i = 2; i < requestedCovariates.length; i++) { - if (keySet[i] < 0) { - continue; - } - empiricalQualCovs.add(recalibrationTables.getTable(i).get(keySet[0], keySet[1], keySet[i], errorModel.ordinal())); - } - - double recalibratedQualDouble = hierarchicalBayesianQualityEstimate( epsilon, empiricalQualRG, empiricalQualQS, empiricalQualCovs ); - - // recalibrated quality is bound between 1 and MAX_QUAL - final byte recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQualDouble), RecalDatum.MAX_RECALIBRATED_Q_SCORE); - - // return the quantized version of the recalibrated quality - final byte recalibratedQualityScore = quantizationInfo.getQuantizedQuals().get(recalibratedQual); - - quals[offset] = recalibratedQualityScore; - } - } - } - - // finally update the base qualities in the read - read.setBaseQualities(quals, errorModel); - } - } - - @Ensures("result > 0.0") - protected static double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List empiricalQualCovs ) { - final double globalDeltaQ = ( empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon ); - final double deltaQReported = ( empiricalQualQS == null ? 
0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon) ); - double deltaQCovariates = 0.0; - for( final RecalDatum empiricalQualCov : empiricalQualCovs ) { - deltaQCovariates += ( empiricalQualCov == null ? 0.0 : empiricalQualCov.getEmpiricalQuality(deltaQReported + globalDeltaQ + epsilon) - (deltaQReported + globalDeltaQ + epsilon) ); - } - - return epsilon + globalDeltaQ + deltaQReported + deltaQCovariates; - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizer.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizer.java deleted file mode 100644 index c33089449..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizer.java +++ /dev/null @@ -1,500 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.io.PrintStream; -import java.util.*; - -/** - * A general algorithm for quantizing quality score distributions to use a specific number of levels - * - * Takes a histogram of quality scores and a desired number of levels and produces a - * map from original quality scores -> quantized quality scores. - * - * Note that this data structure is fairly heavy-weight, holding lots of debugging and - * calculation information. 
If you want to use it efficiently at scale with lots of - * read groups the right way to do this: - * - * Map> map - * for each read group rg: - * hist = getQualHist(rg) - * QualQuantizer qq = new QualQuantizer(hist, nLevels, minInterestingQual) - * map.set(rg, qq.getOriginalToQuantizedMap()) - * - * This map would then be used to look up the appropriate original -> quantized - * quals for each read as it comes in. - * - * @author Mark Depristo - * @since 3/2/12 - */ -public class QualQuantizer { - final private static Set MY_EMPTY_SET = Collections.emptySet(); - - private static Logger logger = Logger.getLogger(QualQuantizer.class); - - /** - * Inputs to the QualQuantizer - */ - final int nLevels, minInterestingQual; - final List nObservationsPerQual; - - /** - * Map from original qual (e.g., Q30) to new quantized qual (e.g., Q28). - * - * Has the same range as nObservationsPerQual - */ - final List originalToQuantizedMap; - - /** Sorted set of qual intervals. - * - * After quantize() this data structure contains only the top-level qual intervals - */ - final TreeSet quantizedIntervals; - - /** - * Protected creator for testng use only - */ - protected QualQuantizer(final int minInterestingQual) { - this.nObservationsPerQual = Collections.emptyList(); - this.nLevels = 0; - this.minInterestingQual = minInterestingQual; - this.quantizedIntervals = null; - this.originalToQuantizedMap = null; - } - - /** - * Creates a QualQuantizer for the histogram that has nLevels - * - * Note this is the only interface to the system. After creating this object - * the map can be obtained via getOriginalToQuantizedMap() - * - * @param nObservationsPerQual A histogram of counts of bases with quality scores. Note that - * this histogram must start at 0 (i.e., get(0) => count of Q0 bases) and must include counts all the - * way up to the largest quality score possible in the reads. OK if the histogram includes many 0 - * count bins, as these are quantized for free. 
- * @param nLevels the desired number of distinct quality scores to represent the full original range. Must - * be at least 1. - * @param minInterestingQual All quality scores <= this value are considered uninteresting and are freely - * merged together. For example, if this value is 10, then Q0-Q10 are all considered free to merge, and - * quantized into a single value. For ILMN data with lots of Q2 bases this results in a Q2 bin containing - * all data with Q0-Q10. - */ - public QualQuantizer(final List nObservationsPerQual, final int nLevels, final int minInterestingQual) { - this.nObservationsPerQual = nObservationsPerQual; - this.nLevels = nLevels; - this.minInterestingQual = minInterestingQual; - - // some sanity checking - if ( Collections.min(nObservationsPerQual) < 0 ) throw new ReviewedGATKException("Quality score histogram has negative values at: " + Utils.join(", ", nObservationsPerQual)); - if ( nLevels < 0 ) throw new ReviewedGATKException("nLevels must be >= 0"); - if ( minInterestingQual < 0 ) throw new ReviewedGATKException("minInterestingQual must be >= 0"); - - // actually run the quantizer - this.quantizedIntervals = quantize(); - - // store the map - this.originalToQuantizedMap = intervalsToMap(quantizedIntervals); - } - - /** - * Represents an contiguous interval of quality scores. - * - * qStart and qEnd are inclusive, so qStart = qEnd = 2 is the quality score bin of 2 - */ - @Invariant({ - "qStart <= qEnd", - "qStart >= 0", - "qEnd <= 1000", - "nObservations >= 0", - "nErrors >= 0", - "nErrors <= nObservations", - "fixedQual >= -1 && fixedQual <= QualityUtils.MAX_SAM_QUAL_SCORE", - "mergeOrder >= 0"}) - protected final class QualInterval implements Comparable { - final int qStart, qEnd, fixedQual, level; - final long nObservations, nErrors; - final Set subIntervals; - - /** for debugging / visualization. When was this interval created? 
*/ - int mergeOrder; - - protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level) { - this(qStart, qEnd, nObservations, nErrors, level, -1, MY_EMPTY_SET); - } - - protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final Set subIntervals) { - this(qStart, qEnd, nObservations, nErrors, level, -1, subIntervals); - } - - protected QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual) { - this(qStart, qEnd, nObservations, nErrors, level, fixedQual, MY_EMPTY_SET); - } - - @Requires("level >= 0") - public QualInterval(final int qStart, final int qEnd, final long nObservations, final long nErrors, final int level, final int fixedQual, final Set subIntervals) { - this.qStart = qStart; - this.qEnd = qEnd; - this.nObservations = nObservations; - this.nErrors = nErrors; - this.fixedQual = fixedQual; - this.level = level; - this.mergeOrder = 0; - this.subIntervals = Collections.unmodifiableSet(subIntervals); - } - - /** - * @return Human readable name of this interval: e.g., 10-12 - */ - public String getName() { - return qStart + "-" + qEnd; - } - - @Override - public String toString() { - return "QQ:" + getName(); - } - - /** - * @return the error rate (in real space) of this interval, or 0 if there are no observations - */ - @Ensures("result >= 0.0") - public double getErrorRate() { - if ( hasFixedQual() ) - return QualityUtils.qualToErrorProb((byte)fixedQual); - else if ( nObservations == 0 ) - return 0.0; - else - return (nErrors+1) / (1.0 * (nObservations+1)); - } - - /** - * @return the QUAL of the error rate of this interval, or the fixed qual if this interval was created with a fixed qual. - */ - @Ensures("result >= 0 && result <= QualityUtils.MAX_SAM_QUAL_SCORE") - public byte getQual() { - if ( ! 
hasFixedQual() ) - return QualityUtils.errorProbToQual(getErrorRate()); - else - return (byte)fixedQual; - } - - /** - * @return true if this bin is using a fixed qual - */ - public boolean hasFixedQual() { - return fixedQual != -1; - } - - @Override - public int compareTo(final QualInterval qualInterval) { - return Integer.valueOf(this.qStart).compareTo(qualInterval.qStart); - } - - /** - * Create a interval representing the merge of this interval and toMerge - * - * Errors and observations are combined - * Subintervals updated in order of left to right (determined by qStart) - * Level is 1 + highest level of this and toMerge - * Order must be updated elsewhere - * - * @param toMerge - * @return newly created merged QualInterval - */ - @Requires({"toMerge != null"}) - @Ensures({ - "result != null", - "result.nObservations >= this.nObservations", - "result.nObservations >= toMerge.nObservations", - "result.nErrors >= this.nErrors", - "result.nErrors >= toMerge.nErrors", - "result.qStart == Math.min(this.qStart, toMerge.qStart)", - "result.qEnd == Math.max(this.qEnd, toMerge.qEnd)", - "result.level > Math.max(this.level, toMerge.level)", - "result.subIntervals.size() == 2" - }) - public QualInterval merge(final QualInterval toMerge) { - final QualInterval left = this.compareTo(toMerge) < 0 ? this : toMerge; - final QualInterval right = this.compareTo(toMerge) < 0 ? 
toMerge : this; - - if ( left.qEnd + 1 != right.qStart ) - throw new ReviewedGATKException("Attempting to merge non-contiguous intervals: left = " + left + " right = " + right); - - final long nCombinedObs = left.nObservations + right.nObservations; - final long nCombinedErr = left.nErrors + right.nErrors; - - final int level = Math.max(left.level, right.level) + 1; - final Set subIntervals = new HashSet(Arrays.asList(left, right)); - QualInterval merged = new QualInterval(left.qStart, right.qEnd, nCombinedObs, nCombinedErr, level, subIntervals); - - return merged; - } - - public double getPenalty() { - return calcPenalty(getErrorRate()); - } - - - /** - * Calculate the penalty of this interval, given the overall error rate for the interval - * - * If the globalErrorRate is e, this value is: - * - * sum_i |log10(e_i) - log10(e)| * nObservations_i - * - * each the index i applies to all leaves of the tree accessible from this interval - * (found recursively from subIntervals as necessary) - * - * @param globalErrorRate overall error rate in real space against which we calculate the penalty - * @return the cost of approximating the bins in this interval with the globalErrorRate - */ - @Requires("globalErrorRate >= 0.0") - @Ensures("result >= 0.0") - private double calcPenalty(final double globalErrorRate) { - if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty - return 0.0; - - if ( subIntervals.isEmpty() ) { - // this is leave node - if ( this.qEnd <= minInterestingQual ) - // It's free to merge up quality scores below the smallest interesting one - return 0; - else { - return (Math.abs(Math.log10(getErrorRate()) - Math.log10(globalErrorRate))) * nObservations; - } - } else { - double sum = 0; - for ( final QualInterval interval : subIntervals ) - sum += interval.calcPenalty(globalErrorRate); - return sum; - } - } - } - - /** - * Main method for computing the quantization intervals. 
- * - * Invoked in the constructor after all input variables are initialized. Walks - * over the inputs and builds the min. penalty forest of intervals with exactly nLevel - * root nodes. Finds this min. penalty forest via greedy search, so is not guarenteed - * to find the optimal combination. - * - * TODO: develop a smarter algorithm - * - * @return the forest of intervals with size == nLevels - */ - @Ensures({"! result.isEmpty()", "result.size() == nLevels"}) - private TreeSet quantize() { - // create intervals for each qual individually - final TreeSet intervals = new TreeSet(); - for ( int qStart = 0; qStart < getNQualsInHistogram(); qStart++ ) { - final long nObs = nObservationsPerQual.get(qStart); - final double errorRate = QualityUtils.qualToErrorProb((byte)qStart); - final double nErrors = nObs * errorRate; - final QualInterval qi = new QualInterval(qStart, qStart, nObs, (int)Math.floor(nErrors), 0, (byte)qStart); - intervals.add(qi); - } - - // greedy algorithm: - // while ( n intervals >= nLevels ): - // find intervals to merge with least penalty - // merge it - while ( intervals.size() > nLevels ) { - mergeLowestPenaltyIntervals(intervals); - } - - return intervals; - } - - /** - * Helper function that finds and merges together the lowest penalty pair of intervals - * @param intervals - */ - @Requires("! 
intervals.isEmpty()") - private void mergeLowestPenaltyIntervals(final TreeSet intervals) { - // setup the iterators - final Iterator it1 = intervals.iterator(); - final Iterator it1p = intervals.iterator(); - it1p.next(); // skip one - - // walk over the pairs of left and right, keeping track of the pair with the lowest merge penalty - QualInterval minMerge = null; - if ( logger.isDebugEnabled() ) logger.debug("mergeLowestPenaltyIntervals: " + intervals.size()); - int lastMergeOrder = 0; - while ( it1p.hasNext() ) { - final QualInterval left = it1.next(); - final QualInterval right = it1p.next(); - final QualInterval merged = left.merge(right); - lastMergeOrder = Math.max(Math.max(lastMergeOrder, left.mergeOrder), right.mergeOrder); - if ( minMerge == null || (merged.getPenalty() < minMerge.getPenalty() ) ) { - if ( logger.isDebugEnabled() ) logger.debug(" Updating merge " + minMerge); - minMerge = merged; - } - } - - // now actually go ahead and merge the minMerge pair - if ( logger.isDebugEnabled() ) logger.debug(" => final min merge " + minMerge); - intervals.removeAll(minMerge.subIntervals); - intervals.add(minMerge); - minMerge.mergeOrder = lastMergeOrder + 1; - if ( logger.isDebugEnabled() ) logger.debug("updated intervals: " + intervals); - } - - /** - * Given a final forest of intervals constructs a list mapping - * list.get(i) => quantized qual to use for original quality score i - * - * This function should be called only once to initialize the corresponding - * cached value in this object, as the calculation is a bit costly. 
- * - * @param intervals - * @return - */ - @Ensures("result.size() == getNQualsInHistogram()") - private List intervalsToMap(final TreeSet intervals) { - final List map = new ArrayList(getNQualsInHistogram()); - map.addAll(Collections.nCopies(getNQualsInHistogram(), Byte.MIN_VALUE)); - for ( final QualInterval interval : intervals ) { - for ( int q = interval.qStart; q <= interval.qEnd; q++ ) { - map.set(q, interval.getQual()); - } - } - - if ( Collections.min(map) == Byte.MIN_VALUE ) - throw new ReviewedGATKException("quantized quality score map contains an un-initialized value"); - - return map; - } - - @Ensures("result > 0") - private final int getNQualsInHistogram() { - return nObservationsPerQual.size(); - } - - /** - * Write out a GATKReport to visualize the QualQuantization process of this data - * @param out - */ - public void writeReport(PrintStream out) { - final GATKReport report = new GATKReport(); - - addQualHistogramToReport(report); - addIntervalsToReport(report); - - report.print(out); - } - - private final void addQualHistogramToReport(final GATKReport report) { - report.addTable("QualHistogram", "Quality score histogram provided to report", 2); - GATKReportTable table = report.getTable("QualHistogram"); - - table.addColumn("qual"); - table.addColumn("count"); - - for ( int q = 0; q < nObservationsPerQual.size(); q++ ) { - table.set(q, "qual", q); - table.set(q, "count", nObservationsPerQual.get(q)); - } - } - - - private final void addIntervalsToReport(final GATKReport report) { - report.addTable("QualQuantizerIntervals", "Table of QualQuantizer quantization intervals", 10); - GATKReportTable table = report.getTable("QualQuantizerIntervals"); - - table.addColumn("name"); - table.addColumn("qStart"); - table.addColumn("qEnd"); - table.addColumn("level"); - table.addColumn("merge.order"); - table.addColumn("nErrors"); - table.addColumn("nObservations"); - table.addColumn("qual"); - table.addColumn("penalty"); - table.addColumn("root.node"); - 
//table.addColumn("subintervals", "NA"); - - for ( QualInterval interval : quantizedIntervals ) - addIntervalToReport(table, interval, true); - } - - private final void addIntervalToReport(final GATKReportTable table, final QualInterval interval, final boolean atRootP) { - final String name = interval.getName(); - table.set(name, "name", name); - table.set(name, "qStart", interval.qStart); - table.set(name, "qEnd", interval.qEnd); - table.set(name, "level", interval.level); - table.set(name, "merge.order", interval.mergeOrder); - table.set(name, "nErrors", interval.nErrors); - table.set(name, "nObservations", interval.nObservations); - table.set(name, "qual", interval.getQual()); - table.set(name, "penalty", String.format("%.1f", interval.getPenalty())); - table.set(name, "root.node", atRootP); - - for ( final QualInterval sub : interval.subIntervals ) - addIntervalToReport(table, sub, false); - } - - public List getOriginalToQuantizedMap() { - return originalToQuantizedMap; - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QuantizationInfo.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QuantizationInfo.java deleted file mode 100644 index 001643b07..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/QuantizationInfo.java +++ /dev/null @@ -1,151 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; - -import java.util.Arrays; -import java.util.List; - -/** - * Class that encapsulates the information necessary for quality score quantization for BQSR - * - * @author carneiro - * @since 3/26/12 - */ -public class QuantizationInfo { - private List quantizedQuals; - private List empiricalQualCounts; - private int quantizationLevels; - - private QuantizationInfo(List quantizedQuals, List empiricalQualCounts, int quantizationLevels) { - this.quantizedQuals = quantizedQuals; - this.empiricalQualCounts = empiricalQualCounts; - this.quantizationLevels = quantizationLevels; - } - - public QuantizationInfo(List quantizedQuals, List empiricalQualCounts) { - this(quantizedQuals, empiricalQualCounts, calculateQuantizationLevels(quantizedQuals)); - } - - public QuantizationInfo(final RecalibrationTables recalibrationTables, final int quantizationLevels) { - final Long [] qualHistogram = new Long[QualityUtils.MAX_SAM_QUAL_SCORE +1]; // create a histogram with the empirical quality distribution - for (int i = 0; i < qualHistogram.length; i++) - qualHistogram[i] = 0L; - - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); // get the quality score table - - for (final RecalDatum value : qualTable.getAllValues()) { - final RecalDatum datum = value; - final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) - qualHistogram[empiricalQual] += (long) datum.getNumObservations(); // add the number of observations for every key - } - empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities - 
quantizeQualityScores(quantizationLevels); - - this.quantizationLevels = quantizationLevels; - } - - - public void quantizeQualityScores(int nLevels) { - QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels - quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC) - } - - public void noQuantization() { - this.quantizationLevels = QualityUtils.MAX_SAM_QUAL_SCORE; - for (int i = 0; i < this.quantizationLevels; i++) - quantizedQuals.set(i, (byte) i); - } - - public List getQuantizedQuals() { - return quantizedQuals; - } - - public int getQuantizationLevels() { - return quantizationLevels; - } - - public GATKReportTable generateReportTable(boolean sortByCols) { - GATKReportTable quantizedTable; - if(sortByCols) { - quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); - } else { - quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3); - } - quantizedTable.addColumn(RecalUtils.QUALITY_SCORE_COLUMN_NAME); - quantizedTable.addColumn(RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); - quantizedTable.addColumn(RecalUtils.QUANTIZED_VALUE_COLUMN_NAME); - - for (int qual = 0; qual <= QualityUtils.MAX_SAM_QUAL_SCORE; qual++) { - quantizedTable.set(qual, RecalUtils.QUALITY_SCORE_COLUMN_NAME, qual); - quantizedTable.set(qual, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual)); - quantizedTable.set(qual, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual)); - } - return quantizedTable; - } - - private static int calculateQuantizationLevels(List quantizedQuals) { - byte lastByte = -1; - int quantizationLevels = 0; - for (byte q : quantizedQuals) { - if (q != lastByte) { - quantizationLevels++; - 
lastByte = q; - } - } - return quantizationLevels; - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariates.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariates.java deleted file mode 100644 index e2aed8b48..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariates.java +++ /dev/null @@ -1,175 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.LRUCache; - -/** - * The object temporarily held by a read that describes all of it's covariates. - * - * In essence, this is an array of CovariateValues, but it also has some functionality to deal with the optimizations of the NestedHashMap - * - * @author Mauricio Carneiro - * @since 2/8/12 - */ -public class ReadCovariates { - private final static Logger logger = Logger.getLogger(ReadCovariates.class); - - /** - * How big should we let the LRU cache grow - */ - private static final int LRU_CACHE_SIZE = 500; - - /** - * Use an LRU cache to keep cache of keys (int[][][]) arrays for each read length we've seen. - * The cache allows us to avoid the expense of recreating these arrays for every read. The LRU - * keeps the total number of cached arrays to less than LRU_CACHE_SIZE. - * - * This is a thread local variable, so the total memory required may grow to N_THREADS x LRU_CACHE_SIZE - */ - private final static ThreadLocal> keysCache = new ThreadLocal>() { - @Override protected LRUCache initialValue() { - return new LRUCache(LRU_CACHE_SIZE); - } - }; - - /** - * The keys cache is only valid for a single covariate count. 
Normally this will remain constant for the analysis. - * If running multiple analyses (or the unit test suite), it's necessary to clear the cache. - */ - public static void clearKeysCache() { - keysCache.remove(); - } - - /** - * Our keys, indexed by event type x read length x covariate - */ - private final int[][][] keys; - - /** - * The index of the current covariate, used by addCovariate - */ - private int currentCovariateIndex = 0; - - public ReadCovariates(final int readLength, final int numberOfCovariates) { - final LRUCache cache = keysCache.get(); - final int[][][] cachedKeys = cache.get(readLength); - if ( cachedKeys == null ) { - // There's no cached value for read length so we need to create a new int[][][] array - if ( logger.isDebugEnabled() ) logger.debug("Keys cache miss for length " + readLength + " cache size " + cache.size()); - keys = new int[EventType.values().length][readLength][numberOfCovariates]; - cache.put(readLength, keys); - } else { - keys = cachedKeys; - } - } - - public void setCovariateIndex(final int index) { - currentCovariateIndex = index; - } - - /** - * Update the keys for mismatch, insertion, and deletion for the current covariate at read offset - * - * NOTE: no checks are performed on the number of covariates, for performance reasons. If the count increases - * after the keysCache has been accessed, this method will throw an ArrayIndexOutOfBoundsException. This currently - * only occurs in the testing harness, and we don't anticipate that it will become a part of normal runs. 
- * - * @param mismatch the mismatch key value - * @param insertion the insertion key value - * @param deletion the deletion key value - * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates - */ - public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { - keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch; - keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion; - keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion; - } - - /** - * Get the keys for all covariates at read position for error model - * - * @param readPosition - * @param errorModel - * @return - */ - public int[] getKeySet(final int readPosition, final EventType errorModel) { - return keys[errorModel.ordinal()][readPosition]; - } - - public int[][] getKeySet(final EventType errorModel) { - return keys[errorModel.ordinal()]; - } - - // ---------------------------------------------------------------------- - // - // routines for testing - // - // ---------------------------------------------------------------------- - - protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); } - protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); } - protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); } - - protected int[] getMismatchesKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); - } - - protected int[] getInsertionsKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_INSERTION); - } - - protected int[] getDeletionsKeySet(final int readPosition) { - return getKeySet(readPosition, EventType.BASE_DELETION); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatum.java 
b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatum.java deleted file mode 100644 index 6cfc435f7..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatum.java +++ /dev/null @@ -1,434 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import htsjdk.samtools.SAMUtils; -import org.apache.commons.math.optimization.fitting.GaussianFunction; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.QualityUtils; - - -/** - * An individual piece of recalibration data. Each bin counts up the number of observations and the number - * of reference mismatches seen for that combination of covariates. - * - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 3, 2009 - */ -@Invariant({ - "estimatedQReported >= 0.0", - "! Double.isNaN(estimatedQReported)", - "! Double.isInfinite(estimatedQReported)", - "empiricalQuality >= 0.0 || empiricalQuality == UNINITIALIZED", - "! Double.isNaN(empiricalQuality)", - "! Double.isInfinite(empiricalQuality)", - "numObservations >= 0", - "numMismatches >= 0", - "numMismatches <= numObservations" -}) -public class RecalDatum { - public final static byte MAX_RECALIBRATED_Q_SCORE = SAMUtils.MAX_PHRED_SCORE; - private static final double UNINITIALIZED = -1.0; - - /** - * estimated reported quality score based on combined data's individual q-reporteds and number of observations - */ - private double estimatedQReported; - - /** - * the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) - */ - private double empiricalQuality; - - /** - * number of bases seen in total - */ - private long numObservations; - - /** - * number of bases seen that didn't match the reference - */ - private double numMismatches; - - /** - * used when calculating empirical qualities to avoid division by zero - */ - private static final int SMOOTHING_CONSTANT = 1; - - //--------------------------------------------------------------------------------------------------------------- - // - // constructors - // - 
//--------------------------------------------------------------------------------------------------------------- - - /** - * Create a new RecalDatum with given observation and mismatch counts, and an reported quality - * - * @param _numObservations observations - * @param _numMismatches mismatches - * @param reportedQuality Qreported - */ - public RecalDatum(final long _numObservations, final double _numMismatches, final byte reportedQuality) { - if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); - if ( _numMismatches < 0.0 ) throw new IllegalArgumentException("numMismatches < 0"); - if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0"); - - numObservations = _numObservations; - numMismatches = _numMismatches; - estimatedQReported = reportedQuality; - empiricalQuality = UNINITIALIZED; - } - - /** - * Copy copy into this recal datum, overwriting all of this objects data - * @param copy RecalDatum to copy - */ - public RecalDatum(final RecalDatum copy) { - this.numObservations = copy.getNumObservations(); - this.numMismatches = copy.getNumMismatches(); - this.estimatedQReported = copy.estimatedQReported; - this.empiricalQuality = copy.empiricalQuality; - } - - /** - * Add in all of the data from other into this object, updating the reported quality from the expected - * error rate implied by the two reported qualities - * - * @param other RecalDatum to combine - */ - public synchronized void combine(final RecalDatum other) { - final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); - increment(other.getNumObservations(), other.getNumMismatches()); - estimatedQReported = -10 * Math.log10(sumErrors / getNumObservations()); - empiricalQuality = UNINITIALIZED; - } - - public synchronized void setEstimatedQReported(final double estimatedQReported) { - if ( estimatedQReported < 0 ) throw new IllegalArgumentException("estimatedQReported < 0"); - if ( 
Double.isInfinite(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is infinite"); - if ( Double.isNaN(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is NaN"); - - this.estimatedQReported = estimatedQReported; - empiricalQuality = UNINITIALIZED; - } - - public final double getEstimatedQReported() { - return estimatedQReported; - } - public final byte getEstimatedQReportedAsByte() { - return (byte)(int)(Math.round(getEstimatedQReported())); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // Empirical quality score -- derived from the num mismatches and observations - // - //--------------------------------------------------------------------------------------------------------------- - - /** - * Returns the error rate (in real space) of this interval, or 0 if there are no observations - * @return the empirical error rate ~= N errors / N obs - */ - @Ensures({"result >= 0.0"}) - public double getEmpiricalErrorRate() { - if ( numObservations == 0 ) - return 0.0; - else { - // cache the value so we don't call log over and over again - final double doubleMismatches = numMismatches + SMOOTHING_CONSTANT; - // smoothing is one error and one non-error observation, for example - final double doubleObservations = numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; - return doubleMismatches / doubleObservations; - } - } - - public synchronized void setEmpiricalQuality(final double empiricalQuality) { - if ( empiricalQuality < 0 ) throw new IllegalArgumentException("empiricalQuality < 0"); - if ( Double.isInfinite(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is infinite"); - if ( Double.isNaN(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is NaN"); - - this.empiricalQuality = empiricalQuality; - } - - public final double getEmpiricalQuality() { - return 
getEmpiricalQuality(getEstimatedQReported()); - } - - public synchronized final double getEmpiricalQuality(final double conditionalPrior) { - if (empiricalQuality == UNINITIALIZED) { - calcEmpiricalQuality(conditionalPrior); - } - return empiricalQuality; - } - - public final byte getEmpiricalQualityAsByte() { - return (byte)(Math.round(getEmpiricalQuality())); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // toString methods - // - //--------------------------------------------------------------------------------------------------------------- - - @Override - public String toString() { - return String.format("%d,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); - } - - public String stringForCSV() { - return String.format("%s,%.2f,%.2f", toString(), getEstimatedQReported(), getEmpiricalQuality() - getEstimatedQReported()); - } - - //--------------------------------------------------------------------------------------------------------------- - // - // increment methods - // - //--------------------------------------------------------------------------------------------------------------- - - public final long getNumObservations() { - return numObservations; - } - - public final synchronized void setNumObservations(final long numObservations) { - if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); - this.numObservations = numObservations; - empiricalQuality = UNINITIALIZED; - } - - public final double getNumMismatches() { - return numMismatches; - } - - @Requires({"numMismatches >= 0"}) - public final synchronized void setNumMismatches(final double numMismatches) { - if ( numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); - this.numMismatches = numMismatches; - empiricalQuality = UNINITIALIZED; - } - - @Requires({"by >= 0"}) - public final synchronized void incrementNumObservations(final long by) 
{ - numObservations += by; - empiricalQuality = UNINITIALIZED; - } - - @Requires({"by >= 0"}) - public final synchronized void incrementNumMismatches(final double by) { - numMismatches += by; - empiricalQuality = UNINITIALIZED; - } - - @Requires({"incObservations >= 0", "incMismatches >= 0"}) - @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"}) - public final synchronized void increment(final long incObservations, final double incMismatches) { - numObservations += incObservations; - numMismatches += incMismatches; - empiricalQuality = UNINITIALIZED; - } - - @Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"}) - public final synchronized void increment(final boolean isError) { - increment(1, isError ? 1.0 : 0.0); - } - - // ------------------------------------------------------------------------------------- - // - // Private implementation helper functions - // - // ------------------------------------------------------------------------------------- - - /** - * calculate the expected number of errors given the estimated Q reported and the number of observations - * in this datum. 
- * - * @return a positive (potentially fractional) estimate of the number of errors - */ - @Ensures("result >= 0.0") - private double calcExpectedErrors() { - return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); - } - - /** - * Calculate and cache the empirical quality score from mismatches and observations (expensive operation) - */ - @Requires("empiricalQuality == UNINITIALIZED") - @Ensures("empiricalQuality != UNINITIALIZED") - private synchronized void calcEmpiricalQuality(final double conditionalPrior) { - - // smoothing is one error and one non-error observation - final long mismatches = (long)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT; - final long observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; - - final double empiricalQual = RecalDatum.bayesianEstimateOfEmpiricalQuality(observations, mismatches, conditionalPrior); - - // This is the old and busted point estimate approach: - //final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); - - empiricalQuality = Math.min(empiricalQual, (double) MAX_RECALIBRATED_Q_SCORE); - } - - //static final boolean DEBUG = false; - static private final double RESOLUTION_BINS_PER_QUAL = 1.0; - - static public double bayesianEstimateOfEmpiricalQuality(final long nObservations, final long nErrors, final double QReported) { - - final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL; - - final double[] log10Posteriors = new double[numBins]; - - for ( int bin = 0; bin < numBins; bin++ ) { - - final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL; - - log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10QempLikelihood(QEmpOfBin, nObservations, nErrors); - - //if ( DEBUG ) - // System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); - } - - //if ( DEBUG ) - // 
System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors)); - - final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors); - final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors); - - final double Qemp = MLEbin / RESOLUTION_BINS_PER_QUAL; - return Qemp; - } - - /** - * Quals above this value should be capped down to this value (because they are too high) - * in the base quality score recalibrator - */ - public final static byte MAX_GATK_USABLE_Q_SCORE = 40; - static private final double[] log10QempPriorCache = new double[MAX_GATK_USABLE_Q_SCORE + 1]; - static { - // f(x) = a + b*exp(-((x - c)^2 / (2*d^2))) - // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell". - final double GF_a = 0.0; - final double GF_b = 0.9; - final double GF_c = 0.0; - final double GF_d = 0.5; // with these parameters, deltas can shift at most ~20 Q points - - final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d); - for ( int i = 0; i <= MAX_GATK_USABLE_Q_SCORE; i++ ) { - double log10Prior = Math.log10(gaussian.value((double) i)); - if ( Double.isInfinite(log10Prior) ) - log10Prior = -Double.MAX_VALUE; - log10QempPriorCache[i] = log10Prior; - } - } - - static protected double log10QempPrior(final double Qempirical, final double Qreported) { - final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), MAX_GATK_USABLE_Q_SCORE); - //if ( DEBUG ) - // System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); - return log10QempPriorCache[difference]; - } - - static private final long MAX_NUMBER_OF_OBSERVATIONS = Integer.MAX_VALUE - 1; - - static protected double log10QempLikelihood(final double Qempirical, long nObservations, long nErrors) { - if ( nObservations == 0 ) - return 0.0; - - // the binomial code requires ints as 
input (because it does caching). This should theoretically be fine because - // there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow - // before casting down to an int. - if ( nObservations > MAX_NUMBER_OF_OBSERVATIONS ) { - // we need to decrease nErrors by the same fraction that we are decreasing nObservations - final double fraction = (double)MAX_NUMBER_OF_OBSERVATIONS / (double)nObservations; - nErrors = Math.round((double)nErrors * fraction); - nObservations = MAX_NUMBER_OF_OBSERVATIONS; - } - - // this is just a straight binomial PDF - double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10(Qempirical)); - if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) ) - log10Prob = -Double.MAX_VALUE; - - //if ( DEBUG ) - // System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); - - return log10Prob; - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumNode.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumNode.java deleted file mode 100644 index f3759cdb7..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumNode.java +++ /dev/null @@ -1,582 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.commons.math.MathException; -import org.apache.commons.math.stat.inference.ChiSquareTestImpl; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.Collection; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.Set; - -/** - * A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one - * - * @author Mark DePristo - * @since 07/27/12 - */ -public class RecalDatumNode { - private final static double SMALLEST_CHI2_PVALUE = 1e-300; - protected static final Logger logger = Logger.getLogger(RecalDatumNode.class); - - /** - * fixedPenalty is this value if it's considered fixed - */ - private final static double UNINITIALIZED = Double.NEGATIVE_INFINITY; - - private final T recalDatum; - private double fixedPenalty = UNINITIALIZED; - private final Set> subnodes; - - @Requires({"recalDatum != null"}) - public RecalDatumNode(final T recalDatum) { - this(recalDatum, new HashSet>()); - } - - @Override - public String toString() { - return recalDatum.toString(); - } - - @Requires({"recalDatum != null", "subnodes != null"}) - public RecalDatumNode(final T recalDatum, final Set> subnodes) { - this(recalDatum, UNINITIALIZED, subnodes); - } - - @Requires({"recalDatum != null"}) - protected RecalDatumNode(final T recalDatum, final double fixedPenalty) { - this(recalDatum, fixedPenalty, new HashSet>()); - } - - @Requires({"recalDatum != null", "subnodes != null"}) - protected RecalDatumNode(final T recalDatum, final double fixedPenalty, final Set> subnodes) { - this.recalDatum = recalDatum; - this.fixedPenalty = fixedPenalty; - this.subnodes = new HashSet>(subnodes); - } - - /** - * Get the recal data associated with this 
node - * @return - */ - @Ensures("result != null") - public T getRecalDatum() { - return recalDatum; - } - - /** - * The set of all subnodes of this tree. May be modified. - * @return - */ - @Ensures("result != null") - public Set> getSubnodes() { - return subnodes; - } - - /** - * Return the fixed penalty, if set, or else the the calculated penalty for this node - * @return - */ - public double getPenalty() { - if ( fixedPenalty != UNINITIALIZED ) - return fixedPenalty; - else - return calcPenalty(); - } - - /** - * Set the fixed penalty for this node to a fresh calculation from calcPenalty - * - * This is important in the case where you want to compute the penalty from a full - * tree and then chop the tree up afterwards while considering the previous penalties. - * If you don't call this function then manipulating the tree may result in the - * penalty functions changing with changes in the tree. - * - * @param doEntireTree recurse into all subnodes? - * @return the fixed penalty for this node - */ - public double calcAndSetFixedPenalty(final boolean doEntireTree) { - fixedPenalty = calcPenalty(); - if ( doEntireTree ) - for ( final RecalDatumNode sub : subnodes ) - sub.calcAndSetFixedPenalty(doEntireTree); - return fixedPenalty; - } - - /** - * Add node to the set of subnodes of this node - * @param sub - */ - @Requires("sub != null") - public void addSubnode(final RecalDatumNode sub) { - subnodes.add(sub); - } - - /** - * Is this a leaf node (i.e., has no subnodes)? - * @return - */ - public boolean isLeaf() { - return subnodes.isEmpty(); - } - - /** - * Is this node immediately above only leaf nodes? - * - * @return - */ - public boolean isAboveOnlyLeaves() { - for ( final RecalDatumNode sub : subnodes ) - if ( ! sub.isLeaf() ) - return false; - return true; - } - - /** - * What's the immediate number of subnodes from this node? 
- * @return - */ - @Ensures("result >= 0") - public int getNumSubnodes() { - return subnodes.size(); - } - - /** - * Total penalty is the sum of leaf node penalties - * - * This algorithm assumes that penalties have been fixed before pruning, as leaf nodes by - * definition have 0 penalty unless they represent a pruned tree with underlying -- but now - * pruned -- subtrees - * - * @return - */ - public double totalPenalty() { - if ( isLeaf() ) - return getPenalty(); - else { - double sum = 0.0; - for ( final RecalDatumNode sub : subnodes ) - sum += sub.totalPenalty(); - return sum; - } - } - - /** - * The maximum penalty among all nodes - * @return - */ - public double maxPenalty(final boolean leafOnly) { - double max = ! leafOnly || isLeaf() ? getPenalty() : Double.MIN_VALUE; - for ( final RecalDatumNode sub : subnodes ) - max = Math.max(max, sub.maxPenalty(leafOnly)); - return max; - } - - /** - * The minimum penalty among all nodes - * @return - */ - public double minPenalty(final boolean leafOnly) { - double min = ! leafOnly || isLeaf() ? getPenalty() : Double.MAX_VALUE; - for ( final RecalDatumNode sub : subnodes ) - min = Math.min(min, sub.minPenalty(leafOnly)); - return min; - } - - /** - * What's the longest branch from this node to any leaf? - * @return - */ - public int maxDepth() { - int subMax = 0; - for ( final RecalDatumNode sub : subnodes ) - subMax = Math.max(subMax, sub.maxDepth()); - return subMax + 1; - } - - /** - * What's the shortest branch from this node to any leaf? 
Includes this node - * @return - */ - @Ensures("result > 0") - public int minDepth() { - if ( isLeaf() ) - return 1; - else { - int subMin = Integer.MAX_VALUE; - for ( final RecalDatumNode sub : subnodes ) - subMin = Math.min(subMin, sub.minDepth()); - return subMin + 1; - } - } - - /** - * Return the number of nodes, including this one, reachable from this node - * @return - */ - @Ensures("result > 0") - public int size() { - int size = 1; - for ( final RecalDatumNode sub : subnodes ) - size += sub.size(); - return size; - } - - /** - * Count the number of leaf nodes reachable from this node - * - * @return - */ - @Ensures("result >= 0") - public int numLeaves() { - if ( isLeaf() ) - return 1; - else { - int size = 0; - for ( final RecalDatumNode sub : subnodes ) - size += sub.numLeaves(); - return size; - } - } - - /** - * Calculate the phred-scaled p-value for a chi^2 test for independent among subnodes of this node. - * - * The chi^2 value indicates the degree of independence of the implied error rates among the - * immediate subnodes - * - * @return the phred-scaled p-value for chi2 penalty, or 0.0 if it cannot be calculated - */ - private double calcPenalty() { - if ( isLeaf() || freeToMerge() ) - return 0.0; - else if ( subnodes.size() == 1 ) - // only one value, so its free to merge away - return 0.0; - else { - final long[][] counts = new long[subnodes.size()][2]; - - int i = 0; - for ( final RecalDatumNode subnode : subnodes ) { - // use the yates correction to help avoid all zeros => NaN - counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L; - counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2L; - i++; - } - - try { - final double chi2PValue = new ChiSquareTestImpl().chiSquareTest(counts); - final double penalty = -10.0 * Math.log10(Math.max(chi2PValue, SMALLEST_CHI2_PVALUE)); - - // make sure things are reasonable and fail early if not - if (Double.isInfinite(penalty) || Double.isNaN(penalty)) - throw new 
ReviewedGATKException("chi2 value is " + chi2PValue + " at " + getRecalDatum()); - - return penalty; - } catch ( MathException e ) { - throw new ReviewedGATKException("Failed in calculating chi2 value", e); - } - } - } - - /** - * Is this node free to merge because its rounded Q score is the same as all nodes below - * @return - */ - private boolean freeToMerge() { - if ( isLeaf() ) // leaves are free to merge - return true; - else { - final byte myQual = getRecalDatum().getEmpiricalQualityAsByte(); - for ( final RecalDatumNode sub : subnodes ) - if ( sub.getRecalDatum().getEmpiricalQualityAsByte() != myQual ) - return false; - return true; - } - } - - /** - * Calculate the penalty of this interval, given the overall error rate for the interval - * - * If the globalErrorRate is e, this value is: - * - * sum_i |log10(e_i) - log10(e)| * nObservations_i - * - * each the index i applies to all leaves of the tree accessible from this interval - * (found recursively from subnodes as necessary) - * - * @param globalErrorRate overall error rate in real space against which we calculate the penalty - * @return the cost of approximating the bins in this interval with the globalErrorRate - */ - @Requires("globalErrorRate >= 0.0") - @Ensures("result >= 0.0") - private double calcPenaltyLog10(final double globalErrorRate) { - if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty - return 0.0; - - if ( isLeaf() ) { - // this is leave node - return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * (double)recalDatum.getNumObservations(); - // TODO -- how we can generalize this calculation? 
-// if ( this.qEnd <= minInterestingQual ) -// // It's free to merge up quality scores below the smallest interesting one -// return 0; -// else { -// return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations(); -// } - } else { - double sum = 0; - for ( final RecalDatumNode hrd : subnodes) - sum += hrd.calcPenaltyLog10(globalErrorRate); - return sum; - } - } - - /** - * Return a freshly allocated tree prunes to have no more than maxDepth from the root to any leaf - * - * @param maxDepth - * @return - */ - public RecalDatumNode pruneToDepth(final int maxDepth) { - if ( maxDepth < 1 ) - throw new IllegalArgumentException("maxDepth < 1"); - else { - final Set> subPruned = new HashSet>(getNumSubnodes()); - if ( maxDepth > 1 ) - for ( final RecalDatumNode sub : subnodes ) - subPruned.add(sub.pruneToDepth(maxDepth - 1)); - return new RecalDatumNode(getRecalDatum(), fixedPenalty, subPruned); - } - } - - /** - * Return a freshly allocated tree with to no more than maxElements in order of penalty - * - * Note that nodes must have fixed penalties to this algorithm will fail. - * - * @param maxElements - * @return - */ - public RecalDatumNode pruneByPenalty(final int maxElements) { - RecalDatumNode root = this; - - while ( root.size() > maxElements ) { - // remove the lowest penalty element, and continue - root = root.removeLowestPenaltyNode(); - } - - // our size is below the target, so we are good, return - return root; - } - - /** - * Return a freshly allocated tree where all mergable nodes with < maxPenalty are merged - * - * Note that nodes must have fixed penalties to this algorithm will fail. - * - * @param maxPenaltyIn the maximum penalty we are allowed to incur for a merge - * @param applyBonferroniCorrection if true, we will adjust penalty by the phred-scaled bonferroni correction - * for the size of the initial tree. 
That is, if there are 10 nodes in the - * tree and maxPenalty is 20 we will actually enforce 10^-2 / 10 = 10^-3 = 30 - * penalty for multiple testing - * @return - */ - public RecalDatumNode pruneToNoMoreThanPenalty(final double maxPenaltyIn, final boolean applyBonferroniCorrection) { - RecalDatumNode root = this; - - final double bonferroniCorrection = 10 * Math.log10(this.size()); - final double maxPenalty = applyBonferroniCorrection ? maxPenaltyIn + bonferroniCorrection : maxPenaltyIn; - - if ( applyBonferroniCorrection ) - logger.info(String.format("Applying Bonferroni correction for %d nodes = %.2f to initial penalty %.2f for total " + - "corrected max penalty of %.2f", this.size(), bonferroniCorrection, maxPenaltyIn, maxPenalty)); - - while ( true ) { - final Pair, Double> minPenaltyNode = root.getMinPenaltyAboveLeafNode(); - - if ( minPenaltyNode == null || minPenaltyNode.getSecond() > maxPenalty ) { - // nothing to merge, or the best candidate is above our max allowed - if ( minPenaltyNode == null ) { - if ( logger.isDebugEnabled() ) logger.debug("Stopping because no candidates could be found"); - } else { - if ( logger.isDebugEnabled() ) logger.debug("Stopping because node " + minPenaltyNode.getFirst() + " has penalty " + minPenaltyNode.getSecond() + " > max " + maxPenalty); - } - break; - } else { - // remove the lowest penalty element, and continue - if ( logger.isDebugEnabled() ) logger.debug("Removing node " + minPenaltyNode.getFirst() + " with penalty " + minPenaltyNode.getSecond()); - root = root.removeLowestPenaltyNode(); - } - } - - // no more candidates exist with penalty < maxPenalty - return root; - } - - - /** - * Find the lowest penalty above leaf node in the tree, and return a tree without it - * - * Note this excludes the current (root) node - * - * @return - */ - private RecalDatumNode removeLowestPenaltyNode() { - final Pair, Double> nodeToRemove = getMinPenaltyAboveLeafNode(); - if ( logger.isDebugEnabled() ) - logger.debug("Removing " + 
nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond()); - - final Pair, Boolean> result = removeNode(nodeToRemove.getFirst()); - - if ( ! result.getSecond() ) - throw new IllegalStateException("Never removed any node!"); - - final RecalDatumNode oneRemoved = result.getFirst(); - if ( oneRemoved == null ) - throw new IllegalStateException("Removed our root node, wow, didn't expect that"); - return oneRemoved; - } - - /** - * Finds in the tree the node with the lowest penalty whose subnodes are all leaves - * - * @return the node and its penalty, or null if no such node exists - */ - private Pair, Double> getMinPenaltyAboveLeafNode() { - if ( isLeaf() ) - // not allowed to remove leafs directly - return null; - if ( isAboveOnlyLeaves() ) - // we only consider removing nodes above all leaves - return new Pair, Double>(this, getPenalty()); - else { - // just recurse, taking the result with the min penalty of all subnodes - Pair, Double> minNode = null; - for ( final RecalDatumNode sub : subnodes ) { - final Pair, Double> subFind = sub.getMinPenaltyAboveLeafNode(); - if ( subFind != null && (minNode == null || subFind.getSecond() < minNode.getSecond()) ) { - minNode = subFind; - } - } - return minNode; - } - } - - /** - * Return a freshly allocated tree without the node nodeToRemove - * - * @param nodeToRemove - * @return - */ - private Pair, Boolean> removeNode(final RecalDatumNode nodeToRemove) { - if ( this == nodeToRemove ) { - if ( isLeaf() ) - throw new IllegalStateException("Trying to remove a leaf node from the tree! " + this + " " + nodeToRemove); - // node is the thing we are going to remove, but without any subnodes - final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty); - return new Pair, Boolean>(node, true); - } else { - // did we remove something in a sub branch? 
- boolean removedSomething = false; - - // our sub nodes with the penalty node removed - final Set> sub = new HashSet>(getNumSubnodes()); - - for ( final RecalDatumNode sub1 : subnodes ) { - if ( removedSomething ) { - // already removed something, just add sub1 back to sub - sub.add(sub1); - } else { - // haven't removed anything yet, so try - final Pair, Boolean> maybeRemoved = sub1.removeNode(nodeToRemove); - removedSomething = maybeRemoved.getSecond(); - sub.add(maybeRemoved.getFirst()); - } - } - - final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty, sub); - return new Pair, Boolean>(node, removedSomething); - } - } - - /** - * Return a collection of all of the data in the leaf nodes of this tree - * - * @return - */ - public Collection getAllLeaves() { - final LinkedList list = new LinkedList(); - getAllLeavesRec(list); - return list; - } - - /** - * Helpful recursive function for getAllLeaves() - * - * @param list the destination for the list of leaves - */ - private void getAllLeavesRec(final LinkedList list) { - if ( isLeaf() ) - list.add(getRecalDatum()); - else { - for ( final RecalDatumNode sub : subnodes ) - sub.getAllLeavesRec(list); - } - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalUtils.java deleted file mode 100644 index 9c5739466..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalUtils.java +++ /dev/null @@ -1,1097 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.classloader.JVMUtils; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.R.RScriptExecutor; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.io.Resource; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; - -import java.io.*; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 6, 2009 - * - * This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions. - * It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias. - * This class holds the parsing methods that are shared between BaseRecalibrator and PrintReads. 
- */ - -public class RecalUtils { - public final static String ARGUMENT_REPORT_TABLE_TITLE = "Arguments"; - public final static String QUANTIZED_REPORT_TABLE_TITLE = "Quantized"; - public final static String READGROUP_REPORT_TABLE_TITLE = "RecalTable0"; - public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1"; - public final static String ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2"; - - public final static String ARGUMENT_COLUMN_NAME = "Argument"; - public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value"; - public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore"; - public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count"; - public final static String READGROUP_COLUMN_NAME = "ReadGroup"; - public final static String EVENT_TYPE_COLUMN_NAME = "EventType"; - public final static String EMPIRICAL_QUALITY_COLUMN_NAME = "EmpiricalQuality"; - public final static String ESTIMATED_Q_REPORTED_COLUMN_NAME = "EstimatedQReported"; - public final static String QUALITY_SCORE_COLUMN_NAME = "QualityScore"; - public final static String COVARIATE_VALUE_COLUMN_NAME = "CovariateValue"; - public final static String COVARIATE_NAME_COLUMN_NAME = "CovariateName"; - public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations"; - public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors"; - - private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams - private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color - private static boolean warnUserNullPlatform = false; - - private static final String SCRIPT_FILE = "BQSR.R"; - - private static final Pair covariateValue = new Pair(RecalUtils.COVARIATE_VALUE_COLUMN_NAME, "%s"); - private static final Pair covariateName = new Pair(RecalUtils.COVARIATE_NAME_COLUMN_NAME, "%s"); - 
private static final Pair eventType = new Pair(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s"); - private static final Pair empiricalQuality = new Pair(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); - private static final Pair estimatedQReported = new Pair(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); - private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); - private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f"); - - /** - * Generates two lists : required covariates and optional covariates based on the user's requests. - * - * Performs the following tasks in order: - * 1. Adds all requierd covariates in order - * 2. Check if the user asked to use the standard covariates and adds them all if that's the case - * 3. Adds all covariates requested by the user that were not already added by the two previous steps - * - * @param argumentCollection the argument collection object for the recalibration walker - * @return a pair of ordered lists : required covariates (first) and optional covariates (second) - */ - public static Pair, ArrayList> initializeCovariates(RecalibrationArgumentCollection argumentCollection) { - final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); - final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); - final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); - - final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates - ArrayList optionalCovariates = new ArrayList(); - if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES) - optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user - - // parse the -cov arguments that were provided, skipping over the ones already specified - if (argumentCollection.COVARIATES != null) { - for (String 
requestedCovariateString : argumentCollection.COVARIATES) { - // help the transition from BQSR v1 to BQSR v2 - if ( requestedCovariateString.equals("DinucCovariate") ) - throw new UserException.CommandLineException("DinucCovariate has been retired. Please use its successor covariate " + - "ContextCovariate instead, which includes the 2 bp (dinuc) substitution model of the retired DinucCovariate " + - "as well as an indel context to model the indel error rates"); - - boolean foundClass = false; - for (Class covClass : covariateClasses) { - if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class - foundClass = true; - if (!requiredClasses.contains(covClass) && - (argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { - try { - final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it - optionalCovariates.add(covariate); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - } - } - } - - if (!foundClass) { - throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates."); - } - } - } - return new Pair, ArrayList>(requiredCovariates, optionalCovariates); - } - - /** - * Adds the required covariates to a covariate list - * - * Note: this method really only checks if the classes object has the expected number of required covariates, then add them by hand. 
- * - * @param classes list of classes to add to the covariate list - * @return the covariate list - */ - private static ArrayList addRequiredCovariatesToList(List> classes) { - ArrayList dest = new ArrayList(classes.size()); - if (classes.size() != 2) - throw new ReviewedGATKException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected"); - - dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. - dest.add(new QualityScoreCovariate()); - return dest; - } - - /** - * Adds the standard covariates to a covariate list - * - * @param classes list of classes to add to the covariate list - * @return the covariate list - */ - private static ArrayList addStandardCovariatesToList(List> classes) { - ArrayList dest = new ArrayList(classes.size()); - for (Class covClass : classes) { - try { - final Covariate covariate = (Covariate) covClass.newInstance(); - dest.add(covariate); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - } - return dest; - } - - /** - * Print a list of all available covariates to logger as info - * - * @param logger - */ - public static void listAvailableCovariates(final Logger logger) { - logger.info("Available covariates:"); - for (final Class covClass : new PluginManager(Covariate.class).getPlugins()) { - logger.info(String.format("\t%30s\t%s", covClass.getSimpleName(), JVMUtils.classInterfaces(covClass))); - } - } - - /** - * Component used to print out csv representation of the reports that can be use to perform analysis in - * external tools. E.g. generate plots using R scripts. - *

- * A header is always printed into the output stream (or file) when the printer is created. Then you only need - * to call {@link #print(RecalibrationReport,String) print} for each report you want to include in the csv file. - * Once finished, you close the printer calling {@link #close() close} - * - */ - private static class CsvPrinter { - - private final PrintStream ps; - private final Covariate[] covariates; - - /** - * Constructs a printer redirected to an output file. - * @param out the output file. - * @param c covariates to print out. - * @throws FileNotFoundException if the file could not be created anew. - */ - protected CsvPrinter(final File out, final Covariate ... c) - throws FileNotFoundException { - this(new FileOutputStream(out), c); - } - - /** - * Constructs a printer redirected to an output stream - * @param os the output. - * @param c covariates to print out. - */ - protected CsvPrinter(final OutputStream os, final Covariate ... c) { - covariates = c == null ? new Covariate[0] : c.clone(); - ps = new PrintStream(os); - printHeader(); - } - - /** - * Prints the header out. - *

- * Should only be invoked at creation. - */ - protected void printHeader() { - RecalUtils.printHeader(ps); - } - - /** - * Prints out a report into the csv file. - * - * - * @param report the report to print out. - * @param mode the report associated mode. (typically ORIGINAL, RECALIBRATED - */ - public void print(final RecalibrationReport report, final String mode) { - RecalUtils.writeCSV(ps,report.getRecalibrationTables(),mode,covariates,false); - } - - /** - * Close the csv printer. - * - * No further output will be allowed or take place after calling this method. - */ - public void close() { - ps.close(); - } - - } - - /** - * Returns a csv output printer. - * - * @param out the output file. It will be overridden - * @param c list of covariates to print out. - * - * @throws FileNotFoundException if out could not be created anew. - * - * @return never null - */ - protected static CsvPrinter csvPrinter(final File out, final Covariate ... c) - throws FileNotFoundException - { - if (c == null) { - throw new IllegalArgumentException("the input covariate array cannot be null"); - } - return new CsvPrinter(out,c); - } - - /** - * Prints out a collection of reports into a file in Csv format in a way - * that can be used by R scripts (such as the plot generator script). - *

- * The set of covariates is take as the minimum common set from all reports. - * - * @param out the output file. It will be overridden. - * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) - * of each report and the corresponding value the report itself. - * @throws FileNotFoundException if out could not be created anew. - */ - public static void generateCsv(final File out, final Map reports) - throws FileNotFoundException { - if (reports.size() == 0) { - writeCsv(out, reports, new Covariate[0]); - } else { - final Iterator rit = reports.values().iterator(); - final RecalibrationReport first = rit.next(); - final Covariate[] firstCovariates = first.getRequestedCovariates(); - final Set covariates = new LinkedHashSet<>(); - Utils.addAll(covariates,firstCovariates); - while (rit.hasNext() && covariates.size() > 0) { - final Covariate[] nextCovariates = rit.next().getRequestedCovariates(); - final Set nextCovariateNames = new LinkedHashSet(nextCovariates.length); - for (final Covariate nc : nextCovariates) { - nextCovariateNames.add(nc.getClass().getSimpleName()); - } - final Iterator cit = covariates.iterator(); - while (cit.hasNext()) { - if (!nextCovariateNames.contains(cit.next().getClass().getSimpleName())) { - cit.remove(); - } - } - } - writeCsv(out, reports, covariates.toArray(new Covariate[covariates.size()])); - } - } - - /** - * Print out a collection of reports into a file in Csv format in a way - * that can be used by R scripts (such as the plot generator script). - * - * @param out - * @param reports map where keys are the unique 'mode' (ORIGINAL, RECALIBRATED, ...) - * of each report and the corresponding value the report itself. - * @param c the covariates to print out. - * @throws FileNotFoundException if out could not be created anew. 
- */ - private static void writeCsv(final File out, - final Map reports, final Covariate[] c) - throws FileNotFoundException { - final CsvPrinter p = csvPrinter(out,c); - for (Map.Entry e : reports.entrySet()) { - p.print(e.getValue(),e.getKey()); - } - p.close(); - } - - public enum SOLID_RECAL_MODE { - /** - * Treat reference inserted bases as reference matching bases. Very unsafe! - */ - DO_NOTHING, - /** - * Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. - */ - SET_Q_ZERO, - /** - * In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. - */ - SET_Q_ZERO_BASE_N, - /** - * Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. - */ - REMOVE_REF_BIAS; - - public static SOLID_RECAL_MODE recalModeFromString(String recalMode) { - if (recalMode.equals("DO_NOTHING")) - return SOLID_RECAL_MODE.DO_NOTHING; - if (recalMode.equals("SET_Q_ZERO")) - return SOLID_RECAL_MODE.SET_Q_ZERO; - if (recalMode.equals("SET_Q_ZERO_BASE_N")) - return SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N; - if (recalMode.equals("REMOVE_REF_BIAS")) - return SOLID_RECAL_MODE.REMOVE_REF_BIAS; - - throw new UserException.BadArgumentValue(recalMode, "is not a valid SOLID_RECAL_MODE value"); - } - } - - public enum SOLID_NOCALL_STRATEGY { - /** - * When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. - */ - THROW_EXCEPTION, - /** - * Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. - */ - LEAVE_READ_UNRECALIBRATED, - /** - * Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. 
- */ - PURGE_READ; - - public static SOLID_NOCALL_STRATEGY nocallStrategyFromString(String nocallStrategy) { - if (nocallStrategy.equals("THROW_EXCEPTION")) - return SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; - if (nocallStrategy.equals("LEAVE_READ_UNRECALIBRATED")) - return SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED; - if (nocallStrategy.equals("PURGE_READ")) - return SOLID_NOCALL_STRATEGY.PURGE_READ; - - throw new UserException.BadArgumentValue(nocallStrategy, "is not a valid SOLID_NOCALL_STRATEGY value"); - } - } - - private static List generateReportTables(final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { - List result = new LinkedList(); - int reportTableIndex = 0; - int rowIndex = 0; - final Map covariateNameMap = new HashMap(requestedCovariates.length); - for (final Covariate covariate : requestedCovariates) - covariateNameMap.put(covariate, parseCovariateName(covariate)); - - for (int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++) { - - final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names - columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future - if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { - columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future - if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { - columnNames.add(covariateValue); - columnNames.add(covariateName); - } - } - - columnNames.add(eventType); // the order of these column names is important here - columnNames.add(empiricalQuality); - if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) - columnNames.add(estimatedQReported); // only the read group table needs the estimated Q 
reported - columnNames.add(nObservations); - columnNames.add(nErrors); - - final GATKReportTable reportTable; - if (tableIndex <= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { - if(sortByCols) { - reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.SORT_BY_COLUMN); - } else { - reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.DO_NOT_SORT); - } - for (final Pair columnName : columnNames) - reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); - rowIndex = 0; // reset the row index since we're starting with a new table - } else { - reportTable = result.get(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()); - } - - final NestedIntegerArray table = recalibrationTables.getTable(tableIndex); - for (final NestedIntegerArray.Leaf row : table.getAllLeaves()) { - final RecalDatum datum = (RecalDatum)row.value; - final int[] keys = row.keys; - - int columnIndex = 0; - int keyIndex = 0; - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[0].formatKey(keys[keyIndex++])); - if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[1].formatKey(keys[keyIndex++])); - if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { - final Covariate covariate = requestedCovariates[tableIndex]; - - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariate.formatKey(keys[keyIndex++])); - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariateNameMap.get(covariate)); - } - } - - final EventType event = EventType.eventFrom(keys[keyIndex]); - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), event.toString()); - - 
reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); - if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getNumObservations()); - reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.getNumMismatches()); - - rowIndex++; - } - result.add(reportTable); - } - - return result; - } - - private static String parseCovariateName(final Covariate covariate) { - return covariate.getClass().getSimpleName().split("Covariate")[0]; - } - - /** - * Return a human-readable string representing the used covariates - * - * @param requestedCovariates a vector of covariates - * @return a non-null comma-separated string - */ - public static String covariateNames(final Covariate[] requestedCovariates) { - final List names = new ArrayList(requestedCovariates.length); - for ( final Covariate cov : requestedCovariates ) - names.add(cov.getClass().getSimpleName()); - return Utils.join(",", names); - } - - /** - * Outputs the GATK report to RAC.RECAL_TABLE. 
- * - * @param RAC The list of shared command line arguments - * @param quantizationInfo Quantization info - * @param recalibrationTables Recalibration tables - * @param requestedCovariates The list of requested covariates - * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT - */ - public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, boolean sortByCols) { - final GATKReport report = createRecalibrationGATKReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols)); - report.print(RAC.RECAL_TABLE); - } - - /** - * Creates a consolidated GATK report, first generating report tables. Report can then be written to a stream via GATKReport.print(PrintStream). - * - * @param argumentTable Argument table - * @param quantizationInfo Quantization info - * @param recalibrationTables Recalibration tables - * @param requestedCovariates The list of requested covariates - * @param sortByCols True to use GATKReportTable.TableSortingWay.SORT_BY_COLUMN, false to use GATKReportTable.TableSortingWay.DO_NOT_SORT - * @return GATK report - */ - public static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final boolean sortByCols) { - return createRecalibrationGATKReport(argumentTable, quantizationInfo.generateReportTable(sortByCols), generateReportTables(recalibrationTables, requestedCovariates, sortByCols)); - } - - /** - * Creates a consolidated GATK report from the tables. Report can then be written to a stream via GATKReport.print(PrintStream). 
- * - * @param argumentTable Argument table - * @param quantizationTable Quantization Table - * @param recalTables Other recal tables - * @return GATK report - */ - private static GATKReport createRecalibrationGATKReport(final GATKReportTable argumentTable, final GATKReportTable quantizationTable, final List recalTables) { - final GATKReport report = new GATKReport(); - report.addTable(argumentTable); - report.addTable(quantizationTable); - report.addTables(recalTables); - return report; - } - - /** s - * Write recalibration plots into a file - * - * @param csvFile location of the intermediary file - * @param exampleReportFile where the report arguments are collected from. - * @param output result plot file name. - */ - public static void generatePlots(final File csvFile, final File exampleReportFile, final File output) { - final RScriptExecutor executor = new RScriptExecutor(); - executor.setExceptOnError(true); - executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); - executor.addArgs(csvFile.getAbsolutePath()); - executor.addArgs(exampleReportFile.getAbsolutePath()); - executor.addArgs(output.getAbsolutePath()); - Logger.getLogger(RecalUtils.class).debug("R command line: " + executor.getApproximateCommandLine()); - executor.exec(); - } - - private static void outputRecalibrationPlot(final File csvFile, final RecalibrationArgumentCollection RAC) { - - final RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); - executor.addArgs(csvFile.getAbsolutePath()); - executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath()); - executor.exec(); - } - - /** - * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. 
- * - * @deprecated - */ - @Deprecated - public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final Covariate[] requestedCovariates) { - generateRecalibrationPlot(RAC, original, null, requestedCovariates); - } - - /** - * Please use {@link #generateCsv(java.io.File, java.util.Map)} and {@link #generatePlots(java.io.File, java.io.File, java.io.File)} instead. - * - * @deprecated - */ - @Deprecated - public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates) { - final PrintStream csvStream; - final File csvTempFile = null; - try { - File csvTmpFile = File.createTempFile("BQSR",".csv"); - csvTmpFile.deleteOnExit(); - csvStream = new PrintStream(csvTmpFile); - } catch (IOException e) { - throw new UserException("Could not create temporary csv file", e); - } - - if ( recalibrated != null ) - writeCSV(csvStream, recalibrated, "RECALIBRATED", requestedCovariates, true); - writeCSV(csvStream, original, "ORIGINAL", requestedCovariates, recalibrated == null); - csvStream.close(); - outputRecalibrationPlot(csvTempFile, RAC); - csvTempFile.delete(); - } - - private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { - - final NestedIntegerArray deltaTable = createDeltaTable(recalibrationTables, requestedCovariates.length); - - // add the quality score table to the delta table - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); - for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table - final int[] newCovs = new int[4]; - newCovs[0] = leaf.keys[0]; - newCovs[1] = requestedCovariates.length; // 
replace the covariate name with an arbitrary (unused) index for QualityScore - newCovs[2] = leaf.keys[1]; - newCovs[3] = leaf.keys[2]; - addToDeltaTable(deltaTable, newCovs, (RecalDatum)leaf.value); // add this covariate to the delta table - } - - // add the optional covariates to the delta table - for (int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < requestedCovariates.length; i++) { - final NestedIntegerArray covTable = recalibrationTables.getTable(i); - for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) { - final int[] covs = new int[4]; - covs[0] = leaf.keys[0]; - covs[1] = i; // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) - covs[2] = leaf.keys[2]; - covs[3] = leaf.keys[3]; - addToDeltaTable(deltaTable, covs, (RecalDatum) leaf.value); // add this covariate to the delta table - } - } - - // output the csv file - if (printHeader) { - printHeader(deltaTableFile); - } - - final Map covariateNameMap = new HashMap(requestedCovariates.length); - for (final Covariate covariate : requestedCovariates) - covariateNameMap.put(covariate, parseCovariateName(covariate)); - - // print each data line - for (final NestedIntegerArray.Leaf leaf : deltaTable.getAllLeaves()) { - final List deltaKeys = generateValuesFromKeys(leaf.keys, requestedCovariates, covariateNameMap); - final RecalDatum deltaDatum = leaf.value; - deltaTableFile.print(Utils.join(",", deltaKeys)); - deltaTableFile.print("," + deltaDatum.stringForCSV()); - deltaTableFile.println("," + recalibrationMode); - } - } - - private static void printHeader(PrintStream out) { - final List header = new LinkedList(); - header.add("ReadGroup"); - header.add("CovariateValue"); - header.add("CovariateName"); - header.add("EventType"); - header.add("Observations"); - header.add("Errors"); - header.add("EmpiricalQuality"); - header.add("AverageReportedQuality"); - header.add("Accuracy"); - header.add("Recalibration"); - 
out.println(Utils.join(",", header)); - } - - /* - * Return an initialized nested integer array with appropriate dimensions for use with the delta tables - * - * @param recalibrationTables the recal tables - * @param numCovariates the total number of covariates being used - * @return a non-null nested integer array - */ - @Requires("recalibrationTables != null && numCovariates > 0") - @Ensures("result != null") - private static NestedIntegerArray createDeltaTable(final RecalibrationTables recalibrationTables, final int numCovariates) { - - final int[] dimensionsForDeltaTable = new int[4]; - - // initialize the dimensions with those of the qual table to start with - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); - final int[] dimensionsOfQualTable = qualTable.getDimensions(); - dimensionsForDeltaTable[0] = dimensionsOfQualTable[0]; // num read groups - dimensionsForDeltaTable[1] = numCovariates + 1; // num covariates - dimensionsForDeltaTable[2] = dimensionsOfQualTable[1]; - dimensionsForDeltaTable[3] = dimensionsOfQualTable[2]; - - // now, update the dimensions based on the optional covariate tables as needed - for ( int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < numCovariates; i++ ) { - final NestedIntegerArray covTable = recalibrationTables.getTable(i); - final int[] dimensionsOfCovTable = covTable.getDimensions(); - dimensionsForDeltaTable[2] = Math.max(dimensionsForDeltaTable[2], dimensionsOfCovTable[2]); - dimensionsForDeltaTable[3] = Math.max(dimensionsForDeltaTable[3], dimensionsOfCovTable[3]); - } - - return new NestedIntegerArray(dimensionsForDeltaTable); - } - - protected static List generateValuesFromKeys(final int[] keys, final Covariate[] covariates, final Map covariateNameMap) { - final List values = new ArrayList(4); - values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()].formatKey(keys[0])); - - final int covariateIndex = keys[1]; - final int 
covariateKey = keys[2]; - final Covariate covariate = covariateIndex == covariates.length ? covariates[RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal()] : covariates[covariateIndex]; - values.add(covariate.formatKey(covariateKey)); - values.add(covariateNameMap.get(covariate)); - values.add(EventType.eventFrom(keys[3]).prettyPrint()); - - return values; - } - - /** - * Updates the current RecalDatum element in the delta table. - * - * If it doesn't have an element yet, it creates an RecalDatum element and adds it to the delta table. - * - * @param deltaTable the delta table - * @param deltaKey the key to the table - * @param recalDatum the recal datum to combine with the accuracyDatum element in the table - */ - private static void addToDeltaTable(final NestedIntegerArray deltaTable, final int[] deltaKey, final RecalDatum recalDatum) { - final RecalDatum deltaDatum = deltaTable.get(deltaKey); // check if we already have a RecalDatum for this key - if (deltaDatum == null) - // if we don't have a key yet, create a new one with the same values as the current datum - deltaTable.put(new RecalDatum(recalDatum), deltaKey); - else - // if we do have a datum, combine it with this one - deltaDatum.combine(recalDatum); - } - - /** - * Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string - * - * @param read The read to adjust - * @param RAC The list of shared command line arguments - */ - public static void parsePlatformForRead(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) { - GATKSAMReadGroupRecord readGroup = read.getReadGroup(); - - if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) { - readGroup.setPlatform(RAC.FORCE_PLATFORM); - } - - if (readGroup.getPlatform() == null) { - if (RAC.DEFAULT_PLATFORM != null) { - if (!warnUserNullPlatform) { - 
Utils.warnUser("The input .bam file contains reads with no platform information. " + - "Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " + - "First observed at read with name = " + read.getReadName()); - warnUserNullPlatform = true; - } - readGroup.setPlatform(RAC.DEFAULT_PLATFORM); - } - else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName()); - } - } - } - - /** - * Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are - * inconsistent with the color space. If there is a no call in the color space, this method returns false meaning - * this read should be skipped - * - * @param strategy the strategy used for SOLID no calls - * @param read The SAMRecord to parse - * @return true if this read is consistent or false if this read should be skipped - */ - public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { - if (!ReadUtils.isSOLiDRead(read)) // If this is a SOLID read then we have to check if the color space is inconsistent. 
This is our only sign that SOLID has inserted the reference base - return true; - - // Haven't calculated the inconsistency array yet for this read - if (read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG) == null) { - final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG); - if (attr != null) { - byte[] colorSpace; - if (attr instanceof String) - colorSpace = ((String) attr).getBytes(); - else - throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); - - final boolean badColor = hasNoCallInColorSpace(colorSpace); - if (badColor) { - if (strategy == SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) { - return false; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them - } - else if (strategy == SOLID_NOCALL_STRATEGY.PURGE_READ) { - read.setReadFailsVendorQualityCheckFlag(true); - return false; - } - } - - byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read - if (read.getReadNegativeStrandFlag()) - readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); - - final byte[] inconsistency = new byte[readBases.length]; - int i; - byte prevBase = colorSpace[0]; // The sentinel - for (i = 0; i < readBases.length; i++) { - final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[i + 1]); - inconsistency[i] = (byte) (thisBase == readBases[i] ? 0 : 1); - prevBase = readBases[i]; - } - read.setAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); - } - else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it - throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. 
First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); - - else - return false; // otherwise, just skip the read - } - - return true; - } - - private static boolean hasNoCallInColorSpace(final byte[] colorSpace) { - final int length = colorSpace.length; - for (int i = 1; i < length; i++) { // skip the sentinal - final byte color = colorSpace[i]; - if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') { - return true; // There is a bad color in this SOLiD read - } - } - - return false; // There aren't any color no calls in this SOLiD read - } - - /** - * Given the base and the color calculate the next base in the sequence - * - * @param read the read - * @param prevBase The base - * @param color The color - * @return The next base in the sequence - */ - private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) { - switch (color) { - case '0': - return prevBase; - case '1': - return performColorOne(prevBase); - case '2': - return performColorTwo(prevBase); - case '3': - return performColorThree(prevBase); - default: - throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color + - " Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias."); - } - } - - /** - * Check if this base is inconsistent with its color space. 
If it is then SOLID inserted the reference here and we should reduce the quality - * - * @param read The read which contains the color space to check against - * @param offset The offset in the read at which to check - * @return Returns true if the base was inconsistent with the color space - */ - public static boolean isColorSpaceConsistent(final GATKSAMRecord read, final int offset) { - final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG); - if (attr != null) { - final byte[] inconsistency = (byte[]) attr; - // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! - if (read.getReadNegativeStrandFlag()) { // Negative direction - return inconsistency[inconsistency.length - offset - 1] == (byte) 0; - } - else { // Forward direction - return inconsistency[offset] == (byte) 0; - } - - // This block of code is for if you want to check both the offset and the next base for color space inconsistency - //if( read.getReadNegativeStrandFlag() ) { // Negative direction - // if( offset == 0 ) { - // return inconsistency[0] != 0; - // } else { - // return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0); - // } - //} else { // Forward direction - // if( offset == inconsistency.length - 1 ) { - // return inconsistency[inconsistency.length - 1] != 0; - // } else { - // return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0); - // } - //} - - } - else { // No inconsistency array, so nothing is inconsistent - return true; - } - } - - /** - * Computes all requested covariates for every offset in the given read - * by calling covariate.getValues(..). - * - * It populates an array of covariate values where result[i][j] is the covariate - * value for the ith position in the read and the jth covariate in - * reqeustedCovariates list. - * - * @param read The read for which to compute covariate values. 
- * @param requestedCovariates The list of requested covariates. - * @return a matrix with all the covariates calculated for every base in the read - */ - public static ReadCovariates computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates) { - final ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), requestedCovariates.length); - computeCovariates(read, requestedCovariates, readCovariates); - return readCovariates; - } - - /** - * Computes all requested covariates for every offset in the given read - * by calling covariate.getValues(..). - * - * It populates an array of covariate values where result[i][j] is the covariate - * value for the ith position in the read and the jth covariate in - * reqeustedCovariates list. - * - * @param read The read for which to compute covariate values. - * @param requestedCovariates The list of requested covariates. - * @param resultsStorage The object to store the covariate values - */ - public static void computeCovariates(final GATKSAMRecord read, final Covariate[] requestedCovariates, final ReadCovariates resultsStorage) { - // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read - for (int i = 0; i < requestedCovariates.length; i++) { - resultsStorage.setCovariateIndex(i); - requestedCovariates[i].recordValues(read, resultsStorage); - } - } - - /** - * Perform a certain transversion (A <-> C or G <-> T) on the base. - * - * @param base the base [AaCcGgTt] - * @return the transversion of the base, or the input base if it's not one of the understood ones - */ - private static byte performColorOne(byte base) { - switch (base) { - case 'A': - case 'a': - return 'C'; - case 'C': - case 'c': - return 'A'; - case 'G': - case 'g': - return 'T'; - case 'T': - case 't': - return 'G'; - default: - return base; - } - } - - /** - * Perform a transition (A <-> G or C <-> T) on the base. 
- * - * @param base the base [AaCcGgTt] - * @return the transition of the base, or the input base if it's not one of the understood ones - */ - private static byte performColorTwo(byte base) { - switch (base) { - case 'A': - case 'a': - return 'G'; - case 'C': - case 'c': - return 'T'; - case 'G': - case 'g': - return 'A'; - case 'T': - case 't': - return 'C'; - default: - return base; - } - } - - /** - * Return the complement (A <-> T or C <-> G) of a base. - * - * @param base the base [AaCcGgTt] - * @return the complementary base, or the input base if it's not one of the understood ones - */ - private static byte performColorThree(byte base) { - switch (base) { - case 'A': - case 'a': - return 'T'; - case 'C': - case 'c': - return 'G'; - case 'G': - case 'g': - return 'C'; - case 'T': - case 't': - return 'A'; - default: - return base; - } - } - - /** - * Combines the recalibration data for table1 and table2 into table1 - * - * Note that table1 is the destination, so it is modified - * - * @param table1 the destination table to merge table2 into - * @param table2 the source table to merge into table1 - */ - public static void combineTables(final NestedIntegerArray table1, final NestedIntegerArray table2) { - if ( table1 == null ) throw new IllegalArgumentException("table1 cannot be null"); - if ( table2 == null ) throw new IllegalArgumentException("table2 cannot be null"); - if ( ! Arrays.equals(table1.getDimensions(), table2.getDimensions())) - throw new IllegalArgumentException("Table1 " + Utils.join(",", table1.getDimensions()) + " not equal to " + Utils.join(",", table2.getDimensions())); - - for (final NestedIntegerArray.Leaf row : table2.getAllLeaves()) { - final RecalDatum myDatum = table1.get(row.keys); - - if (myDatum == null) - table1.put(row.value, row.keys); - else - myDatum.combine(row.value); - } - } - - /** - * Increments the RecalDatum at the specified position in the specified table, or put a new item there - * if there isn't already one. 
- * - * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() - * to return false if another thread inserts a new item at our position in the middle of our put operation. - * - * @param table the table that holds/will hold our item - * @param qual qual for this event - * @param isError error value for this event - * @param keys location in table of our item - */ - public static void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, - final byte qual, - final double isError, - final int... keys ) { - final RecalDatum existingDatum = table.get(keys); - - if ( existingDatum == null ) { - // No existing item, try to put a new one - if ( ! table.put(createDatumObject(qual, isError), keys) ) { - // Failed to put a new item because another thread came along and put an item here first. - // Get the newly-put item and increment it (item is guaranteed to exist at this point) - table.get(keys).increment(1L, isError); - } - } - else { - // Easy case: already an item here, so increment it - existingDatum.increment(1L, isError); - } - } - - /** - * creates a datum object with one observation and one or zero error - * - * @param reportedQual the quality score reported by the instrument for this base - * @param isError whether or not the observation is an error - * @return a new RecalDatum object with the observation and the error - */ - private static RecalDatum createDatumObject(final byte reportedQual, final double isError) { - return new RecalDatum(1, isError, reportedQual); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReport.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReport.java deleted file mode 100644 index a2b83ccb6..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReport.java +++ /dev/null @@ 
-1,425 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. 
Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; - -import java.io.*; -import java.util.*; - -/** - * This class has all the static functionality for reading a recalibration report file into memory. 
- * - * @author carneiro - * @since 3/26/12 - */ -public class RecalibrationReport { - private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) - private final RecalibrationTables recalibrationTables; // quick access reference to the tables - private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation - private final HashMap optionalCovariateIndexes; - - private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes - private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter - - private final int[] tempRGarray = new int[2]; - private final int[] tempQUALarray = new int[3]; - private final int[] tempCOVarray = new int[4]; - - public RecalibrationReport(final File recalFile) { - this(recalFile, getReadGroups(recalFile)); - } - - public RecalibrationReport(final File recalFile, final SortedSet allReadGroups) { - final GATKReport report = new GATKReport(recalFile); - - argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE); - RAC = initializeArgumentCollectionTable(argumentTable); - - GATKReportTable quantizedTable = report.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE); - quantizationInfo = initializeQuantizationTable(quantizedTable); - - Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates - ArrayList requiredCovariates = covariates.getFirst(); - ArrayList optionalCovariates = covariates.getSecond(); - requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; - optionalCovariateIndexes = new HashMap(optionalCovariates.size()); - int covariateIndex = 0; - for (final Covariate covariate : requiredCovariates) - requestedCovariates[covariateIndex++] = covariate; - for (final Covariate covariate : optionalCovariates) { - 
requestedCovariates[covariateIndex] = covariate; - final String covariateName = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport - optionalCovariateIndexes.put(covariateName, covariateIndex-2); - covariateIndex++; - } - - for (Covariate cov : requestedCovariates) - cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection - - recalibrationTables = new RecalibrationTables(requestedCovariates, allReadGroups.size()); - - initializeReadGroupCovariates(allReadGroups); - - parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable()); - - parseQualityScoreTable(report.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE), recalibrationTables.getQualityScoreTable()); - - parseAllCovariatesTable(report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE), recalibrationTables); - - } - - /** - * Gets the unique read groups in the recal file - * - * @param recalFile the recal file as a GATK Report - * @return the unique read groups - */ - public static SortedSet getReadGroups(final File recalFile) { - return getReadGroups(new GATKReport(recalFile)); - } - - /** - * Gets the unique read groups in the table - * - * @param report the GATKReport containing the table with RecalUtils.READGROUP_REPORT_TABLE_TITLE - * @return the unique read groups - */ - private static SortedSet getReadGroups(final GATKReport report) { - final GATKReportTable reportTable = report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE); - final SortedSet readGroups = new TreeSet(); - for ( int i = 0; i < reportTable.getNumRows(); i++ ) - readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString()); - return readGroups; - } - - /** - * Combines two recalibration reports by adding all observations and errors - * - * Note: This method DOES NOT recalculate the empirical 
qualities and quantized qualities. You have to recalculate - * them after combining. The reason for not calculating it is because this function is intended for combining a - * series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized - * qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate, - * makes this method faster - * - * Note2: The empirical quality reported, however, is recalculated given its simplicity. - * - * @param other the recalibration report to combine with this one - */ - public void combine(final RecalibrationReport other) { - for ( int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++ ) { - final NestedIntegerArray myTable = recalibrationTables.getTable(tableIndex); - final NestedIntegerArray otherTable = other.recalibrationTables.getTable(tableIndex); - RecalUtils.combineTables(myTable, otherTable); - } - } - - public QuantizationInfo getQuantizationInfo() { - return quantizationInfo; - } - - public RecalibrationTables getRecalibrationTables() { - return recalibrationTables; - } - - public Covariate[] getRequestedCovariates() { - return requestedCovariates; - } - - /** - * Initialize read group keys using the shared list of all the read groups. - * - * By using the same sorted set of read groups across all recalibration reports, even if - * one report is missing a read group, all the reports use the same read group keys. 
- * - * @param allReadGroups The list of all possible read groups - */ - private void initializeReadGroupCovariates(final SortedSet allReadGroups) { - for (String readGroup: allReadGroups) { - requestedCovariates[0].keyFromValue(readGroup); - } - } - - /** - * Compiles the list of keys for the Covariates table and uses the shared parsing utility to produce the actual table - * - * @param reportTable the GATKReport table containing data for this table - * @param recalibrationTables the recalibration tables -\ */ - private void parseAllCovariatesTable(final GATKReportTable reportTable, final RecalibrationTables recalibrationTables) { - for ( int i = 0; i < reportTable.getNumRows(); i++ ) { - final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); - tempCOVarray[0] = requestedCovariates[0].keyFromValue(rg); - final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); - tempCOVarray[1] = requestedCovariates[1].keyFromValue(qual); - - final String covName = (String)reportTable.get(i, RecalUtils.COVARIATE_NAME_COLUMN_NAME); - final int covIndex = optionalCovariateIndexes.get(covName); - final Object covValue = reportTable.get(i, RecalUtils.COVARIATE_VALUE_COLUMN_NAME); - tempCOVarray[2] = requestedCovariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex].keyFromValue(covValue); - - final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempCOVarray[3] = event.ordinal(); - - recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex).put(getRecalDatum(reportTable, i, false), tempCOVarray); - } - } - - /** - * - * Compiles the list of keys for the QualityScore table and uses the shared parsing utility to produce the actual table - * @param reportTable the GATKReport table containing data for this table - * @param qualTable the map representing this table - */ - private void 
parseQualityScoreTable(final GATKReportTable reportTable, final NestedIntegerArray qualTable) { - for ( int i = 0; i < reportTable.getNumRows(); i++ ) { - final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); - tempQUALarray[0] = requestedCovariates[0].keyFromValue(rg); - final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); - tempQUALarray[1] = requestedCovariates[1].keyFromValue(qual); - final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempQUALarray[2] = event.ordinal(); - - qualTable.put(getRecalDatum(reportTable, i, false), tempQUALarray); - } - } - - /** - * Compiles the list of keys for the ReadGroup table and uses the shared parsing utility to produce the actual table - * - * @param reportTable the GATKReport table containing data for this table - * @param rgTable the map representing this table - */ - private void parseReadGroupTable(final GATKReportTable reportTable, final NestedIntegerArray rgTable) { - for ( int i = 0; i < reportTable.getNumRows(); i++ ) { - final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); - tempRGarray[0] = requestedCovariates[0].keyFromValue(rg); - final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempRGarray[1] = event.ordinal(); - - rgTable.put(getRecalDatum(reportTable, i, true), tempRGarray); - } - } - - private double asDouble(final Object o) { - if ( o instanceof Double ) - return (Double)o; - else if ( o instanceof Integer ) - return (Integer)o; - else if ( o instanceof Long ) - return (Long)o; - else - throw new ReviewedGATKException("Object " + o + " is expected to be either a double, long or integer but it's not either: " + o.getClass()); - } - - private long asLong(final Object o) { - if ( o instanceof Long ) - return (Long)o; - else if ( o instanceof Integer ) - return ((Integer)o).longValue(); - else if ( o instanceof Double ) - return 
((Double)o).longValue(); - else - throw new ReviewedGATKException("Object " + o + " is expected to be a long but it's not: " + o.getClass()); - } - - private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { - final long nObservations = asLong(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); - final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)); - //final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); - - // the estimatedQreported column only exists in the ReadGroup table - final double estimatedQReported = hasEstimatedQReportedColumn ? - (Double) reportTable.get(row, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table - Byte.parseByte((String) reportTable.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table - - final RecalDatum datum = new RecalDatum(nObservations, nErrors, (byte)1); - datum.setEstimatedQReported(estimatedQReported); - //datum.setEmpiricalQuality(empiricalQuality); // don't set the value here because we will want to recompute with a different conditional Q score prior value - return datum; - } - - /** - * Parses the quantization table from the GATK Report and turns it into a map of original => quantized quality scores - * - * @param table the GATKReportTable containing the quantization mappings - * @return an ArrayList with the quantization mappings from 0 to MAX_SAM_QUAL_SCORE - */ - private QuantizationInfo initializeQuantizationTable(GATKReportTable table) { - final Byte[] quals = new Byte[QualityUtils.MAX_SAM_QUAL_SCORE + 1]; - final Long[] counts = new Long[QualityUtils.MAX_SAM_QUAL_SCORE + 1]; - for ( int i = 0; i < table.getNumRows(); i++ ) { - final byte originalQual = (byte)i; - final Object quantizedObject = table.get(i, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME); 
- final Object countObject = table.get(i, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); - final byte quantizedQual = Byte.parseByte(quantizedObject.toString()); - final long quantizedCount = Long.parseLong(countObject.toString()); - quals[originalQual] = quantizedQual; - counts[originalQual] = quantizedCount; - } - return new QuantizationInfo(Arrays.asList(quals), Arrays.asList(counts)); - } - - /** - * Parses the arguments table from the GATK Report and creates a RAC object with the proper initialization values - * - * @param table the GATKReportTable containing the arguments and its corresponding values - * @return a RAC object properly initialized with all the objects in the table - */ - private RecalibrationArgumentCollection initializeArgumentCollectionTable(GATKReportTable table) { - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - - for ( int i = 0; i < table.getNumRows(); i++ ) { - final String argument = table.get(i, "Argument").toString(); - Object value = table.get(i, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); - if (value.equals("null")) - value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport - - if (argument.equals("covariate") && value != null) - RAC.COVARIATES = value.toString().split(","); - - else if (argument.equals("standard_covs")) - RAC.DO_NOT_USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value); - - else if (argument.equals("solid_recal_mode")) - RAC.SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.recalModeFromString((String) value); - - else if (argument.equals("solid_nocall_strategy")) - RAC.SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value); - - else if (argument.equals("mismatches_context_size")) - RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value); - - else if (argument.equals("indels_context_size")) - RAC.INDELS_CONTEXT_SIZE = Integer.parseInt((String) value); - - 
else if (argument.equals("mismatches_default_quality")) - RAC.MISMATCHES_DEFAULT_QUALITY = Byte.parseByte((String) value); - - else if (argument.equals("insertions_default_quality")) - RAC.INSERTIONS_DEFAULT_QUALITY = Byte.parseByte((String) value); - - else if (argument.equals("deletions_default_quality")) - RAC.DELETIONS_DEFAULT_QUALITY = Byte.parseByte((String) value); - - else if (argument.equals("maximum_cycle_value")) - RAC.MAXIMUM_CYCLE_VALUE = Integer.parseInt((String) value); - - else if (argument.equals("low_quality_tail")) - RAC.LOW_QUAL_TAIL = Byte.parseByte((String) value); - - else if (argument.equals("default_platform")) - RAC.DEFAULT_PLATFORM = (String) value; - - else if (argument.equals("force_platform")) - RAC.FORCE_PLATFORM = (String) value; - - else if (argument.equals("quantizing_levels")) - RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value); - - else if (argument.equals("recalibration_report")) - RAC.existingRecalibrationReport = (value == null) ? null : new File((String) value); - - else if (argument.equals("binary_tag_name")) - RAC.BINARY_TAG_NAME = (value == null) ? null : (String) value; - - else if (argument.equals("sort_by_all_columns")) - RAC.SORT_BY_ALL_COLUMNS = Boolean.parseBoolean((String) value); - } - - return RAC; - } - - /** - * this functionality avoids recalculating the empirical qualities, estimated reported quality - * and quantization of the quality scores during every call of combine(). Very useful for the BQSRGatherer. - */ - public void calculateQuantizedQualities() { - quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS); - } - - /** - * Creates the recalibration report. Report can then be written to a stream via GATKReport.print(PrintStream). 
- * - * @return newly created recalibration report - */ - public GATKReport createGATKReport() { - return RecalUtils.createRecalibrationGATKReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); - } - - public RecalibrationArgumentCollection getRAC() { - return RAC; - } - - /** - * - * @deprecated use {@link #getRequestedCovariates()} instead. - */ - @Deprecated - public Covariate[] getCovariates() { - return requestedCovariates; - } - - /** - * @return true if the report has no data - */ - public boolean isEmpty() { - return recalibrationTables.isEmpty(); - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTables.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTables.java deleted file mode 100644 index e1c7820a4..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTables.java +++ /dev/null @@ -1,168 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. 
-* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Ensures; -import org.broadinstitute.gatk.utils.collections.LoggingNestedIntegerArray; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; - -import java.io.PrintStream; -import java.util.ArrayList; - -/** - * Utility class to facilitate on-the-fly base quality score recalibration. 
- * - * User: ebanks - * Date: 6/20/12 - */ - -public final class RecalibrationTables { - public enum TableType { - READ_GROUP_TABLE, - QUALITY_SCORE_TABLE, - OPTIONAL_COVARIATE_TABLES_START; - } - - private final ArrayList> tables; - private final int qualDimension; - private final int eventDimension = EventType.values().length; - private final int numReadGroups; - private final PrintStream log; - - public RecalibrationTables(final Covariate[] covariates) { - this(covariates, covariates[TableType.READ_GROUP_TABLE.ordinal()].maximumKeyValue() + 1, null); - } - - public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) { - this(covariates, numReadGroups, null); - } - - public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) { - tables = new ArrayList>(covariates.length); - for ( int i = 0; i < covariates.length; i++ ) - tables.add(i, null); // initialize so we can set below - - qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.ordinal()].maximumKeyValue() + 1; - this.numReadGroups = numReadGroups; - this.log = log; - - tables.set(TableType.READ_GROUP_TABLE.ordinal(), - log == null ? new NestedIntegerArray(numReadGroups, eventDimension) : - new LoggingNestedIntegerArray(log, "READ_GROUP_TABLE", numReadGroups, eventDimension)); - - tables.set(TableType.QUALITY_SCORE_TABLE.ordinal(), makeQualityScoreTable()); - - for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < covariates.length; i++) - tables.set(i, - log == null ? 
new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) : - new LoggingNestedIntegerArray(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + 1), - numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension)); - } - - @Ensures("result != null") - public NestedIntegerArray getReadGroupTable() { - return getTable(TableType.READ_GROUP_TABLE.ordinal()); - } - - @Ensures("result != null") - public NestedIntegerArray getQualityScoreTable() { - return getTable(TableType.QUALITY_SCORE_TABLE.ordinal()); - } - - @Ensures("result != null") - public NestedIntegerArray getTable(final int index) { - return tables.get(index); - } - - @Ensures("result >= 0") - public int numTables() { - return tables.size(); - } - - /** - * @return true if all the tables contain no RecalDatums - */ - public boolean isEmpty() { - for( final NestedIntegerArray table : tables ) { - if( !table.getAllValues().isEmpty() ) { return false; } - } - return true; - } - - /** - * Allocate a new quality score table, based on requested parameters - * in this set of tables, without any data in it. The return result - * of this table is suitable for acting as a thread-local cache - * for quality score values - * @return a newly allocated, empty read group x quality score table - */ - public NestedIntegerArray makeQualityScoreTable() { - return log == null - ? 
new NestedIntegerArray(numReadGroups, qualDimension, eventDimension) - : new LoggingNestedIntegerArray(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension); - } - - /** - * Merge all of the tables from toMerge into into this set of tables - */ - public void combine(final RecalibrationTables toMerge) { - if ( numTables() != toMerge.numTables() ) - throw new IllegalArgumentException("Attempting to merge RecalibrationTables with different sizes"); - - for ( int i = 0; i < numTables(); i++ ) { - final NestedIntegerArray myTable = this.getTable(i); - final NestedIntegerArray otherTable = toMerge.getTable(i); - RecalUtils.combineTables(myTable, otherTable); - } - } -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ContextCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ContextCovariate.java deleted file mode 100644 index 99264430d..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ContextCovariate.java +++ /dev/null @@ -1,304 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.clipping.ClippingRepresentation; -import org.broadinstitute.gatk.utils.clipping.ReadClipper; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.recalibration.ReadCovariates; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.ArrayList; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 9/26/11 - */ - -public class ContextCovariate implements StandardCovariate { - private final static Logger logger = Logger.getLogger(ContextCovariate.class); - - - - private int mismatchesContextSize; - private int indelsContextSize; - - private int mismatchesKeyMask; - private int indelsKeyMask; - - private static final int LENGTH_BITS = 4; - private static final int LENGTH_MASK = 15; - - // the maximum context size (number of bases) permitted; we need to keep the leftmost base free so that values are - // not negative and we reserve 4 more bits to represent the length of the context; it takes 2 bits to encode one base. 
- static final private int MAX_DNA_CONTEXT = 13; - private byte LOW_QUAL_TAIL; - - // Initialize any member variables using the command-line arguments passed to the walkers - @Override - public void initialize(final RecalibrationArgumentCollection RAC) { - mismatchesContextSize = RAC.MISMATCHES_CONTEXT_SIZE; - indelsContextSize = RAC.INDELS_CONTEXT_SIZE; - - logger.info("\t\tContext sizes: base substitution model " + mismatchesContextSize + ", indel substitution model " + indelsContextSize); - - if (mismatchesContextSize > MAX_DNA_CONTEXT) - throw new UserException.BadArgumentValue("mismatches_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, mismatchesContextSize)); - if (indelsContextSize > MAX_DNA_CONTEXT) - throw new UserException.BadArgumentValue("indels_context_size", String.format("context size cannot be bigger than %d, but was %d", MAX_DNA_CONTEXT, indelsContextSize)); - - LOW_QUAL_TAIL = RAC.LOW_QUAL_TAIL; - - if (mismatchesContextSize <= 0 || indelsContextSize <= 0) - throw new UserException(String.format("Context size must be positive, if you don't want to use the context covariate, just turn it off instead. 
Mismatches: %d Indels: %d", mismatchesContextSize, indelsContextSize)); - - mismatchesKeyMask = createMask(mismatchesContextSize); - indelsKeyMask = createMask(indelsContextSize); - } - - @Override - public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { - - // store the original bases and then write Ns over low quality ones - final byte[] originalBases = read.getReadBases().clone(); - // Write N's over the low quality tail of the reads to avoid adding them into the context - final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); - - final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag(); - byte[] bases = clippedRead.getReadBases(); - if (negativeStrand) - bases = BaseUtils.simpleReverseComplement(bases); - - final ArrayList mismatchKeys = contextWith(bases, mismatchesContextSize, mismatchesKeyMask); - final ArrayList indelKeys = contextWith(bases, indelsContextSize, indelsKeyMask); - - final int readLength = bases.length; - - // this is necessary to ensure that we don't keep historical data in the ReadCovariates values - // since the context covariate may not span the entire set of values in read covariates - // due to the clipping of the low quality bases - if ( readLength != originalBases.length ) { - // don't both zeroing out if we are going to overwrite the whole array - for ( int i = 0; i < originalBases.length; i++ ) - // this base has been clipped off, so zero out the covariate values here - values.addCovariate(0, 0, 0, i); - } - - for (int i = 0; i < readLength; i++) { - final int readOffset = (negativeStrand ? 
readLength - i - 1 : i); - final int indelKey = indelKeys.get(i); - values.addCovariate(mismatchKeys.get(i), indelKey, indelKey, readOffset); - } - - // put the original bases back in - read.setReadBases(originalBases); - } - - // Used to get the covariate's value from input csv file during on-the-fly recalibration - @Override - public final Object getValue(final String str) { - return str; - } - - @Override - public String formatKey(final int key) { - if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file - return null; - - return contextFromKey(key); - } - - @Override - public int keyFromValue(final Object value) { - return keyFromContext((String) value); - } - - private static int createMask(final int contextSize) { - int mask = 0; - // create 2*contextSize worth of bits - for (int i = 0; i < contextSize; i++) - mask = (mask << 2) | 3; - // shift 4 bits to mask out the bits used to encode the length - return mask << LENGTH_BITS; - } - - /** - * calculates the context of a base independent of the covariate mode (mismatch, insertion or deletion) - * - * @param bases the bases in the read to build the context from - * @param contextSize context size to use building the context - * @param mask mask for pulling out just the context bits - */ - private static ArrayList contextWith(final byte[] bases, final int contextSize, final int mask) { - - final int readLength = bases.length; - final ArrayList keys = new ArrayList(readLength); - - // the first contextSize-1 bases will not have enough previous context - for (int i = 1; i < contextSize && i <= readLength; i++) - keys.add(-1); - - if (readLength < contextSize) - return keys; - - final int newBaseOffset = 2 * (contextSize - 1) + LENGTH_BITS; - - // get (and add) the key for the context starting at the first base - int currentKey = keyFromContext(bases, 0, contextSize); - keys.add(currentKey); - - // if the first key was -1 then there was an N in the context; 
figure out how many more consecutive contexts it affects - int currentNPenalty = 0; - if (currentKey == -1) { - currentKey = 0; - currentNPenalty = contextSize - 1; - int offset = newBaseOffset; - while (bases[currentNPenalty] != 'N') { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentNPenalty]); - currentKey |= (baseIndex << offset); - offset -= 2; - currentNPenalty--; - } - } - - for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentIndex]); - if (baseIndex == -1) { // ignore non-ACGT bases - currentNPenalty = contextSize; - currentKey = 0; // reset the key - } else { - // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length in - currentKey = (currentKey >> 2) & mask; - currentKey |= (baseIndex << newBaseOffset); - currentKey |= contextSize; - } - - if (currentNPenalty == 0) { - keys.add(currentKey); - } else { - currentNPenalty--; - keys.add(-1); - } - } - - return keys; - } - - public static int keyFromContext(final String dna) { - return keyFromContext(dna.getBytes(), 0, dna.length()); - } - - /** - * Creates a int representation of a given dna string. - * - * @param dna the dna sequence - * @param start the start position in the byte array (inclusive) - * @param end the end position in the array (exclusive) - * @return the key representing the dna sequence - */ - private static int keyFromContext(final byte[] dna, final int start, final int end) { - - int key = end - start; - int bitOffset = LENGTH_BITS; - for (int i = start; i < end; i++) { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(dna[i]); - if (baseIndex == -1) // ignore non-ACGT bases - return -1; - key |= (baseIndex << bitOffset); - bitOffset += 2; - } - return key; - } - - /** - * Converts a key into the dna string representation. 
- * - * @param key the key representing the dna sequence - * @return the dna sequence represented by the key - */ - public static String contextFromKey(final int key) { - if (key < 0) - throw new ReviewedGATKException("dna conversion cannot handle negative numbers. Possible overflow?"); - - final int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context - int mask = 48; // use the mask to pull out bases - int offset = LENGTH_BITS; - - StringBuilder dna = new StringBuilder(); - for (int i = 0; i < length; i++) { - final int baseIndex = (key & mask) >> offset; - dna.append((char)BaseUtils.baseIndexToSimpleBase(baseIndex)); - mask = mask << 2; // move the mask over to the next 2 bits - offset += 2; - } - - return dna.toString(); - } - - @Override - public int maximumKeyValue() { - // the maximum value is T (11 in binary) for each base in the context - int length = Math.max(mismatchesContextSize, indelsContextSize); // the length of the context - int key = length; - int bitOffset = LENGTH_BITS; - for (int i = 0; i DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS); - private static final EnumSet FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT); - - // Initialize any member variables using the command-line arguments passed to the walkers - @Override - public void initialize(final RecalibrationArgumentCollection RAC) { - this.MAXIMUM_CYCLE_VALUE = RAC.MAXIMUM_CYCLE_VALUE; - - if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM)) - throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform."); - - if (RAC.DEFAULT_PLATFORM != null) - default_platform = RAC.DEFAULT_PLATFORM; - } - - // Used to pick out the covariate's value from attributes of the read - @Override - public void recordValues(final GATKSAMRecord read, final 
ReadCovariates values) { - final int readLength = read.getReadLength(); - final NGSPlatform ngsPlatform = default_platform == null ? read.getNGSPlatform() : NGSPlatform.fromReadGroupPL(default_platform); - - // Discrete cycle platforms - if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) { - final int readOrderFactor = read.getReadPairedFlag() && read.getSecondOfPairFlag() ? -1 : 1; - final int increment; - int cycle; - if (read.getReadNegativeStrandFlag()) { - cycle = readLength * readOrderFactor; - increment = -1 * readOrderFactor; - } - else { - cycle = readOrderFactor; - increment = readOrderFactor; - } - - final int MAX_CYCLE_FOR_INDELS = readLength - CUSHION_FOR_INDELS - 1; - for (int i = 0; i < readLength; i++) { - final int substitutionKey = keyFromCycle(cycle); - final int indelKey = (i < CUSHION_FOR_INDELS || i > MAX_CYCLE_FOR_INDELS) ? -1 : substitutionKey; - values.addCovariate(substitutionKey, indelKey, indelKey, i); - cycle += increment; - } - } - - // Flow cycle platforms - else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) { - - final byte[] bases = read.getReadBases(); - - // Differentiate between first and second of pair. - // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group - // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. - // Therefore the cycle covariate must differentiate between first and second of pair reads. - // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because - // the current sequential model would consider the effects independently instead of jointly. - final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); - - int cycle = multiplyByNegative1 ? -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms. 
- - // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change - // For example, AAAAAAA was probably read in two flow cycles but here we count it as one - if (!read.getReadNegativeStrandFlag()) { // Forward direction - int iii = 0; - while (iii < readLength) { - while (iii < readLength && bases[iii] == (byte) 'T') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii++; - } - while (iii < readLength && bases[iii] == (byte) 'A') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii++; - } - while (iii < readLength && bases[iii] == (byte) 'C') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii++; - } - while (iii < readLength && bases[iii] == (byte) 'G') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii++; - } - if (iii < readLength) { - if (multiplyByNegative1) - cycle--; - else - cycle++; - } - if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii++; - } - - } - } - else { // Negative direction - int iii = readLength - 1; - while (iii >= 0) { - while (iii >= 0 && bases[iii] == (byte) 'T') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii--; - } - while (iii >= 0 && bases[iii] == (byte) 'A') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii--; - } - while (iii >= 0 && bases[iii] == (byte) 'C') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii--; - } - while (iii >= 0 && bases[iii] == (byte) 'G') { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii--; - } - if (iii >= 0) { - if (multiplyByNegative1) - cycle--; - else - cycle++; - } - if (iii >= 0 && 
!BaseUtils.isRegularBase(bases[iii])) { - final int key = keyFromCycle(cycle); - values.addCovariate(key, key, key, iii); - iii--; - } - } - } - } - - // Unknown platforms - else { - throw new UserException("The platform (" + read.getReadGroup().getPlatform() - + ") associated with read group " + read.getReadGroup() - + " is not a recognized platform. Allowable options are " + NGSPlatform.knownPlatformsString()); - } - } - - // Used to get the covariate's value from input csv file during on-the-fly recalibration - @Override - public final Object getValue(final String str) { - return Integer.parseInt(str); - } - - @Override - public String formatKey(final int key) { - int cycle = key >> 1; // shift so we can remove the "sign" bit - if ( (key & 1) != 0 ) // is the last bit set? - cycle *= -1; // then the cycle is negative - return String.format("%d", cycle); - } - - @Override - public int keyFromValue(final Object value) { - return (value instanceof String) ? keyFromCycle(Integer.parseInt((String) value)) : keyFromCycle((Integer) value); - } - - @Override - public int maximumKeyValue() { - return (MAXIMUM_CYCLE_VALUE << 1) + 1; - } - - private int keyFromCycle(final int cycle) { - // no negative values because values must fit into the first few bits of the long - int result = Math.abs(cycle); - if ( result > MAXIMUM_CYCLE_VALUE ) - throw new UserException("The maximum allowed value for the cycle is " + MAXIMUM_CYCLE_VALUE + ", but a larger cycle (" + result + ") was detected. 
Please use the --maximum_cycle_value argument to increase this value (at the expense of requiring more memory to run)"); - - result = result << 1; // shift so we can add the "sign" bit - if ( cycle < 0 ) - result++; // negative cycles get the lower-most bit set - return result; - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ExperimentalCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ExperimentalCovariate.java deleted file mode 100644 index 771c49771..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ExperimentalCovariate.java +++ /dev/null @@ -1,81 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -/** - * [Short one sentence description of this walker] - *

- *

- * [Functionality of this walker] - *

- *

- *

Input

- *

- * [Input description] - *

- *

- *

Output

- *

- * [Output description] - *

- *

- *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T $WalkerName
- *  
- * - * @author Your Name - * @since Date created - */ -public interface ExperimentalCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/QualityScoreCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/QualityScoreCovariate.java deleted file mode 100644 index e31588468..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/QualityScoreCovariate.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import org.broadinstitute.gatk.utils.recalibration.ReadCovariates; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Nov 3, 2009 - * - * The Reported Quality Score covariate. - */ - -public class QualityScoreCovariate implements RequiredCovariate { - - // Initialize any member variables using the command-line arguments passed to the walkers - @Override - public void initialize(final RecalibrationArgumentCollection RAC) {} - - @Override - public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { - final byte[] baseQualities = read.getBaseQualities(); - final byte[] baseInsertionQualities = read.getBaseInsertionQualities(); - final byte[] baseDeletionQualities = read.getBaseDeletionQualities(); - - for (int i = 0; i < baseQualities.length; i++) { - values.addCovariate((int)baseQualities[i], (int)baseInsertionQualities[i], (int)baseDeletionQualities[i], i); - } - } - - // Used to get the covariate's value from input csv file during on-the-fly recalibration - @Override - public final Object getValue(final String str) { - return Byte.parseByte(str); - } - - @Override - public String formatKey(final int key) { - return String.format("%d", key); - } - - @Override - public int keyFromValue(final Object value) { - return (value instanceof String) ? 
(int)Byte.parseByte((String) value) : (int)(Byte) value; - } - - @Override - public int maximumKeyValue() { - return QualityUtils.MAX_SAM_QUAL_SCORE; - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ReadGroupCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ReadGroupCovariate.java deleted file mode 100644 index 9eadcf458..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/ReadGroupCovariate.java +++ /dev/null @@ -1,190 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.ReadCovariates; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Oct 30, 2009 - * - * The Read Group covariate. - */ - -public class ReadGroupCovariate implements RequiredCovariate { - - private final HashMap readGroupLookupTable = new HashMap(); - private final HashMap readGroupReverseLookupTable = new HashMap(); - private int nextId = 0; - private String forceReadGroup; - - // Initialize any member variables using the command-line arguments passed to the walkers - @Override - public void initialize(final RecalibrationArgumentCollection RAC) { - forceReadGroup = RAC.FORCE_READGROUP; - } - - @Override - public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { - final String readGroupId = readGroupValueFromRG(read.getReadGroup()); - final int key = keyForReadGroup(readGroupId); - - final int l = read.getReadLength(); - for (int i = 0; i < l; i++) - values.addCovariate(key, key, key, i); - } - - @Override - public final Object getValue(final String str) { - return str; - } - - @Override - public synchronized String formatKey(final int key) { - // This method is synchronized so that we don't attempt to do a get() - // from the reverse lookup table while that table is being updated - return readGroupReverseLookupTable.get(key); - } - - @Override - public int keyFromValue(final Object value) { - return keyForReadGroup((String) value); - } - - /** - * Get the mapping from read group names to integer key values for all read groups in this covariate - * @return a set of mappings from read group names -> integer key values - */ - public Set> getKeyMap() { - return readGroupLookupTable.entrySet(); - } - - private int keyForReadGroup(final String readGroupId) { - // Rather 
than synchronize this entire method (which would be VERY expensive for walkers like the BQSR), - // synchronize only the table updates. - - // Before entering the synchronized block, check to see if this read group is not in our tables. - // If it's not, either we will have to insert it, OR another thread will insert it first. - // This preliminary check avoids doing any synchronization most of the time. - if ( ! readGroupLookupTable.containsKey(readGroupId) ) { - - synchronized ( this ) { - - // Now we need to make sure the key is STILL not there, since another thread may have come along - // and inserted it while we were waiting to enter this synchronized block! - if ( ! readGroupLookupTable.containsKey(readGroupId) ) { - readGroupLookupTable.put(readGroupId, nextId); - readGroupReverseLookupTable.put(nextId, readGroupId); - nextId++; - } - } - } - - return readGroupLookupTable.get(readGroupId); - } - - @Override - public synchronized int maximumKeyValue() { - // Synchronized so that we don't query table size while the tables are being updated - return readGroupLookupTable.size() - 1; - } - - /** - * If the sample has a PU tag annotation, return that. If not, return the read group id. - * - * @param rg the read group record - * @return platform unit or readgroup id - */ - private String readGroupValueFromRG(final GATKSAMReadGroupRecord rg) { - if ( forceReadGroup != null ) - return forceReadGroup; - - final String platformUnit = rg.getPlatformUnit(); - return platformUnit == null ? 
rg.getId() : platformUnit; - } - -} - - diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatCovariate.java deleted file mode 100644 index 1cb4be39c..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatCovariate.java +++ /dev/null @@ -1,285 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.ReadCovariates; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.collections.Pair; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -public abstract class RepeatCovariate implements ExperimentalCovariate { - protected int MAX_REPEAT_LENGTH; - protected int MAX_STR_UNIT_LENGTH; - private final HashMap repeatLookupTable = new HashMap(); - private final HashMap repeatReverseLookupTable = new HashMap(); - private int nextId = 0; - - // Initialize any member variables using the command-line arguments passed to the walkers - @Override - public void initialize(final RecalibrationArgumentCollection RAC) { - MAX_STR_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; - MAX_REPEAT_LENGTH = RAC.MAX_REPEAT_LENGTH; - } - - public void initialize(final int MAX_STR_UNIT_LENGTH, final int MAX_REPEAT_LENGTH) 
{ - this.MAX_STR_UNIT_LENGTH = MAX_STR_UNIT_LENGTH; - this.MAX_REPEAT_LENGTH = MAX_REPEAT_LENGTH; - } - - @Override - public void recordValues(final GATKSAMRecord read, final ReadCovariates values) { - // store the original bases and then write Ns over low quality ones - final byte[] originalBases = read.getReadBases().clone(); - - final boolean negativeStrand = read.getReadNegativeStrandFlag(); - byte[] bases = read.getReadBases(); - if (negativeStrand) - bases = BaseUtils.simpleReverseComplement(bases); - - // don't record reads with N's - if (!BaseUtils.isAllRegularBases(bases)) - return; - - for (int i = 0; i < bases.length; i++) { - final Pair res = findTandemRepeatUnits(bases, i); - // to merge repeat unit and repeat length to get covariate value: - final String repeatID = getCovariateValueFromUnitAndLength(res.first, res.second); - final int key = keyForRepeat(repeatID); - - final int readOffset = (negativeStrand ? bases.length - i - 1 : i); - values.addCovariate(key, key, key, readOffset); - } - - // put the original bases back in - read.setReadBases(originalBases); - - } - - public Pair findTandemRepeatUnits(byte[] readBases, int offset) { - int maxBW = 0; - byte[] bestBWRepeatUnit = new byte[]{readBases[offset]}; - for (int str = 1; str <= MAX_STR_UNIT_LENGTH; str++) { - // fix repeat unit length - //edge case: if candidate tandem repeat unit falls beyond edge of read, skip - if (offset+1-str < 0) - break; - - // get backward repeat unit and # repeats - byte[] backwardRepeatUnit = Arrays.copyOfRange(readBases, offset - str + 1, offset + 1); - maxBW = GATKVariantContextUtils.findNumberOfRepetitions(backwardRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); - if (maxBW > 1) { - bestBWRepeatUnit = backwardRepeatUnit.clone(); - break; - } - } - byte[] bestRepeatUnit = bestBWRepeatUnit; - int maxRL = maxBW; - - if (offset < readBases.length-1) { - byte[] bestFWRepeatUnit = new byte[]{readBases[offset+1]}; - int maxFW = 0; - for (int str = 1; str 
<= MAX_STR_UNIT_LENGTH; str++) { - // fix repeat unit length - //edge case: if candidate tandem repeat unit falls beyond edge of read, skip - if (offset+str+1 > readBases.length) - break; - - // get forward repeat unit and # repeats - byte[] forwardRepeatUnit = Arrays.copyOfRange(readBases, offset +1, offset+str+1); - maxFW = GATKVariantContextUtils.findNumberOfRepetitions(forwardRepeatUnit, Arrays.copyOfRange(readBases, offset + 1, readBases.length), true); - if (maxFW > 1) { - bestFWRepeatUnit = forwardRepeatUnit.clone(); - break; - } - } - // if FW repeat unit = BW repeat unit it means we're in the middle of a tandem repeat - add FW and BW components - if (Arrays.equals(bestFWRepeatUnit, bestBWRepeatUnit)) { - maxRL = maxBW + maxFW; - bestRepeatUnit = bestFWRepeatUnit; // arbitrary - } - else { - // tandem repeat starting forward from current offset. - // It could be the case that best BW unit was differnet from FW unit, but that BW still contains FW unit. - // For example, TTCTT(C) CCC - at (C) place, best BW unit is (TTC)2, best FW unit is (C)3. - // but correct representation at that place might be (C)4. 
- // Hence, if the FW and BW units don't match, check if BW unit can still be a part of FW unit and add - // representations to total - maxBW = GATKVariantContextUtils.findNumberOfRepetitions(bestFWRepeatUnit, Arrays.copyOfRange(readBases, 0, offset + 1), false); - maxRL = maxFW + maxBW; - bestRepeatUnit = bestFWRepeatUnit; - - } - - } - - - - if(maxRL > MAX_REPEAT_LENGTH) { maxRL = MAX_REPEAT_LENGTH; } - return new Pair(bestRepeatUnit, maxRL); - - } - @Override - public final Object getValue(final String str) { - return str; - } - - @Override - public synchronized String formatKey(final int key) { - // This method is synchronized so that we don't attempt to do a get() - // from the reverse lookup table while that table is being updated - return repeatReverseLookupTable.get(key); - } - - @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) - @Ensures("result != null") - protected abstract String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength); - - - @Override - public int keyFromValue(final Object value) { - return keyForRepeat((String) value); - } - - /** - * Get the mapping from read group names to integer key values for all read groups in this covariate - * @return a set of mappings from read group names -> integer key values - */ - public Set> getKeyMap() { - return repeatLookupTable.entrySet(); - } - - private int keyForRepeat(final String repeatID) { - // Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR), - // synchronize only the table updates. - - // Before entering the synchronized block, check to see if this read group is not in our tables. - // If it's not, either we will have to insert it, OR another thread will insert it first. - // This preliminary check avoids doing any synchronization most of the time. - if ( ! 
repeatLookupTable.containsKey(repeatID) ) { - - synchronized ( this ) { - - // Now we need to make sure the key is STILL not there, since another thread may have come along - // and inserted it while we were waiting to enter this synchronized block! - if ( ! repeatLookupTable.containsKey(repeatID) ) { - repeatLookupTable.put(repeatID, nextId); - repeatReverseLookupTable.put(nextId, repeatID); - nextId++; - } - } - } - - return repeatLookupTable.get(repeatID); - } - - - /** - * Splits repeat unit and num repetitions from covariate value. - * For example, if value if "ATG4" it returns (ATG,4) - * @param value Covariate value - * @return Split pair - */ - @Requires("value != null") - @Ensures({"result.first != null","result.second>=0"}) - public static Pair getRUandNRfromCovariate(final String value) { - - int k = 0; - for ( k=0; k < value.length(); k++ ) { - if (!BaseUtils.isRegularBase(value.getBytes()[k])) - break; - } - Integer nr = Integer.valueOf(value.substring(k,value.length())); // will throw NumberFormatException if format illegal - if (k == value.length() || nr <= 0) - throw new IllegalStateException("Covariate is not of form (Repeat Unit) + Integer"); - - return new Pair(value.substring(0,k), nr); - } - - /** - * Gets bases from tandem repeat representation (Repeat Unit),(Number of Repeats). 
- * For example, (AGC),3 returns AGCAGCAGC - * @param repeatUnit Tandem repeat unit - * @param numRepeats Number of repeats - * @return Expanded String - */ - @Requires({"numRepeats > 0","repeatUnit != null"}) - @Ensures("result != null") - public static String getBasesFromRUandNR(final String repeatUnit, final int numRepeats) { - final StringBuilder sb = new StringBuilder(); - - for (int i=0; i < numRepeats; i++) - sb.append(repeatUnit); - - return sb.toString(); - } - - // version given covariate key - public static String getBasesFromRUandNR(final String covariateValue) { - Pair pair = getRUandNRfromCovariate(covariateValue); - return getBasesFromRUandNR(pair.getFirst(), pair.getSecond()); - } - - @Override - public abstract int maximumKeyValue(); - - - -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatLengthCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatLengthCovariate.java deleted file mode 100644 index 398633062..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatLengthCovariate.java +++ /dev/null @@ -1,74 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -public class RepeatLengthCovariate extends RepeatCovariate { - - @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) - @Ensures("result != null") - protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { - return String.format("%d",repeatLength); - } - - @Override - public synchronized int maximumKeyValue() { - // Synchronized so that we don't query table size while the tables are being updated - //return repeatLookupTable.size() - 1; - // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, - // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values - return (1+MAX_REPEAT_LENGTH); - } - -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java deleted file mode 100644 index 345ef0d7d..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitAndLengthCovariate.java +++ /dev/null @@ -1,75 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - - -public class RepeatUnitAndLengthCovariate extends RepeatCovariate { - - @Requires({"repeatLength>=0", "repeatFromUnitAndLength != null"}) - @Ensures("result != null") - protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { - return new String(repeatFromUnitAndLength) + String.format("%d",repeatLength); - } - - @Override - public synchronized int maximumKeyValue() { - // Synchronized so that we don't query table size while the tables are being updated - //return repeatLookupTable.size() - 1; - // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, - // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values - return (1<<(2*MAX_STR_UNIT_LENGTH)) * MAX_REPEAT_LENGTH +1; - } - -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitCovariate.java deleted file mode 100644 index b1b0ca457..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RepeatUnitCovariate.java +++ /dev/null @@ -1,78 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -/** - * Created with IntelliJ IDEA. 
- * User: rpoplin - * Date: 11/3/12 - */ - -public class RepeatUnitCovariate extends RepeatCovariate { - - protected String getCovariateValueFromUnitAndLength(final byte[] repeatFromUnitAndLength, final int repeatLength) { - return new String(repeatFromUnitAndLength); - - } - - - @Override - public synchronized int maximumKeyValue() { - // Synchronized so that we don't query table size while the tables are being updated - //return repeatLookupTable.size() - 1; - // max possible values of covariate: for repeat unit, length is up to MAX_STR_UNIT_LENGTH, - // so we have 4^MAX_STR_UNIT_LENGTH * MAX_REPEAT_LENGTH possible values - return (1<<(2*MAX_STR_UNIT_LENGTH)) +1; - } - - -} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RequiredCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RequiredCovariate.java deleted file mode 100644 index e30df7dd2..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/RequiredCovariate.java +++ /dev/null @@ -1,81 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. 
-* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -/** - * [Short one sentence description of this walker] - *

- *

- * [Functionality of this walker] - *

- *

- *

Input

- *

- * [Input description] - *

- *

- *

Output

- *

- * [Output description] - *

- *

- *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T $WalkerName
- *  
- * - * @author Your Name - * @since Date created - */ -public interface RequiredCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/StandardCovariate.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/StandardCovariate.java deleted file mode 100644 index 4e40f7d49..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/recalibration/covariates/StandardCovariate.java +++ /dev/null @@ -1,81 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration.covariates; - -/** - * [Short one sentence description of this walker] - *

- *

- * [Functionality of this walker] - *

- *

- *

Input

- *

- * [Input description] - *

- *

- *

Output

- *

- * [Output description] - *

- *

- *

Examples

- *
- *    java
- *      -jar GenomeAnalysisTK.jar
- *      -T $WalkerName
- *  
- * - * @author Your Name - * @since Date created - */ -public interface StandardCovariate extends Covariate {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/variant/ReferenceConfidenceVariantContextMerger.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/variant/ReferenceConfidenceVariantContextMerger.java deleted file mode 100644 index 6676650c8..000000000 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/utils/variant/ReferenceConfidenceVariantContextMerger.java +++ /dev/null @@ -1,417 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.variant; - -import htsjdk.variant.variantcontext.*; -import htsjdk.variant.vcf.VCFConstants; -import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.*; - -/** - * Variant context utilities related to merging variant-context instances. - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class ReferenceConfidenceVariantContextMerger { - - private static Comparable combineAnnotationValues( final List array ) { - return MathUtils.median(array); // right now we take the median but other options could be explored - } - - /** - * Merges VariantContexts from gVCFs into a single hybrid. - * Assumes that none of the input records are filtered. - * - * @param VCs collection of unsorted genomic VCs - * @param loc the current location - * @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. 
don't start at the location in loc); if null, we'll return null in this case - * @param removeNonRefSymbolicAllele if true, remove the allele from the merged VC - * @return new VariantContext representing the merge of all VCs or null if it not relevant - */ - public static VariantContext merge(final List VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele) { - // this can happen if e.g. you are using a dbSNP file that spans a region with no gVCFs - if ( VCs == null || VCs.size() == 0 ) - return null; - - // establish the baseline info (sometimes from the first VC) - final VariantContext first = VCs.get(0); - final String name = first.getSource(); - - // ref allele - final Allele refAllele = determineReferenceAlleleGivenReferenceBase(VCs, loc, refBase); - if ( refAllele == null ) - return null; - - // FinalAlleleSet contains the alleles of the new resulting VC - // Using linked set in order to guarantee a stable order - final LinkedHashSet finalAlleleSet = new LinkedHashSet<>(10); - // Reference goes first - finalAlleleSet.add(refAllele); - - final Map attributes = new LinkedHashMap<>(); - final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id - int depth = 0; - final Map> annotationMap = new LinkedHashMap<>(); - final GenotypesContext genotypes = GenotypesContext.create(); - - final int variantContextCount = VCs.size(); - // In this list we hold the mapping of each variant context alleles. - final List>> vcAndNewAllelePairs = new ArrayList<>(variantContextCount); - // cycle through and add info from the other VCs - for ( final VariantContext vc : VCs ) { - - // if this context doesn't start at the current location then it must be a spanning event (deletion or ref block) - final boolean isSpanningEvent = loc.getStart() != vc.getStart(); - - vcAndNewAllelePairs.add(new Pair<>(vc,isSpanningEvent ? 
replaceWithNoCalls(vc.getAlleles()) - : remapAlleles(vc.getAlleles(), refAllele, finalAlleleSet))); - } - - // Add to the end if at all required in in the output. - if (!removeNonRefSymbolicAllele) finalAlleleSet.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - - final List allelesList = new ArrayList<>(finalAlleleSet); - - for ( final Pair> pair : vcAndNewAllelePairs ) { - final VariantContext vc = pair.getFirst(); - final List remappedAlleles = pair.getSecond(); - - mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, allelesList); - - // special case DP (add it up) for all events - if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) { - depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); - } else { // handle the gVCF case from the HaplotypeCaller - for( final Genotype gt : vc.getGenotypes() ) { - depth += (gt.hasExtendedAttribute("MIN_DP") ? Integer.parseInt((String)gt.getAnyAttribute("MIN_DP")) : (gt.hasDP() ? gt.getDP() : 0)); - } - } - - if ( loc.getStart() != vc.getStart() ) - continue; - - // special case ID (just preserve it) - if ( vc.hasID() ) rsIDs.add(vc.getID()); - - // add attributes - addReferenceConfidenceAttributes(vc.getAttributes(), annotationMap); - } - - // when combining annotations use the median value from all input VCs which had annotations provided - for ( final Map.Entry> p : annotationMap.entrySet() ) { - if ( ! p.getValue().isEmpty() ) { - attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); - } - } - - if ( depth > 0 ) - attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - - // remove stale AC and AF based attributes - removeStaleAttributesAfterMerge(attributes); - - final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); - - final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(allelesList) - .chr(loc.getContig()).start(loc.getStart()).computeEndFromAlleles(allelesList, loc.getStart(), loc.getStart()) - .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to re-genotype later - - return builder.make(); - } - - /** - * Determines the ref allele given the provided reference base at this position - * - * @param VCs collection of unsorted genomic VCs - * @param loc the current location - * @param refBase the reference allele to use if all contexts in the VC are spanning - * @return new Allele or null if no reference allele/base is available - */ - private static Allele determineReferenceAlleleGivenReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) { - final Allele refAllele = GATKVariantContextUtils.determineReferenceAllele(VCs, loc); - if ( refAllele == null ) - return ( refBase == null ? 
null : Allele.create(refBase, true) ); - return refAllele; - } - - /** - * Remove the stale attributes from the merged set - * - * @param attributes the attribute map - */ - private static void removeStaleAttributesAfterMerge(final Map attributes) { - attributes.remove(VCFConstants.ALLELE_COUNT_KEY); - attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY); - attributes.remove(VCFConstants.ALLELE_NUMBER_KEY); - attributes.remove(VCFConstants.MLE_ALLELE_COUNT_KEY); - attributes.remove(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); - attributes.remove(VCFConstants.END_KEY); - } - - /** - * Adds attributes to the global map from the new context in a sophisticated manner - * - * @param myAttributes attributes to add from - * @param annotationMap map of annotations for combining later - */ - private static void addReferenceConfidenceAttributes(final Map myAttributes, - final Map> annotationMap) { - for ( final Map.Entry p : myAttributes.entrySet() ) { - final String key = p.getKey(); - final Object value = p.getValue(); - - // add the annotation values to a list for combining later - List values = annotationMap.get(key); - if( values == null ) { - values = new ArrayList<>(); - annotationMap.put(key, values); - } - try { - final String stringValue = value.toString(); - // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. - if (stringValue.contains(".")) - values.add(Double.parseDouble(stringValue)); - else - values.add(Integer.parseInt(stringValue)); - } catch (final NumberFormatException e) { - // nothing to do - } - } - } - - /** - * This method does a couple of things: - *
  • - * remaps the vc alleles considering the differences between the final reference allele and its own reference,
  • - *
  • - * collects alternative alleles present in variant context and add them to the {@code finalAlleles} set. - *
- * - * @param vcAlleles the variant context allele list. - * @param refAllele final reference allele. - * @param finalAlleles where to add the final set of non-ref called alleles. - * @return never {@code null} - */ - //TODO as part of a larger refactoring effort {@link #remapAlleles} can be merged with {@link GATKVariantContextUtils#remapAlleles}. - private static List remapAlleles(final List vcAlleles, final Allele refAllele, final LinkedHashSet finalAlleles) { - final Allele vcRef = vcAlleles.get(0); - if (!vcRef.isReference()) throw new IllegalStateException("the first allele of the vc allele list must be reference"); - final byte[] refBases = refAllele.getBases(); - final int extraBaseCount = refBases.length - vcRef.getBases().length; - if (extraBaseCount < 0) throw new IllegalStateException("the wrong reference was selected"); - final List result = new ArrayList<>(vcAlleles.size()); - - for (final Allele a : vcAlleles) { - if (a.isReference()) { - result.add(refAllele); - } else if (a.isSymbolic()) { - result.add(a); - // we always skip when adding to finalAlleles this is done outside if applies. 
- if (!a.equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)) - finalAlleles.add(a); - } else if (a.isCalled()) { - final Allele newAllele; - if (extraBaseCount > 0) { - final byte[] oldBases = a.getBases(); - final byte[] newBases = Arrays.copyOf(oldBases,oldBases.length + extraBaseCount); - System.arraycopy(refBases,refBases.length - extraBaseCount,newBases,oldBases.length,extraBaseCount); - newAllele = Allele.create(newBases,false); - } else - newAllele = a; - result.add(newAllele); - finalAlleles.add(newAllele); - } else { // NO_CALL and strange miscellanea - result.add(a); - } - } - return result; - } - - /** - * Replaces any alleles in the list with NO CALLS, except for the generic ALT allele - * - * @param alleles list of alleles to replace - * @return non-null list of alleles - */ - private static List replaceWithNoCalls(final List alleles) { - if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null"); - - final List result = new ArrayList<>(alleles.size()); - for ( final Allele allele : alleles ) - result.add(allele.equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ? allele : Allele.NO_CALL); - return result; - } - - /** - * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles. - * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set. 
- * - * @param mergedGenotypes the genotypes context to add to - * @param VC the Variant Context for the sample - * @param remappedAlleles the list of remapped alleles for the sample - * @param targetAlleles the list of target alleles - */ - private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, - final VariantContext VC, - final List remappedAlleles, - final List targetAlleles) { - final int maximumPloidy = VC.getMaxPloidy(GATKVariantContextUtils.DEFAULT_PLOIDY); - // the map is different depending on the ploidy, so in order to keep this method flexible (mixed ploidies) - // we need to get a map done (lazily inside the loop) for each ploidy, up to the maximum possible. - final int[][] genotypeIndexMapsByPloidy = new int[maximumPloidy + 1][]; - final int maximumAlleleCount = Math.max(remappedAlleles.size(),targetAlleles.size()); - final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles, VC.getStart()); - - for ( final Genotype g : VC.getGenotypes() ) { - final String name = g.getSampleName(); - if ( mergedGenotypes.containsSample(name) ) - continue; - final int ploidy = g.getPloidy(); - final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g).alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())); - if (g.hasPL()) { - // lazy initialization of the genotype index map by ploidy. - final int[] genotypeIndexMapByPloidy = genotypeIndexMapsByPloidy[ploidy] == null - ? GenotypeLikelihoodCalculators.getInstance(ploidy, maximumAlleleCount).genotypeIndexMap(indexesOfRelevantAlleles) - : genotypeIndexMapsByPloidy[ploidy]; - final int[] PLs = generatePL(g, genotypeIndexMapByPloidy); - final int[] AD = g.hasAD() ? generateAD(g.getAD(), indexesOfRelevantAlleles) : null; - genotypeBuilder.PL(PLs).AD(AD).noGQ(); - } - mergedGenotypes.add(genotypeBuilder.make()); - } - } - - /** - * Composes a new likelihood array given the original genotype and the genotype index map. 
- * - * @param g the original genotype. - * @param genotypeIndexMapByPloidy genotype index map. The ith element indicates what genotype in {@code g} corresponds - * to the ith genotype in the return likelihoods array. - * - * @throws NullPointerException if {@code g} or {@code genotypeIndexMapByPloidy} is {@code null}, or if {@code g} - * does not contain likelihoods. - * @throws IndexOutOfBoundsException if {@code genotypeIndexMapByPloidy} contain non valid - * genotype indices given the likelihood array in {@code g}. - * - * @return never {@code null} but an array of exactly {@code genotypeIndexMapByPloidy.length} positions. - */ - private static int[] generatePL(final Genotype g, final int[] genotypeIndexMapByPloidy) { - final int[] PLs = new int[genotypeIndexMapByPloidy.length]; - final int[] oldPLs = g.getPL(); - for (int i = 0; i < PLs.length; i++) - PLs[i] = oldPLs[genotypeIndexMapByPloidy[i]]; - return PLs; - } - - /** - * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "" as appropriate. - * If the myAlleles set does not contain "" as an allele, it throws an exception. 
- * - * @param remappedAlleles the list of alleles to evaluate - * @param targetAlleles the target list of alleles - * @param position position to use for error messages - * @return non-null array of ints representing indexes - */ - protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles, final int position) { - - if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); - if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); - - if ( !remappedAlleles.contains(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE) ) - throw new UserException("The list of input alleles must contain " + GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE + " as an allele but that is not the case at position " + position + "; please use the Haplotype Caller with gVCF output to generate appropriate records"); - final int indexOfGenericAlt = remappedAlleles.indexOf(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - - final int[] indexMapping = new int[targetAlleles.size()]; - - // the reference alleles always match up (even if they don't appear to) - indexMapping[0] = 0; - - // create the index mapping, using the allele whenever such a mapping doesn't exist - for ( int i = 1; i < targetAlleles.size(); i++ ) { - final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); - indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt : indexOfRemappedAllele; - } - - return indexMapping; - } - - /** - * Generates a new AD array by adding zeros for missing alleles given the set of indexes of the Genotype's current - * alleles from the original AD. 
- * - * @param originalAD the original AD to extend - * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles - * @return non-null array of new AD values - */ - protected static int[] generateAD(final int[] originalAD, final int[] indexesOfRelevantAlleles) { - if ( originalAD == null || indexesOfRelevantAlleles == null ) throw new IllegalArgumentException("The list of input AD values and alleles must not be null"); - - final int numADs = indexesOfRelevantAlleles.length; - final int[] newAD = new int[numADs]; - - for ( int i = 0; i < numADs; i++ ) { - final int oldIndex = indexesOfRelevantAlleles[i]; - if ( oldIndex >= originalAD.length ) - newAD[i] = 0; - else - newAD[i] = originalAD[oldIndex]; - } - - return newAD; - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ContextCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ContextCovariateUnitTest.java new file mode 100644 index 000000000..25748f70e --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ContextCovariateUnitTest.java @@ -0,0 +1,121 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.ContextCovariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.utils.clipping.ClippingRepresentation; +import org.broadinstitute.gatk.utils.clipping.ReadClipper; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ContextCovariateUnitTest { + ContextCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ContextCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSimpleContexts() { + GATKSAMRecord read = ReadUtils.createRandomRead(1000); + GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + + verifyCovariateArray(readCovariates.getMismatchesKeySet(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(readCovariates.getInsertionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); + verifyCovariateArray(readCovariates.getDeletionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); + } + + public static void verifyCovariateArray(int[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { + for (int i = 0; i < values.length; i++) + Assert.assertEquals(contextCovariate.formatKey(values[i][0]), expectedContext(read, i, 
contextSize)); + + } + + public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) { + final String bases = stringFrom(read.getReadBases()); + String expectedContext = null; + if (offset - contextSize + 1 >= 0) { + String context = bases.substring(offset - contextSize + 1, offset + 1); + if (!context.contains("N")) + expectedContext = context; + } + return expectedContext; + } + + private static String stringFrom(byte[] array) { + String s = ""; + for (byte value : array) + s += (char) value; + return s; + } + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/CycleCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/CycleCovariateUnitTest.java new file mode 100644 index 000000000..f40152e94 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/CycleCovariateUnitTest.java @@ -0,0 +1,140 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.engine.recalibration.covariates.CycleCovariate; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class CycleCovariateUnitTest { + CycleCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new CycleCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSimpleCycles() { + short readLength = 10; + GATKSAMRecord read = 
ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), 1, (short) 1); + + read.setReadNegativeStrandFlag(true); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), readLength, -1); + + read.setSecondOfPairFlag(true); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), -readLength, 1); + + read.setReadNegativeStrandFlag(false); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), -1, -1); + } + + private void verifyCovariateArray(int[][] values, int init, int increment) { + for (short i = 0; i < values.length; i++) { + short actual = Short.decode(covariate.formatKey(values[i][0])); + int expected = init + (increment * i); + Assert.assertEquals(actual, expected); + } + } + + @Test(enabled = true, expectedExceptions={UserException.class}) + public void testMoreThanMaxCycleFails() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } + + @Test(enabled = true) + public void testMaxCyclePasses() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new 
ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizerUnitTest.java new file mode 100644 index 000000000..b8d5c5303 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/QualQuantizerUnitTest.java @@ -0,0 +1,195 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + + +// the imports for unit testing. + + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.Utils; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class QualQuantizerUnitTest extends BaseTest { + @BeforeSuite + public void before() { + + } + + // -------------------------------------------------------------------------------- + // + // merge case Provider + // + // -------------------------------------------------------------------------------- + + private class QualIntervalTestProvider extends TestDataProvider { + final QualQuantizer.QualInterval left, right; + int exError, exTotal, exQual; + double exErrorRate; + + private QualIntervalTestProvider(int leftE, int leftN, int rightE, int rightN, int exError, int exTotal) { + super(QualIntervalTestProvider.class); + + QualQuantizer qq = new QualQuantizer(0); + left = qq.new QualInterval(10, 10, leftN, leftE, 0); + right = qq.new QualInterval(11, 11, rightN, rightE, 0); + + this.exError = 
exError; + this.exTotal = exTotal; + this.exErrorRate = (leftE + rightE + 1) / (1.0 * (leftN + rightN + 1)); + this.exQual = QualityUtils.errorProbToQual(this.exErrorRate); + } + } + + @DataProvider(name = "QualIntervalTestProvider") + public Object[][] makeQualIntervalTestProvider() { + new QualIntervalTestProvider(10, 100, 10, 1000, 20, 1100); + new QualIntervalTestProvider(0, 100, 10, 900, 10, 1000); + new QualIntervalTestProvider(10, 900, 0, 100, 10, 1000); + new QualIntervalTestProvider(0, 0, 10, 100, 10, 100); + new QualIntervalTestProvider(1, 10, 9, 90, 10, 100); + new QualIntervalTestProvider(1, 10, 9, 100000, 10, 100010); + new QualIntervalTestProvider(1, 10, 9, 1000000, 10,1000010); + + return QualIntervalTestProvider.getTests(QualIntervalTestProvider.class); + } + + @Test(dataProvider = "QualIntervalTestProvider") + public void testQualInterval(QualIntervalTestProvider cfg) { + QualQuantizer.QualInterval merged = cfg.left.merge(cfg.right); + Assert.assertEquals(merged.nErrors, cfg.exError); + Assert.assertEquals(merged.nObservations, cfg.exTotal); + Assert.assertEquals(merged.getErrorRate(), cfg.exErrorRate); + Assert.assertEquals(merged.getQual(), cfg.exQual); + } + + @Test + public void testMinInterestingQual() { + for ( int q = 0; q < 15; q++ ) { + for ( int minQual = 0; minQual <= 10; minQual ++ ) { + QualQuantizer qq = new QualQuantizer(minQual); + QualQuantizer.QualInterval left = qq.new QualInterval(q, q, 100, 10, 0); + QualQuantizer.QualInterval right = qq.new QualInterval(q+1, q+1, 1000, 100, 0); + + QualQuantizer.QualInterval merged = left.merge(right); + boolean shouldBeFree = q+1 <= minQual; + if ( shouldBeFree ) + Assert.assertEquals(merged.getPenalty(), 0.0); + else + Assert.assertTrue(merged.getPenalty() > 0.0); + } + } + } + + + // -------------------------------------------------------------------------------- + // + // High-level case Provider + // + // -------------------------------------------------------------------------------- + + 
private class QuantizerTestProvider extends TestDataProvider { + final List nObservationsPerQual = new ArrayList(); + final int nLevels; + final List expectedMap; + + private QuantizerTestProvider(final List nObservationsPerQual, final int nLevels, final List expectedMap) { + super(QuantizerTestProvider.class); + + for ( int x : nObservationsPerQual ) + this.nObservationsPerQual.add((long)x); + this.nLevels = nLevels; + this.expectedMap = expectedMap; + } + + @Override + public String toString() { + return String.format("QQTest nLevels=%d nObs=[%s] map=[%s]", + nLevels, Utils.join(",", nObservationsPerQual), Utils.join(",", expectedMap)); + } + } + + @DataProvider(name = "QuantizerTestProvider") + public Object[][] makeQuantizerTestProvider() { + List allQ2 = Arrays.asList(0, 0, 1000, 0, 0); + + new QuantizerTestProvider(allQ2, 5, Arrays.asList(0, 1, 2, 3, 4)); + new QuantizerTestProvider(allQ2, 1, Arrays.asList(2, 2, 2, 2, 2)); + + new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 0, 1000), 2, Arrays.asList(2, 2, 2, 2, 4)); + new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 1, 1000), 2, Arrays.asList(2, 2, 2, 4, 4)); + new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 10, 1000), 2, Arrays.asList(2, 2, 2, 2, 4)); + + return QuantizerTestProvider.getTests(QuantizerTestProvider.class); + } + + @Test(dataProvider = "QuantizerTestProvider", enabled = true) + public void testQuantizer(QuantizerTestProvider cfg) { + QualQuantizer qq = new QualQuantizer(cfg.nObservationsPerQual, cfg.nLevels, 0); + logger.warn("cfg: " + cfg); + for ( int i = 0; i < cfg.expectedMap.size(); i++) { + int expected = cfg.expectedMap.get(i); + int observed = qq.originalToQuantizedMap.get(i); + //logger.warn(String.format(" qq map: %s : %d => %d", i, expected, observed)); + Assert.assertEquals(observed, expected); + } + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariatesUnitTest.java 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariatesUnitTest.java new file mode 100644 index 000000000..f263345e7 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadCovariatesUnitTest.java @@ -0,0 +1,148 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.*; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.Random; + +/** + * @author carneiro + * @since 4/21/12 + */ +public class ReadCovariatesUnitTest { + + @BeforeMethod + public void init() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = false) + public void testCovariateGeneration() { + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + final String RGID = "id"; + + ReadGroupCovariate rgCov = new ReadGroupCovariate(); + QualityScoreCovariate qsCov = new QualityScoreCovariate(); + ContextCovariate coCov = new ContextCovariate(); + CycleCovariate cyCov = new CycleCovariate(); + + rgCov.initialize(RAC); + qsCov.initialize(RAC); + coCov.initialize(RAC); + cyCov.initialize(RAC); + + Covariate[] requestedCovariates = new Covariate[4]; + requestedCovariates[0] 
= rgCov; + requestedCovariates[1] = qsCov; + requestedCovariates[2] = coCov; + requestedCovariates[3] = cyCov; + + final int NUM_READS = 100; + final Random rnd = Utils.getRandomGenerator(); + + final String[] readGroups = {"RG1", "RG2", "RGbla"}; + for (int idx = 0; idx < NUM_READS; idx++) { + for (final String rgs : readGroups) { + final int length = 10 + rnd.nextInt(100); // random read length, at least 10 bp long + final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(rgs); + rg.setPlatform("illumina"); + read.setReadGroup(rg); + read.setReadNegativeStrandFlag(rnd.nextBoolean()); + final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); + final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION); + final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION); + ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, length); + Assert.assertEquals(rc.getInsertionsKeySet().length, length); + Assert.assertEquals(rc.getDeletionsKeySet().length, length); + + for (int i = 0; i < length; i++) { + // check that read group is always the same + Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), rgs); + Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), rgs); + Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), rgs); + + // check quality score + Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]); + Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]); + Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]); + + // check context + Assert.assertEquals(coCov.formatKey(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE)); + 
Assert.assertEquals(coCov.formatKey(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); + Assert.assertEquals(coCov.formatKey(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); + + // check cycle + Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i+1)); + Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i+1)); + } + + } + + } + + } + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadGroupCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadGroupCovariateUnitTest.java new file mode 100644 index 000000000..34548aee3 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/ReadGroupCovariateUnitTest.java @@ -0,0 +1,125 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. 
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.ReadGroupCovariate; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +/** + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ReadGroupCovariateUnitTest { + ReadGroupCovariate covariate; + RecalibrationArgumentCollection RAC; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ReadGroupCovariate(); + covariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + @Test(enabled = true) + public void testSingleRecord() { + final String expected = "SAMPLE.1"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); + rg.setPlatformUnit(expected); + runTest(rg, expected, covariate); + } + + @Test(enabled = true) + public void testMissingPlatformUnit() { + final String expected = "MY.7"; + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected); + runTest(rg, expected, covariate); + } + + @Test(enabled = true) + public void testForceReadgroup() { + final RecalibrationArgumentCollection forcedRAC = new RecalibrationArgumentCollection(); + forcedRAC.FORCE_READGROUP = "FOO"; + final ReadGroupCovariate forcedCovariate = new ReadGroupCovariate(); + forcedCovariate.initialize(forcedRAC); + + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("NOT_FOO"); + runTest(rg, "FOO", forcedCovariate); + } + + private static void runTest(final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) { + GATKSAMRecord read = ReadUtils.createRandomRead(10); + read.setReadGroup(rg); + ReadCovariates 
readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected, covariate); + + } + + private static void verifyCovariateArray(final int[][] values, final String expected, final ReadGroupCovariate covariate) { + for (int[] value : values) { + String actual = covariate.formatKey(value[0]); + Assert.assertEquals(actual, expected); + } + } + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumUnitTest.java new file mode 100644 index 000000000..3c9048fae --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalDatumUnitTest.java @@ -0,0 +1,313 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + + +// the imports for unit testing. 
+ + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; + + +public class RecalDatumUnitTest extends BaseTest { + + // -------------------------------------------------------------------------------- + // + // merge case Provider + // + // -------------------------------------------------------------------------------- + + private class RecalDatumTestProvider extends TestDataProvider { + int exError, exTotal, reportedQual; + + private RecalDatumTestProvider(int E, int N, int reportedQual) { + super(RecalDatumTestProvider.class); + + this.exError = E; + this.exTotal = N; + this.reportedQual = reportedQual; + } + + public double getErrorRate() { + return (exError + 1) / (1.0 * (exTotal + 2)); + } + + public double getErrorRatePhredScaled() { + return QualityUtils.phredScaleErrorRate(getErrorRate()); + } + + public int getReportedQual() { + return reportedQual; + } + + public RecalDatum makeRecalDatum() { + return new RecalDatum((long)exTotal, (double)exError, (byte)getReportedQual()); + } + + @Override + public String toString() { + return String.format("exError=%d, exTotal=%d, reportedQual=%d", exError, exTotal, reportedQual); + } + } + + private static boolean createdDatumTestProviders = false; + + @DataProvider(name = "RecalDatumTestProvider") + public Object[][] makeRecalDatumTestProvider() { + if ( !createdDatumTestProviders ) { + for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) ) + for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) ) + for ( int reportedQual : Arrays.asList(10, 20) ) + if ( E <= N ) + new RecalDatumTestProvider(E, N, reportedQual); + createdDatumTestProviders = true; + } + + return RecalDatumTestProvider.getTests(RecalDatumTestProvider.class); + } + + 
@Test(dataProvider = "RecalDatumTestProvider") + public void testRecalDatumBasics(RecalDatumTestProvider cfg) { + final RecalDatum datum = cfg.makeRecalDatum(); + assertBasicFeaturesOfRecalDatum(datum, cfg); + } + + private static void assertBasicFeaturesOfRecalDatum(final RecalDatum datum, final RecalDatumTestProvider cfg) { + Assert.assertEquals(datum.getNumMismatches(), cfg.exError, 1E-6); + Assert.assertEquals(datum.getNumObservations(), cfg.exTotal, 1E-6); + if ( cfg.getReportedQual() != -1 ) + Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual()); + assertEqualsDoubleSmart(datum.getEmpiricalErrorRate(), cfg.getErrorRate()); + + final double e = datum.getEmpiricalQuality(); + Assert.assertTrue(datum.getEmpiricalQualityAsByte() >= Math.floor(e)); + Assert.assertTrue(datum.getEmpiricalQualityAsByte() <= Math.ceil(e)); + Assert.assertNotNull(datum.toString()); + } + + @Test(dataProvider = "RecalDatumTestProvider") + public void testRecalDatumCopyAndCombine(RecalDatumTestProvider cfg) { + final RecalDatum datum = cfg.makeRecalDatum(); + final RecalDatum copy = new RecalDatum(datum); + assertBasicFeaturesOfRecalDatum(copy, cfg); + + RecalDatumTestProvider combinedCfg = new RecalDatumTestProvider(cfg.exError * 2, cfg.exTotal * 2, cfg.reportedQual); + copy.combine(datum); + assertBasicFeaturesOfRecalDatum(copy, combinedCfg); + } + + @Test(dataProvider = "RecalDatumTestProvider") + public void testRecalDatumModification(RecalDatumTestProvider cfg) { + RecalDatum datum = cfg.makeRecalDatum(); + datum.setEmpiricalQuality(10.1); + Assert.assertEquals(datum.getEmpiricalQuality(), 10.1); + + datum.setEstimatedQReported(10.1); + Assert.assertEquals(datum.getEstimatedQReported(), 10.1); + Assert.assertEquals(datum.getEstimatedQReportedAsByte(), 10); + + datum = cfg.makeRecalDatum(); + cfg.exTotal = 100000; + datum.setNumObservations(cfg.exTotal); + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + cfg.exError = 1000; + 
datum.setNumMismatches(cfg.exError); + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + datum.increment(true); + cfg.exError++; + cfg.exTotal++; + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + datum.increment(false); + cfg.exTotal++; + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + datum.incrementNumObservations(2); + cfg.exTotal += 2; + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + datum.incrementNumMismatches(2); + cfg.exError += 2; + assertBasicFeaturesOfRecalDatum(datum, cfg); + + + datum = cfg.makeRecalDatum(); + datum.increment(10, 5); + cfg.exError += 5; + cfg.exTotal += 10; + assertBasicFeaturesOfRecalDatum(datum, cfg); + } + + @Test + public void testNoObs() { + final RecalDatum rd = new RecalDatum(0L, 0.0, (byte)10); + Assert.assertEquals(rd.getEmpiricalErrorRate(), 0.0); + } + + @Test + public void testlog10QempPrior() { + for ( int Qemp = 0; Qemp <= QualityUtils.MAX_SAM_QUAL_SCORE; Qemp++ ) { + for ( int Qrep = 0; Qrep <= QualityUtils.MAX_SAM_QUAL_SCORE; Qrep++ ) { + final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); + Assert.assertTrue(log10prior < 0.0); + Assert.assertFalse(Double.isInfinite(log10prior)); + Assert.assertFalse(Double.isNaN(log10prior)); + } + } + + final int Qrep = 20; + int maxQemp = -1; + double maxQempValue = -Double.MAX_VALUE; + for ( int Qemp = 0; Qemp <= QualityUtils.MAX_SAM_QUAL_SCORE; Qemp++ ) { + final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); + if ( log10prior > maxQempValue ) { + maxQemp = Qemp; + maxQempValue = log10prior; + } + } + Assert.assertEquals(maxQemp, Qrep); + } + + @Test + public void testBayesianEstimateOfEmpiricalQuality() { + + final int Qrep = 20; + + // test no shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(0, 0, Qrep), (double)Qrep); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 0, Qrep), 
(double)Qrep); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 10, Qrep), (double)Qrep); + + // test small shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 10, Qrep), Qrep - 1.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 0, Qrep), Qrep + 1.0); + + // test medium shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 0, Qrep), Qrep + 3.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 10, Qrep), Qrep + 3.0); + + // test large shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(100000, 10, Qrep), Qrep + 8.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000000, 10, Qrep), Qrep + 16.0); + } + + @Test + public void testlog10QempLikelihood() { + + final double[] Qemps = new double[] { 0.0, 10.0, 20.0, 30.0 }; + final int[] observations = new int[] {0, 10, 1000, 1000000}; + final int[] errors = new int[] {0, 10, 1000, 1000000}; + + for ( double Qemp : Qemps ) { + for ( int observation : observations ) { + for ( int error : errors ) { + if ( error > observation ) + continue; + + final double log10likelihood = RecalDatum.log10QempLikelihood(Qemp, observation, error); + Assert.assertTrue(observation == 0 ? 
MathUtils.compareDoubles(log10likelihood, 0.0) == 0 : log10likelihood < 0.0); + Assert.assertFalse(Double.isInfinite(log10likelihood)); + Assert.assertFalse(Double.isNaN(log10likelihood)); + } + } + } + + long bigNum = new Long((long)Integer.MAX_VALUE); + bigNum *= 2L; + final double log10likelihood = RecalDatum.log10QempLikelihood(30, bigNum, 100000); + Assert.assertTrue(log10likelihood < 0.0); + Assert.assertFalse(Double.isInfinite(log10likelihood)); + Assert.assertFalse(Double.isNaN(log10likelihood)); + } + + @Test + public void basicHierarchicalBayesianQualityEstimateTest() { + + for( double epsilon = 15.0; epsilon <= 60.0; epsilon += 2.0 ) { + double RG_Q = 45.0; + RecalDatum RG = new RecalDatum( (long)100000000, (long) (100000000 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); + double Q = 30.0; + RecalDatum QS = new RecalDatum( (long)100000000, (long) (100000000 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); + RecalDatum COV = new RecalDatum( (long)15, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality + + // initial epsilon condition shouldn't matter when there are a lot of observations + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( epsilon, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); + } + + for( double epsilon = 15.0; epsilon <= 60.0; epsilon += 2.0 ) { + double RG_Q = 45.0; + RecalDatum RG = new RecalDatum( (long)10, (long) (10 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); + double Q = 30.0; + RecalDatum QS = new RecalDatum( (long)10, (long) (10 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); + RecalDatum COV = new RecalDatum( (long)15, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality + + // initial epsilon condition dominates when there is no data + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( epsilon, RG, QS, Collections.singletonList(COV)), epsilon, 1E-4 ); + } + + } +} \ No newline 
at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalUtilsUnitTest.java new file mode 100644 index 000000000..0e95122da --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalUtilsUnitTest.java @@ -0,0 +1,178 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +public final class RecalUtilsUnitTest extends BaseTest { + private class Row { + int rg, qual, ne, no; + + private Row(final Row copy) { + this(copy.rg, copy.qual, copy.ne, copy.no); + } + + private Row(int rg, int qual, int ne, int no) { + this.rg = rg; + this.qual = qual; + this.ne = ne; + this.no = no; + } + + @Override + public String toString() { + return "Row{" + + "" + rg + + ", " + qual + + ", " + ne + + ", " + no + + '}'; + } + } + + @DataProvider(name = "CombineTablesProvider") + public Object[][] createCombineTablesProvider() { + List tests = new ArrayList(); + + final List rows = new ArrayList(); + for ( final int rg : Arrays.asList(0, 1) ) { + for ( final int qual : Arrays.asList(0, 1) ) { + rows.add(new Row(rg, qual, 1, 10)); + } + } + + logger.warn("Number of rows " + rows.size()); + + List> permutations = new LinkedList>(); + 
permutations.addAll(Utils.makePermutations(rows, 1, false)); + permutations.addAll(Utils.makePermutations(rows, 2, false)); + permutations.addAll(Utils.makePermutations(rows, 3, false)); + + // adding 1 row to 2 + for ( final List table1 : permutations ) { + for ( final Row table2 : rows ) { + tests.add(new Object[]{table1, Arrays.asList(table2)}); + } + } + + // adding 2 rows to 1 + for ( final List table1 : permutations ) { + for ( final Row table2 : rows ) { + tests.add(new Object[]{Arrays.asList(table2), table1}); + } + } + + for ( final List table1 : permutations ) { + for ( final List table2 : permutations ) { + tests.add(new Object[]{table1, table2}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CombineTablesProvider") + public void testCombineTables(final List table1, final List table2) { + final NestedIntegerArray nia1 = makeTable(table1); + final NestedIntegerArray nia2 = makeTable(table2); + final List expectedRows = makeExpected(table1, table2); + final NestedIntegerArray expected = makeTable(expectedRows); + RecalUtils.combineTables(nia1, nia2); + + Assert.assertEquals(nia1.getDimensions(), expected.getDimensions()); + Assert.assertEquals(nia1.getAllValues().size(), expected.getAllValues().size()); + + for ( final NestedIntegerArray.Leaf leaf : expected.getAllLeaves() ) { + final RecalDatum actual = nia1.get(leaf.keys); + Assert.assertEquals(actual.getNumMismatches(), leaf.value.getNumMismatches()); + Assert.assertEquals(actual.getNumObservations(), leaf.value.getNumObservations()); + } + } + + public List makeExpected(final List table1, final List table2) { + final List combined = new LinkedList(); + for ( final Row t1 : table1 ) combined.add(new Row(t1)); + for ( final Row t2 : table2 ) { + combine(combined, t2); + } + return combined; + } + + private void combine(final List combined, final Row row) { + for ( final Row c : combined ) { + if ( c.rg == row.rg && c.qual == row.qual ) { + c.ne += row.ne; + c.no += 
row.no; + return; + } + } + + combined.add(new Row(row)); + } + + public NestedIntegerArray makeTable(final List rows) { + final NestedIntegerArray x = new NestedIntegerArray(3, 3); + for ( final Row r : rows ) + x.put(new RecalDatum((long)r.no, (double)r.ne, (byte)10), r.rg, r.qual); + return x; + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReportUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReportUnitTest.java new file mode 100644 index 000000000..d16f718be --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationReportUnitTest.java @@ -0,0 +1,176 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. 
+* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.*; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * @author carneiro + * @since 4/21/12 + */ +public class RecalibrationReportUnitTest { + @BeforeMethod + public void init() { + ReadCovariates.clearKeysCache(); + } + + private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { + final Random random = new Random(); + final int nObservations = random.nextInt(maxObservations); + final int nErrors = Math.min(random.nextInt(maxErrors), nObservations); + final int qual = random.nextInt(QualityUtils.MAX_SAM_QUAL_SCORE); + return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual); + } + + @Test + public void testOutput() { + final int length = 100; + + List quals = new 
ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); + List counts = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); + + for (int i = 0; i<= QualityUtils.MAX_SAM_QUAL_SCORE; i++) { + quals.add((byte) i); + counts.add(1L); + } + + final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts); + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + + quantizationInfo.noQuantization(); + final List requiredCovariates = new LinkedList(); + final List optionalCovariates = new LinkedList(); + + final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); + rgCovariate.initialize(RAC); + requiredCovariates.add(rgCovariate); + + final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); + qsCovariate.initialize(RAC); + requiredCovariates.add(qsCovariate); + + final ContextCovariate cxCovariate = new ContextCovariate(); + cxCovariate.initialize(RAC); + optionalCovariates.add(cxCovariate); + final CycleCovariate cyCovariate = new CycleCovariate(); + cyCovariate.initialize(RAC); + optionalCovariates.add(cyCovariate); + + final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; + int covariateIndex = 0; + for (final Covariate cov : requiredCovariates) + requestedCovariates[covariateIndex++] = cov; + for (final Covariate cov : optionalCovariates) + requestedCovariates[covariateIndex++] = cov; + + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id"); + rg.setPlatform("illumina"); + final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); + read.setReadGroup(rg); + final byte [] readQuals = new byte[length]; + for (int i = 0; i < length; i++) + readQuals[i] = 20; + read.setBaseQualities(readQuals); + + final int expectedKeys = expectedNumberOfKeys(length, RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); + int nKeys = 0; // keep track of how many keys were produced + final ReadCovariates rc = RecalUtils.computeCovariates(read, 
requestedCovariates); + + final RecalibrationTables recalibrationTables = new RecalibrationTables(requestedCovariates); + final NestedIntegerArray rgTable = recalibrationTables.getReadGroupTable(); + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + + for (int offset = 0; offset < length; offset++) { + + for (EventType errorMode : EventType.values()) { + + final int[] covariates = rc.getKeySet(offset, errorMode); + final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; + + rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.ordinal()); + qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.ordinal()); + nKeys += 2; + for (int j = 0; j < optionalCovariates.size(); j++) { + final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j); + final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j]; + if ( covValue >= 0 ) { + covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); + nKeys++; + } + } + } + } + Assert.assertEquals(nKeys, expectedKeys); + } + + private static int expectedNumberOfKeys (int readLength, int indelContextSize, int mismatchesContextSize) { + final int numCovariates = 4; + final int numTables = 3; + final int mismatchContextPadding = mismatchesContextSize - 1; + final int indelContextPadding = 2 * (indelContextSize - 1); + final int indelCyclePadding = 2 * (2 * CycleCovariate.CUSHION_FOR_INDELS); + + return (numCovariates * numTables * readLength) - mismatchContextPadding - indelContextPadding - indelCyclePadding; + } + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTablesUnitTest.java 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTablesUnitTest.java new file mode 100644 index 000000000..f40ef2602 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTablesUnitTest.java @@ -0,0 +1,203 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; +import org.broadinstitute.gatk.utils.recalibration.EventType; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.List; + +public final class RecalibrationTablesUnitTest extends BaseTest { + private RecalibrationTables tables; + private Covariate[] covariates; + private int numReadGroups = 6; + final byte qualByte = 1; + final List combineStates = Arrays.asList(0, 1, 2); + + @BeforeMethod + private void makeTables() { + covariates = RecalibrationTestUtils.makeInitializedStandardCovariates(); + tables = new RecalibrationTables(covariates, numReadGroups); + fillTable(tables); + } + + private void fillTable(final RecalibrationTables tables) { + for ( int iterations = 0; iterations < 10; iterations++ ) { + for ( final EventType et : EventType.values() ) { + for ( final int rg : combineStates) { + final double error = rg % 2 == 0 ? 
1 : 0; + RecalUtils.incrementDatumOrPutIfNecessary(tables.getReadGroupTable(), qualByte, error, rg, et.ordinal()); + for ( final int qual : combineStates) { + RecalUtils.incrementDatumOrPutIfNecessary(tables.getQualityScoreTable(), qualByte, error, rg, qual, et.ordinal()); + for ( final int cycle : combineStates) + RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(2), qualByte, error, rg, qual, cycle, et.ordinal()); + for ( final int context : combineStates) + RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(3), qualByte, error, rg, qual, context, et.ordinal()); + } + } + } + } + } + + @Test + public void basicTest() { + final Covariate qualCov = covariates[1]; + final Covariate cycleCov = covariates[2]; + final Covariate contextCov = covariates[3]; + + Assert.assertEquals(tables.numTables(), covariates.length); + + Assert.assertNotNull(tables.getReadGroupTable()); + Assert.assertEquals(tables.getReadGroupTable(), tables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal())); + testDimensions(tables.getReadGroupTable(), numReadGroups); + + Assert.assertNotNull(tables.getQualityScoreTable()); + Assert.assertEquals(tables.getQualityScoreTable(), tables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal())); + testDimensions(tables.getQualityScoreTable(), numReadGroups, qualCov.maximumKeyValue() + 1); + + Assert.assertNotNull(tables.getTable(2)); + testDimensions(tables.getTable(2), numReadGroups, qualCov.maximumKeyValue() + 1, cycleCov.maximumKeyValue() + 1); + + Assert.assertNotNull(tables.getTable(3)); + testDimensions(tables.getTable(3), numReadGroups, qualCov.maximumKeyValue() + 1, contextCov.maximumKeyValue() + 1); + } + + private void testDimensions(final NestedIntegerArray table, final int ... 
dimensions) { + final int[] dim = new int[dimensions.length+1]; + System.arraycopy(dimensions, 0, dim, 0, dimensions.length); + dim[dimensions.length] = EventType.values().length; + Assert.assertEquals(table.getDimensions().length, dim.length); + + for ( int i = 0; i < dim.length; i++ ) { + Assert.assertEquals(table.getDimensions()[i], dim[i], "Table dimensions not expected at dim " + i); + } + } + + @Test + public void basicMakeQualityScoreTable() { + final Covariate qualCov = covariates[1]; + final NestedIntegerArray copy = tables.makeQualityScoreTable(); + testDimensions(copy, numReadGroups, qualCov.maximumKeyValue()+1); + Assert.assertEquals(copy.getAllValues().size(), 0); + } + + @Test + public void testCombine1() { + final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); + fillTable(merged); + + merged.combine(tables); + + for ( int i = 0; i < tables.numTables(); i++ ) { + NestedIntegerArray table = tables.getTable(i); + NestedIntegerArray mergedTable = merged.getTable(i); + + Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); + for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { + final RecalDatum mergedValue = mergedTable.get(leaf.keys); + Assert.assertNotNull(mergedValue); + Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations() * 2); + Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches() * 2); + } + } + } + + @Test + public void testCombineEmptyOther() { + final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); + + merged.combine(tables); + + for ( int i = 0; i < tables.numTables(); i++ ) { + NestedIntegerArray table = tables.getTable(i); + NestedIntegerArray mergedTable = merged.getTable(i); + + Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); + for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { + final RecalDatum mergedValue = 
mergedTable.get(leaf.keys); + Assert.assertNotNull(mergedValue); + Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations()); + Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches()); + } + } + } + + @Test + public void testCombinePartial() { + final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); + for ( final int rg : combineStates) { + RecalUtils.incrementDatumOrPutIfNecessary(merged.getTable(3), qualByte, 1, rg, 0, 0, 0); + } + + merged.combine(tables); + for ( int i = 0; i < tables.numTables(); i++ ) { + NestedIntegerArray table = tables.getTable(i); + NestedIntegerArray mergedTable = merged.getTable(i); + + Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); + for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { + final RecalDatum mergedValue = mergedTable.get(leaf.keys); + Assert.assertNotNull(mergedValue); + + final int delta = i == 3 && leaf.keys[1] == 0 && leaf.keys[2] == 0 && leaf.keys[3] == 0 ? 1 : 0; + Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations() + delta); + Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches() + delta); + } + } + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTestUtils.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTestUtils.java new file mode 100644 index 000000000..ce374b047 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RecalibrationTestUtils.java @@ -0,0 +1,74 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.recalibration.covariates.*; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 12/23/12 + * Time: 1:06 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class RecalibrationTestUtils { + public static Covariate[] makeInitializedStandardCovariates() { + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + final Covariate[] covariates = new Covariate[4]; + covariates[0] = new ReadGroupCovariate(); + covariates[1] = new QualityScoreCovariate(); + covariates[2] = new ContextCovariate(); + covariates[3] = new CycleCovariate(); + for ( Covariate cov : covariates ) cov.initialize(RAC); + return covariates; + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RepeatCovariatesUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RepeatCovariatesUnitTest.java new file mode 100644 index 000000000..66c12a55a --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/recalibration/RepeatCovariatesUnitTest.java @@ -0,0 +1,252 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. 
ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import com.google.java.contract.Requires; +import org.broadinstitute.gatk.engine.recalibration.covariates.Covariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatLengthCovariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatUnitAndLengthCovariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatUnitCovariate; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Random; + +public class RepeatCovariatesUnitTest { + + RepeatLengthCovariate rlCovariate; + 
RepeatUnitCovariate ruCovariate; + RepeatUnitAndLengthCovariate rurlCovariate; + RecalibrationArgumentCollection RAC; + + + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + rlCovariate = new RepeatLengthCovariate(); + ruCovariate = new RepeatUnitCovariate(); + rurlCovariate = new RepeatUnitAndLengthCovariate(); + rlCovariate.initialize(RAC); + ruCovariate.initialize(RAC); + rurlCovariate.initialize(RAC); + } + + @BeforeMethod + public void initCache() { + ReadCovariates.clearKeysCache(); + } + + + @Test + public void testFindNumberOfRepetitions() { + // First, test logic to compute number of repetitions of a substring on a given string. + int result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), true); + Assert.assertEquals(2,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true); + Assert.assertEquals(1,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); + Assert.assertEquals(0,result); + // Same tests but looking backward on string + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), 
false); + Assert.assertEquals(2,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), false); + Assert.assertEquals(4,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false); + Assert.assertEquals(0,result); + result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); + Assert.assertEquals(3,result); + + // test logic to get repeat unit and number of repeats from covariate value + final String[] repUnits = new String[]{"AG","CCG","TCCA","T"}; + for (String ru : repUnits) { + for (int k=1; k < 10; k++) { + Pair pair = RepeatLengthCovariate.getRUandNRfromCovariate(String.format("%s%d",ru,k)); + Assert.assertEquals(pair.second.intValue(),k); + Assert.assertEquals(pair.first,ru); + } + } + + } + + /** + * Build synthetic reads with random content made up of tandem repeats, record computed Repeat Unit and # repeats and see if + * they match with read context + */ + @Test + public void testManyObservations() { + final int NUM_UNITS = 10; + final int MAX_REPEAT_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; + final int MAX_NUM_REPETITIONS = RAC.MAX_REPEAT_LENGTH; + final int NUM_TEST_CASES = 100; + + Random random = new Random(); + + for (int r = 0; r < NUM_TEST_CASES; r++) { + final StringBuilder sb = new 
StringBuilder(); + // for each unit, generate a repeat unit at random with given random length + final ArrayList repeatUnits = new ArrayList(); + final ArrayList numsRepetitions = new ArrayList(); + for (int n=0; n < NUM_UNITS; n++) { + final int repLength = 1+random.nextInt(MAX_REPEAT_UNIT_LENGTH); + final String repeatUnit = getRandomBases(repLength); + final int numRepetitions = 1+random.nextInt(MAX_NUM_REPETITIONS); + + // log for comparison with covariate + numsRepetitions.add(numRepetitions); + repeatUnits.add(repeatUnit); + + for (int k=0; k < numRepetitions; k++) + sb.append(repeatUnit); + + } + + final String readBases = sb.toString(); + System.out.println(readBases); + final int readLength = readBases.length(); + + final byte[] readQuals = new byte[readLength]; + Arrays.fill(readQuals,(byte)30); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readLength+"M"); + + Covariate[] requestedCovariates = new Covariate[3]; + requestedCovariates[0] = rlCovariate; + requestedCovariates[1] = ruCovariate; + requestedCovariates[2] = rurlCovariate; + ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); + + // check that the length is correct + Assert.assertEquals(rc.getMismatchesKeySet().length, readLength); + Assert.assertEquals(rc.getInsertionsKeySet().length, readLength); + Assert.assertEquals(rc.getDeletionsKeySet().length, readLength); + + for (int offset = 0; offset < readBases.length(); offset++) { // recalibrate all bases in the read + // check RepeatLength + final String rlValM = rlCovariate.formatKey(rc.getMismatchesKeySet(offset)[0]); + final String rlValI = rlCovariate.formatKey(rc.getInsertionsKeySet(offset)[0]); + final String rlValD = rlCovariate.formatKey(rc.getDeletionsKeySet(offset)[0]); + // check RepeatUnit + final String ruValM = ruCovariate.formatKey(rc.getMismatchesKeySet(offset)[1]); + final String ruValI = ruCovariate.formatKey(rc.getInsertionsKeySet(offset)[1]); + final 
String ruValD = ruCovariate.formatKey(rc.getDeletionsKeySet(offset)[1]); + // check RepeatUnitAndLength + final String rurlValM = rurlCovariate.formatKey(rc.getMismatchesKeySet(offset)[2]); + final String rurlValI = rurlCovariate.formatKey(rc.getInsertionsKeySet(offset)[2]); + final String rurlValD = rurlCovariate.formatKey(rc.getDeletionsKeySet(offset)[2]); + // check all 3 values are identical + Assert.assertEquals(rlValD,rlValI); + Assert.assertEquals(rlValM,rlValI); + Assert.assertEquals(ruValD,ruValI); + Assert.assertEquals(ruValM,ruValI); + Assert.assertEquals(rurlValD,rurlValI); + Assert.assertEquals(rurlValM,rurlValI); + + + int fw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(offset + 1, readLength).getBytes(), true); + int bw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(0, offset + 1).getBytes(), false); + Assert.assertEquals(Math.min(fw+bw,RAC.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); + } + + } + + + + + + + } + + /** + * Returns random bases of given length + * @param length required length + * @return given random string + */ + @Requires("length > 0") + String getRandomBases(final int length) { + byte[] bases = new byte[length]; + Random ran = new Random(); + for (int i=0; i < length; i++ ) { + int idx = ran.nextInt(4); + bases[i] = BaseUtils.baseIndexToSimpleBase(idx); + } + return new String(bases); + } + + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngineUnitTest.java deleted file mode 100644 index 4922e69d6..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngineUnitTest.java +++ /dev/null @@ -1,259 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD 
INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. 
Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - - -// the imports for unit testing. 
- -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffElement; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffEngine; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffNode; -import org.broadinstitute.gatk.engine.walkers.diffengine.Difference; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Basic unit test for DifferableReaders in reduced reads - */ -public class DiffEngineUnitTest extends BaseTest { - DiffEngine engine; - - @BeforeClass(enabled = true) - public void createDiffEngine() { - engine = new DiffEngine(); - } - - // -------------------------------------------------------------------------------- - // - // Difference testing routines - // - // -------------------------------------------------------------------------------- - - private class DifferenceTest extends TestDataProvider { - public DiffElement tree1, tree2; - public List differences; - - private DifferenceTest(String tree1, String tree2) { - this(tree1, tree2, Collections.emptyList()); - } - - private DifferenceTest(String tree1, String tree2, String difference) { - this(tree1, tree2, Arrays.asList(difference)); - } - - private DifferenceTest(String tree1, String tree2, List differences) { - super(DifferenceTest.class); - this.tree1 = DiffNode.fromString(tree1); - this.tree2 = DiffNode.fromString(tree2); - this.differences = differences; - } - - public String toString() { - return String.format("tree1=%s tree2=%s diff=%s", - tree1.toOneLineString(), tree2.toOneLineString(), differences); - } - } - - @DataProvider(name = "trees") - public Object[][] createTrees() { - new DifferenceTest("A=X", "A=X"); - new DifferenceTest("A=X", "A=Y", "A:X!=Y"); - new DifferenceTest("A=X", "B=X", Arrays.asList("A:X!=MISSING", "B:MISSING!=X")); - new DifferenceTest("A=(X=1)", "B=(X=1)", 
Arrays.asList("A:(X=1)!=MISSING", "B:MISSING!=(X=1)")); - new DifferenceTest("A=(X=1)", "A=(X=1)"); - new DifferenceTest("A=(X=1 Y=2)", "A=(X=1 Y=2)"); - new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2 B=(Z=3))"); - new DifferenceTest("A=(X=1)", "A=(X=2)", "A.X:1!=2"); - new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2 B=(Z=4))", "A.B.Z:3!=4"); - new DifferenceTest("A=(X=1)", "A=(X=1 Y=2)", "A.Y:MISSING!=2"); - new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2)", "A.B:(Z=3)!=MISSING"); - return DifferenceTest.getTests(DifferenceTest.class); - } - - @Test(enabled = true, dataProvider = "trees") - public void testDiffs(DifferenceTest test) { - logger.warn("Test tree1: " + test.tree1.toOneLineString()); - logger.warn("Test tree2: " + test.tree2.toOneLineString()); - - List diffs = engine.diff(test.tree1, test.tree2); - logger.warn("Test expected diff : " + test.differences); - logger.warn("Observed diffs : " + diffs); - } - - // -------------------------------------------------------------------------------- - // - // Low-level routines for summarizing differences - // - // -------------------------------------------------------------------------------- - - @Test(enabled = true) - public void testLongestCommonPostfix() { - testLongestCommonPostfixHelper("A", "A", 1); - testLongestCommonPostfixHelper("A", "B", 0); - testLongestCommonPostfixHelper("A.B", "A.B", 2); - testLongestCommonPostfixHelper("A.B.C", "A.B.C", 3); - testLongestCommonPostfixHelper("A.B.C", "X.B.C", 2); - testLongestCommonPostfixHelper("A.B.C", "X.Y.C", 1); - testLongestCommonPostfixHelper("A.B.C", "X.Y.Z", 0); - testLongestCommonPostfixHelper("A.B.C", "A.X.C", 1); - testLongestCommonPostfixHelper("A.B.C", "A.X.Z", 0); - testLongestCommonPostfixHelper("A.B.C", "A.B.Z", 0); - } - - public void testLongestCommonPostfixHelper(String p1, String p2, int expected) { - String[] parts1 = p1.split("\\."); - String[] parts2 = p2.split("\\."); - int obs = DiffEngine.longestCommonPostfix(parts1, 
parts2); - Assert.assertEquals(obs, expected, "p1=" + p1 + " p2=" + p2 + " failed"); - } - - @Test(enabled = true, dependsOnMethods = "testLongestCommonPostfix") - public void testSummarizePath() { - testSummarizePathHelper("A", "A", "A"); - testSummarizePathHelper("A", "B", "*"); - testSummarizePathHelper("A.B", "A.B", "A.B"); - testSummarizePathHelper("A.B", "X.B", "*.B"); - testSummarizePathHelper("A.B", "X.Y", "*.*"); - testSummarizePathHelper("A.B.C", "A.B.C", "A.B.C"); - testSummarizePathHelper("A.B.C", "X.B.C", "*.B.C"); - testSummarizePathHelper("A.B.C", "X.Y.C", "*.*.C"); - testSummarizePathHelper("A.B.C", "X.Y.Z", "*.*.*"); - testSummarizePathHelper("A.B.C", "A.X.C", "*.*.C"); - testSummarizePathHelper("A.B.C", "A.X.Z", "*.*.*"); - testSummarizePathHelper("A.B.C", "A.B.Z", "*.*.*"); - } - - public void testSummarizePathHelper(String p1, String p2, String expected) { - String[] parts1 = DiffEngine.diffNameToPath(p1); - String[] parts2 = DiffEngine.diffNameToPath(p2); - int obs = DiffEngine.longestCommonPostfix(parts1, parts2); - String path = DiffEngine.summarizedPath(parts2, obs); - Assert.assertEquals(path, expected, "p1=" + p1 + " p2=" + p2 + " failed"); - } - - // -------------------------------------------------------------------------------- - // - // High-level difference summary - // - // -------------------------------------------------------------------------------- - - private class SummarizeDifferenceTest extends TestDataProvider { - List diffs = new ArrayList(); - List expecteds = new ArrayList(); - - public SummarizeDifferenceTest() { super(SummarizeDifferenceTest.class); } - - public SummarizeDifferenceTest addDiff(String... diffsToAdd) { - diffs.addAll(Arrays.asList(diffsToAdd)); - return this; - } - - public SummarizeDifferenceTest addSummary(String... 
expectedSummary) { - expecteds.addAll(Arrays.asList(expectedSummary)); - return this; - } - - public String toString() { - return String.format("diffs=%s => expected=%s", diffs, expecteds); - } - - public void test() { - List diffPaths = new ArrayList(diffs.size()); - for ( String diff : diffs ) { diffPaths.add(DiffEngine.diffNameToPath(diff)); } - - List sumDiffs = engine.summarizedDifferencesOfPathsFromString(diffs); - - Assert.assertEquals(sumDiffs.size(), expecteds.size(), "Unexpected number of summarized differences: " + sumDiffs); - - for ( int i = 0; i < sumDiffs.size(); i++ ) { - Difference sumDiff = sumDiffs.get(i); - String expected = expecteds.get(i); - String[] pathCount = expected.split(":"); - String path = pathCount[0]; - int count = Integer.valueOf(pathCount[1]); - Assert.assertEquals(sumDiff.getPath(), path, "Unexpected path at: " + expected + " obs=" + sumDiff + " all=" + sumDiffs); - Assert.assertEquals(sumDiff.getCount(), count, "Unexpected counts at: " + expected + " obs=" + sumDiff + " all=" + sumDiffs); - } - } - } - - @DataProvider(name = "summaries") - public Object[][] createSummaries() { - new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); - new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); - new SummarizeDifferenceTest().addDiff("A", "A", "A").addSummary("A:3"); - new SummarizeDifferenceTest().addDiff("A", "A", "A", "B").addSummary("A:3", "B:1"); - new SummarizeDifferenceTest().addDiff("A", "A", "A", "B", "B").addSummary("A:3", "B:2"); - new SummarizeDifferenceTest().addDiff("A", "A", "A", "B", "B", "C").addSummary("A:3", "B:2", "C:1"); - new SummarizeDifferenceTest().addDiff("A.X", "A.X").addSummary("A.X:2"); - new SummarizeDifferenceTest().addDiff("A.X", "A.X", "B.X").addSummary("*.X:3", "A.X:2", "B.X:1"); - new SummarizeDifferenceTest().addDiff("A.X", "A.X", "B.X", "B.X").addSummary("*.X:4", "A.X:2", "B.X:2"); - new SummarizeDifferenceTest().addDiff("A.B.C", "X.B.C").addSummary("*.B.C:2", 
"A.B.C:1", "X.B.C:1"); - new SummarizeDifferenceTest().addDiff("A.B.C", "X.Y.C", "X.Y.C").addSummary("*.*.C:3", "X.Y.C:2", "A.B.C:1"); - new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "X.Y.C").addSummary("*.*.C:3", "A.B.C:1", "A.X.C:1", "X.Y.C:1"); - new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "B.X.C").addSummary("*.*.C:3", "*.X.C:2", "A.B.C:1", "A.X.C:1", "B.X.C:1"); - new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "B.X.C", "B.X.C").addSummary("*.*.C:4", "*.X.C:3", "B.X.C:2", "A.B.C:1", "A.X.C:1"); - - return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); - } - - - @Test(enabled = true, dependsOnMethods = "testSummarizePath", dataProvider = "summaries") - public void testSummarizeDifferences(SummarizeDifferenceTest test) { - test.test(); - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNodeUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNodeUnitTest.java deleted file mode 100644 index 388ba518f..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNodeUnitTest.java +++ /dev/null @@ -1,278 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - - -// the imports for unit testing. 
- - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffElement; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffNode; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffValue; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Basic unit test for DifferableReaders in reduced reads - */ -public class DiffNodeUnitTest extends BaseTest { - // Data is: - // MY_ROOT - // fields: A=A, B=B - // nodes: C, D - // C: fields: E=E, nodes: none - // D: fields: F=F, G=G, nodes: none - static DiffNode MY_ROOT = DiffNode.rooted("MY_ROOT"); - static DiffValue Value_A = new DiffValue("A", MY_ROOT, "A"); - static DiffValue Value_B = new DiffValue("B", MY_ROOT, "B"); - static DiffNode NODE_C = DiffNode.empty("C", MY_ROOT); - static DiffNode NODE_D = DiffNode.empty("D", MY_ROOT); - static DiffValue Value_E = new DiffValue("E", NODE_C, "E"); - static DiffValue Value_F = new DiffValue("F", NODE_D, "F"); - static DiffValue Value_G = new DiffValue("G", NODE_D, "G"); - - static { - MY_ROOT.add(Value_A); - MY_ROOT.add(Value_B); - MY_ROOT.add(NODE_C); - MY_ROOT.add(NODE_D); - NODE_C.add(Value_E); - NODE_D.add(Value_F); - NODE_D.add(Value_G); - } - - - // -------------------------------------------------------------------------------- - // - // Element testing routines - // - // -------------------------------------------------------------------------------- - - private class ElementTest extends TestDataProvider { - public DiffElement elt; - public String name; - public String fullName; - public DiffElement parent; - - private ElementTest(DiffValue elt, DiffValue parent, String name, String fullName) { - this(elt.getBinding(), parent.getBinding(), name, fullName); - } - - private ElementTest(DiffElement elt, DiffElement parent, String name, String fullName) { - super(ElementTest.class); - this.elt = elt; - this.name = 
name; - this.fullName = fullName; - this.parent = parent; - } - - public String toString() { - return String.format("ElementTest elt=%s name=%s fullName=%s parent=%s", - elt.toOneLineString(), name, fullName, parent.getName()); - } - } - - @DataProvider(name = "elementdata") - public Object[][] createElementData() { - new ElementTest(MY_ROOT.getBinding(), DiffElement.ROOT, "MY_ROOT", "MY_ROOT"); - new ElementTest(NODE_C, MY_ROOT, "C", "MY_ROOT.C"); - new ElementTest(NODE_D, MY_ROOT, "D", "MY_ROOT.D"); - new ElementTest(Value_A, MY_ROOT, "A", "MY_ROOT.A"); - new ElementTest(Value_B, MY_ROOT, "B", "MY_ROOT.B"); - new ElementTest(Value_E, NODE_C, "E", "MY_ROOT.C.E"); - new ElementTest(Value_F, NODE_D, "F", "MY_ROOT.D.F"); - new ElementTest(Value_G, NODE_D, "G", "MY_ROOT.D.G"); - return TestDataProvider.getTests(ElementTest.class); - } - - @Test(enabled = true, dataProvider = "elementdata") - public void testElementMethods(ElementTest test) { - Assert.assertNotNull(test.elt.getName()); - Assert.assertNotNull(test.elt.getParent()); - Assert.assertEquals(test.elt.getName(), test.name); - Assert.assertEquals(test.elt.getParent(), test.parent); - Assert.assertEquals(test.elt.fullyQualifiedName(), test.fullName); - } - - // -------------------------------------------------------------------------------- - // - // DiffValue testing routines - // - // -------------------------------------------------------------------------------- - - private class LeafTest extends TestDataProvider { - public DiffValue diffvalue; - public Object value; - - private LeafTest(DiffValue diffvalue, Object value) { - super(LeafTest.class); - this.diffvalue = diffvalue; - this.value = value; - } - - public String toString() { - return String.format("LeafTest diffvalue=%s value=%s", diffvalue.toOneLineString(), value); - } - } - - @DataProvider(name = "leafdata") - public Object[][] createLeafData() { - new LeafTest(Value_A, "A"); - new LeafTest(Value_B, "B"); - new LeafTest(Value_E, "E"); - new 
LeafTest(Value_F, "F"); - new LeafTest(Value_G, "G"); - return TestDataProvider.getTests(LeafTest.class); - } - - @Test(enabled = true, dataProvider = "leafdata") - public void testLeafMethods(LeafTest test) { - Assert.assertNotNull(test.diffvalue.getValue()); - Assert.assertEquals(test.diffvalue.getValue(), test.value); - } - - // -------------------------------------------------------------------------------- - // - // Node testing routines - // - // -------------------------------------------------------------------------------- - - private class NodeTest extends TestDataProvider { - public DiffNode node; - public Set fields; - public Set subnodes; - public Set allNames; - - private NodeTest(DiffNode node, List fields, List subnodes) { - super(NodeTest.class); - this.node = node; - this.fields = new HashSet(fields); - this.subnodes = new HashSet(subnodes); - this.allNames = new HashSet(fields); - allNames.addAll(subnodes); - } - - public String toString() { - return String.format("NodeTest node=%s fields=%s subnodes=%s", - node.toOneLineString(), fields, subnodes); - } - } - - @DataProvider(name = "nodedata") - public Object[][] createData1() { - new NodeTest(MY_ROOT, Arrays.asList("A", "B"), Arrays.asList("C", "D")); - new NodeTest(NODE_C, Arrays.asList("E"), Collections.emptyList()); - new NodeTest(NODE_D, Arrays.asList("F", "G"), Collections.emptyList()); - return TestDataProvider.getTests(NodeTest.class); - } - - @Test(enabled = true, dataProvider = "nodedata") - public void testNodeAccessors(NodeTest test) { - Assert.assertNotNull(test.node.getElements()); - - for ( String name : test.allNames ) { - DiffElement elt = test.node.getElement(name); - Assert.assertNotNull(elt, "Failed to find field " + elt + " in " + test.node); - Assert.assertEquals(elt.getName(), name); - Assert.assertEquals(elt.getValue().isAtomic(), test.fields.contains(name), "Failed atomic/compound expectation: " + test.node); - } - } - - // NOTE: add routines are being implicitly tested 
by the creation of the data structures - - @Test(enabled = true, dataProvider = "nodedata") - public void testCounts(NodeTest test) { - Assert.assertEquals(test.node.getElements().size(), test.allNames.size()); - Assert.assertEquals(test.node.getElementNames(), test.allNames); - } - - // -------------------------------------------------------------------------------- - // - // fromString testing routines - // - // -------------------------------------------------------------------------------- - - private class FromStringTest extends TestDataProvider { - public String string; - public DiffElement expected; - - private FromStringTest(String string, DiffElement expected) { - super(FromStringTest.class); - this.string = string; - this.expected = expected; - } - - public String toString() { - return String.format("FromStringTest string=%s expected=%s", string, expected.toOneLineString()); - } - } - - @DataProvider(name = "fromstringdata") - public Object[][] createFromData() { - new FromStringTest("A=A", Value_A.getBinding()); - new FromStringTest("B=B", Value_B.getBinding()); - new FromStringTest("C=(E=E)", NODE_C.getBinding()); - new FromStringTest("D=(F=F G=G)", NODE_D.getBinding()); - return TestDataProvider.getTests(FromStringTest.class); - } - - @Test(enabled = true, dataProvider = "fromstringdata") - public void parseFromString(FromStringTest test) { - logger.warn("Testing from string: " + test.string); - DiffElement elt = DiffNode.fromString(test.string); - Assert.assertEquals(elt.toOneLineString(), test.expected.toOneLineString()); - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjectsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjectsIntegrationTest.java deleted file mode 100644 index c3108f055..000000000 --- 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjectsIntegrationTest.java +++ /dev/null @@ -1,102 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. 
Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.BaseTest; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; - -public class DiffObjectsIntegrationTest extends WalkerTest { - private class TestParams extends TestDataProvider { - public File master, test; - public String MD5; - public boolean doPairwise; - - private TestParams(String master, String test, final boolean doPairwise, String MD5) { - super(TestParams.class); - this.master = new File(master); - this.test = new File(test); - this.MD5 = MD5; - this.doPairwise = doPairwise; - } - - public String toString() { - return String.format("master=%s,test=%s,md5=%s", master, test, MD5); - } - } - - @DataProvider(name = "data") - public Object[][] createData() { - new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", true, "71869ddf9665773a842a9def4cc5f3c8"); - new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", true, "cec7c644c84ef9c96aacaed604d9ec9b"); - new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", false, 
"47546e03344103020e49d8037a7e0727"); - new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", false, "d27b37f7a366c8dacca5cd2590d3c6ce"); - return TestParams.getTests(TestParams.class); - } - - @Test(enabled = true, dataProvider = "data") - public void testDiffs(TestParams params) { - WalkerTestSpec spec = new WalkerTestSpec( - "-T DiffObjects -R " + publicTestDir + "exampleFASTA.fasta " - + " -m " + params.master - + " -t " + params.test - + (params.doPairwise ? " -doPairwise " : "") - + " -o %s", - Arrays.asList(params.MD5)); - executeTest("testDiffObjects:"+params, spec).getFirst(); - } -} - diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReaderUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReaderUnitTest.java deleted file mode 100644 index 26b786022..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReaderUnitTest.java +++ /dev/null @@ -1,173 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. 
-* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - - -// the imports for unit testing. 
- - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffElement; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffEngine; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffNode; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffableReader; -import htsjdk.variant.vcf.VCFConstants; -import htsjdk.variant.variantcontext.Allele; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -/** - * Basic unit test for DifferableReaders in reduced reads - */ -public class DiffableReaderUnitTest extends BaseTest { - DiffEngine engine; - - File vcfFile = new File(privateTestDir + "diffTestMaster.vcf"); - File bamFile = new File(publicTestDir + "exampleBAM.bam"); - - @BeforeClass(enabled = true) - public void createDiffEngine() { - engine = new DiffEngine(); - } - - @Test(enabled = true) - public void testPluggableDiffableReaders() { - logger.warn("testPluggableDiffableReaders"); - Map readers = engine.getReaders(); - Assert.assertNotNull(readers); - Assert.assertTrue(readers.size() > 0); - Assert.assertNotNull(readers.get("VCF")); - for ( Map.Entry e : engine.getReaders().entrySet() ) { - logger.warn("Found diffable reader: " + e.getKey()); - Assert.assertEquals(e.getValue().getName(), e.getKey()); - Assert.assertEquals(e.getValue(), engine.getReader(e.getKey())); - } - } - - private static void testLeaf(DiffNode rec, String field, Object expected) { - DiffElement value = rec.getElement(field); - Assert.assertNotNull(value, "Expected to see leaf named " + field + " in rec " + rec); - Assert.assertEquals(value.getValue().getValue(), expected, "Expected to see leaf named " + field + " to have value " + expected + " in rec " + rec + " but got instead " + value.getValue().getValue()); - } - - @Test(enabled = true, dependsOnMethods = "testPluggableDiffableReaders") - public void testVCF1() { 
- logger.warn("testVCF1"); - DiffableReader vcfReader = engine.getReader("VCF"); - Assert.assertTrue(vcfReader.canRead(vcfFile)); - Assert.assertFalse(vcfReader.canRead(bamFile)); - - DiffElement diff = vcfReader.readFromFile(vcfFile, -1); - Assert.assertNotNull(diff); - - Assert.assertEquals(diff.getName(), vcfFile.getName()); - Assert.assertSame(diff.getParent(), DiffElement.ROOT); - - DiffNode node = diff.getValueAsNode(); - Assert.assertEquals(node.getElements().size(), 11); - - // chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 - DiffNode rec1 = node.getElement("chr1:2646").getValueAsNode(); - testLeaf(rec1, "CHROM", "chr1"); - testLeaf(rec1, "POS", 2646); - testLeaf(rec1, "ID", "rs62635284"); - testLeaf(rec1, "REF", Allele.create("G", true)); - testLeaf(rec1, "ALT", Arrays.asList(Allele.create("A"))); - testLeaf(rec1, "QUAL", 0.15); - testLeaf(rec1, "FILTER", VCFConstants.PASSES_FILTERS_v4); - testLeaf(rec1, "AC", "2"); - testLeaf(rec1, "AF", "1.00"); - testLeaf(rec1, "AN", "2"); - } - - @Test(enabled = true, dependsOnMethods = "testPluggableDiffableReaders") - public void testBAM() { - logger.warn("testBAM"); - DiffableReader bamReader = engine.getReader("BAM"); - Assert.assertTrue(bamReader.canRead(bamFile)); - Assert.assertFalse(bamReader.canRead(vcfFile)); - - DiffElement diff = bamReader.readFromFile(bamFile, -1); - Assert.assertNotNull(diff); - - Assert.assertEquals(diff.getName(), bamFile.getName()); - Assert.assertSame(diff.getParent(), DiffElement.ROOT); - - DiffNode node = diff.getValueAsNode(); - Assert.assertEquals(node.getElements().size(), 33); - - // 30PPJAAXX090125:1:42:512:1817#0 99 chr1 200 0 76M = - // 255 -130 ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC - // BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3: - // PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam - - DiffNode rec1 = 
node.getElement("30PPJAAXX090125:1:42:512:1817#0_1").getValueAsNode(); - testLeaf(rec1, "NAME", "30PPJAAXX090125:1:42:512:1817#0"); - testLeaf(rec1, "FLAGS", 99); - testLeaf(rec1, "RNAME", "chr1"); - testLeaf(rec1, "POS", 200); - testLeaf(rec1, "MAPQ", 0); - testLeaf(rec1, "CIGAR", "76M"); - testLeaf(rec1, "RNEXT", "chr1"); - testLeaf(rec1, "PNEXT", 255); - testLeaf(rec1, "TLEN", -130); - testLeaf(rec1, "SEQ", "ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC"); - testLeaf(rec1, "QUAL", "BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3:"); - testLeaf(rec1, "PG", "0"); - testLeaf(rec1, "RG", "exampleBAM.bam"); - testLeaf(rec1, "SM", "exampleBAM.bam"); - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DifferenceUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DifferenceUnitTest.java deleted file mode 100644 index 685514f34..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/engine/walkers/diffengine/DifferenceUnitTest.java +++ /dev/null @@ -1,118 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - - -// the imports for unit testing. 
- - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffElement; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffNode; -import org.broadinstitute.gatk.engine.walkers.diffengine.Difference; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -/** - * Basic unit test for DifferableReaders in reduced reads - */ -public class DifferenceUnitTest extends BaseTest { - // -------------------------------------------------------------------------------- - // - // testing routines - // - // -------------------------------------------------------------------------------- - - private class DifferenceTest extends TestDataProvider { - public DiffElement tree1, tree2; - public String difference; - - private DifferenceTest(String tree1, String tree2, String difference) { - this(DiffNode.fromString(tree1), DiffNode.fromString(tree2), difference); - } - - private DifferenceTest(DiffElement tree1, DiffElement tree2, String difference) { - super(DifferenceTest.class); - this.tree1 = tree1; - this.tree2 = tree2; - this.difference = difference; - } - - public String toString() { - return String.format("tree1=%s tree2=%s diff=%s", - tree1 == null ? "null" : tree1.toOneLineString(), - tree2 == null ? "null" : tree2.toOneLineString(), - difference); - } - } - - @DataProvider(name = "data") - public Object[][] createTrees() { - new DifferenceTest("A=X", "A=Y", "A:1:X!=Y"); - new DifferenceTest("A=Y", "A=X", "A:1:Y!=X"); - new DifferenceTest(DiffNode.fromString("A=X"), null, "A:1:X!=MISSING"); - new DifferenceTest(null, DiffNode.fromString("A=X"), "A:1:MISSING!=X"); - return DifferenceTest.getTests(DifferenceTest.class); - } - - @Test(enabled = true, dataProvider = "data") - public void testDiffToString(DifferenceTest test) { - logger.warn("Test tree1: " + (test.tree1 == null ? 
"null" : test.tree1.toOneLineString())); - logger.warn("Test tree2: " + (test.tree2 == null ? "null" : test.tree2.toOneLineString())); - logger.warn("Test expected diff : " + test.difference); - Difference diff = new Difference(test.tree1, test.tree2); - logger.warn("Observed diffs : " + diff); - Assert.assertEquals(diff.toString(), test.difference, "Observed diff string " + diff + " not equal to expected difference string " + test.difference ); - - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java index cc6207ac8..5242414d7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumUnitTest.java @@ -51,8 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.MannWhitneyU; +import org.broadinstitute.gatk.utils.Utils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -81,9 +81,9 @@ public class RankSumUnitTest { makeDistribution(distribution20_40, 40, skew, observations/2); // shuffle the observations - Collections.shuffle(distribution20, GenomeAnalysisEngine.getRandomGenerator()); - Collections.shuffle(distribution30, GenomeAnalysisEngine.getRandomGenerator()); - Collections.shuffle(distribution20_40, GenomeAnalysisEngine.getRandomGenerator()); + Collections.shuffle(distribution20, Utils.getRandomGenerator()); + Collections.shuffle(distribution30, Utils.getRandomGenerator()); + Collections.shuffle(distribution20_40, Utils.getRandomGenerator()); } private static void makeDistribution(final List result, 
final int target, final int skew, final int numObservations) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java index b1d7de93a..ca10b9a1a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/BQSRGathererUnitTest.java @@ -51,10 +51,11 @@ package org.broadinstitute.gatk.tools.walkers.bqsr; +import org.broadinstitute.gatk.engine.recalibration.BQSRGatherer; import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.utils.recalibration.RecalUtils; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.engine.recalibration.RecalUtils; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfoUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfoUnitTest.java index acb06c4ea..3b4243831 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfoUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/bqsr/ReadRecalibrationInfoUnitTest.java @@ -54,7 +54,7 @@ package org.broadinstitute.gatk.tools.walkers.bqsr; import htsjdk.samtools.SAMUtils; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.recalibration.EventType; -import 
org.broadinstitute.gatk.utils.recalibration.ReadCovariates; +import org.broadinstitute.gatk.engine.recalibration.ReadCovariates; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.testng.Assert; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUnitTester.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUnitTester.java index 015460696..585bb2f8d 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUnitTester.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUnitTester.java @@ -52,8 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.RandomDNA; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; import org.testng.Assert; import org.testng.SkipException; @@ -69,15 +71,15 @@ import java.util.Set; */ public class AlleleListUnitTester { - private static final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + private static final Random rnd = Utils.getRandomGenerator(); private static final RandomDNA rndDNA = new RandomDNA(rnd); /** * Test that the contents of an allele-list are the ones expected. *

*

- * This method perform various consistency check involving all the {@link org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList} interface methods. - * Therefore calling this method is equivalent to a thorough check of the {@link org.broadinstitute.gatk.tools.walkers.genotyper.AlleleList} aspect of + * This method perform various consistency check involving all the {@link org.broadinstitute.gatk.utils.genotyper.AlleleList} interface methods. + * Therefore calling this method is equivalent to a thorough check of the {@link org.broadinstitute.gatk.utils.genotyper.AlleleList} aspect of * the {@code actual} argument. *

* diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUtilsUnitTest.java index ddec1a643..7c3a85d53 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUtilsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUtilsUnitTest.java @@ -52,7 +52,11 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; +import org.broadinstitute.gatk.utils.genotyper.AlleleListPermutation; +import org.broadinstitute.gatk.utils.genotyper.AlleleListUtils; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; import org.testng.Assert; import org.testng.SkipException; import org.testng.annotations.BeforeClass; @@ -62,7 +66,7 @@ import org.testng.annotations.Test; import java.util.*; /** - * Test {@link org.broadinstitute.gatk.tools.walkers.genotyper.AlleleListUtils}. + * Test {@link org.broadinstitute.gatk.utils.genotyper.AlleleListUtils}. 
* * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ @@ -121,7 +125,7 @@ public class AlleleListUtilsUnitTest { Assert.assertTrue(AlleleListUtils.equals(selfPermutation,originalAlleleList)); } - private final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + private final Random rnd = Utils.getRandomGenerator(); @Test(dataProvider = "singleAlleleListData", dependsOnMethods = "testEquals") public void testSubsetPermutation(final List alleles1) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ArtificialReadPileupTestProvider.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ArtificialReadPileupTestProvider.java index 3dab2af2d..d65770d7b 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ArtificialReadPileupTestProvider.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ArtificialReadPileupTestProvider.java @@ -53,14 +53,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMReadGroupRecord; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.QualityUtils; import org.broadinstitute.gatk.utils.pileup.PileupElement; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import 
org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; @@ -247,10 +243,10 @@ public class ArtificialReadPileupTestProvider { double errorProbability = QualityUtils.qualToErrorProb((byte)phredScaledErrorRate); for (int k=0; k < readBases.length; k++) { - if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < errorProbability) { + if (Utils.getRandomGenerator().nextDouble() < errorProbability) { // random offset int offset = BaseUtils.simpleBaseToBaseIndex(readBases[k]); //0..3 - offset += (GenomeAnalysisEngine.getRandomGenerator().nextInt(3)+1); // adds 1,2 or 3 + offset += (Utils.getRandomGenerator().nextInt(3)+1); // adds 1,2 or 3 offset %= 4; readBases[k] = BaseUtils.baseIndexToSimpleBase(offset); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java index 1dd02c80c..830f3681d 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java @@ -53,7 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.samtools.SAMUtils; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.MathUtils; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java index a8169dc4f..d3a0864da 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypeLikelihoodCalculatorUnitTest.java @@ -53,8 +53,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -129,7 +129,7 @@ public class GenotypeLikelihoodCalculatorUnitTest { @Test(dataProvider = "ploidyAndMaximumAlleleAndNewMaximumAlleleData") public void testGenotypeIndexMap(final int ploidy, final int oldAlleleCount, final int newAlleleCount) { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final int maxAlleleCount = Math.max(oldAlleleCount,newAlleleCount); final int[] alleleMap = new int[newAlleleCount]; final Map> reverseMap = new HashMap<>(oldAlleleCount); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingDataUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingDataUnitTest.java index 1ec10855f..a96302d0a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingDataUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingDataUnitTest.java @@ 
-52,7 +52,10 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.AlleleListUtils; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HeterogeneousPloidyModel.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HeterogeneousPloidyModel.java index 3e19d2734..e918602da 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HeterogeneousPloidyModel.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HeterogeneousPloidyModel.java @@ -52,8 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; -import org.broadinstitute.gatk.tools.walkers.genotyper.PloidyModel; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; /** * General heterogeneous ploidy model. 
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModelUnitTest.java index 89038bdf6..1b5d97994 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModelUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/HomogeneousPloidyModelUnitTest.java @@ -51,6 +51,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java index 920e10b09..e19cfca29 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java @@ -53,7 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; import org.broadinstitute.gatk.engine.walkers.Walker; import htsjdk.variant.variantcontext.Allele; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedAlleleListUnitTest.java 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedAlleleListUnitTest.java index 2f8663468..151a2325f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedAlleleListUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedAlleleListUnitTest.java @@ -53,6 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.IndexedAlleleList; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -61,7 +62,7 @@ import java.util.*; import static org.broadinstitute.gatk.tools.walkers.genotyper.AlleleListUnitTester.assertAlleleList; /** - * Tests {@link org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList}. + * Tests {@link org.broadinstitute.gatk.utils.genotyper.IndexedSampleList}. * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedSampleListUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedSampleListUnitTest.java index e79512b1e..098c39c66 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedSampleListUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedSampleListUnitTest.java @@ -52,14 +52,15 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; /** - * Tests {@link 
org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList}. + * Tests {@link org.broadinstitute.gatk.utils.genotyper.IndexedSampleList}. * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ @@ -119,7 +120,7 @@ public class IndexedSampleListUnitTest { private static final int[] MAX_SAMPLE_INDEX = { 0, 1, 4, 9, 10000}; - private static final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + private static final Random rnd = Utils.getRandomGenerator(); @DataProvider(name="sampleCountMaxSampleIndexData") diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModelUnitTest.java index 0e10f741a..70fdf5245 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModelUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/InfiniteRandomMatingPopulationModelUnitTest.java @@ -53,8 +53,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.GenotypeLikelihoods; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.genotyper.*; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -93,7 +93,7 @@ public class InfiniteRandomMatingPopulationModelUnitTest { } private AlleleList discardAllelesAtRandom(final AlleleList likelihoods, final int discardAlleleCount) { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final ArrayList subset = new 
ArrayList<>(AlleleListUtils.asList(likelihoods)); for (int i = 0; i < discardAlleleCount; i++) { subset.remove(rnd.nextInt(subset.size())); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/NanoSchedulerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/NanoSchedulerIntegrationTest.java new file mode 100644 index 000000000..bae62b6c5 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/NanoSchedulerIntegrationTest.java @@ -0,0 +1,101 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.genotyper; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +// ********************************************************************************** // +// Note that this class also serves as an integration test for the VariantAnnotator! 
// +// ********************************************************************************** // + +public class NanoSchedulerIntegrationTest extends WalkerTest { + @DataProvider(name = "NanoSchedulerUGTest") + public Object[][] createNanoSchedulerUGTest() { + List tests = new ArrayList(); + + for ( final int nt : Arrays.asList(1, 2) ) + for ( final int nct : Arrays.asList(1, 2) ) { + tests.add(new Object[]{ "BOTH", "18418ddc2bdbe20c38ece6dd18535be7", nt, nct }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "NanoSchedulerUGTest") + private void testNanoSchedulerUGTest(final String glm, final String md5, final int nt, final int nct ) { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T UnifiedGenotyper -R " + b37KGReference, + "--no_cmdline_in_header -G none", + "-I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", + "-L 20:10,000,000-10,100,000", + "-glm " + glm, + "--contamination_fraction_to_filter 0.0", + "-nt " + nt, + "-nct " + nct, + "-o %s" + ), + 1, + Arrays.asList(md5) + ); + executeTest(String.format("testUG-glm:%s-nt%d-nct%d", glm, nt, nct), spec); + } + + + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTest.java new file mode 100644 index 000000000..f329692ff --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTest.java @@ -0,0 +1,857 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.gatk.tools.walkers.genotyper; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.*; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.testng.Assert; +import org.testng.SkipException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Test code for {@link ReadLikelihoods} + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class ReadLikelihoodsUnitTest +{ + private static final double EPSILON = 1e-6; + private static final int ODD_READ_START = 101; + private static final int EVEN_READ_START = 1; + + @Test(dataProvider = "dataSets") + public void testInstantiationAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + + Assert.assertEquals(result.sampleCount(), samples.length); + Assert.assertEquals(result.alleleCount(), alleles.length); + + + testSampleQueries(samples, reads, result); + testAlleleQueries(alleles, result); + testLikelihoodMatrixQueries(samples, result, null); + } + + @Test(dataProvider = "dataSets") + public void testLikelihoodFillingAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] likelihoods = fillWithRandomLikelihoods(samples, alleles, result); + 
testLikelihoodMatrixQueries(samples, result, likelihoods); + } + + private double[][][] fillWithRandomLikelihoods(final String[] samples, final Allele[] alleles, final ReadLikelihoods result) { + final Random rnd = Utils.getRandomGenerator(); + final double[][][] likelihoods = new double[samples.length][alleles.length][]; + for (int s = 0; s < likelihoods.length; s++) { + final ReadLikelihoods.Matrix sampleLikelihoods = result.sampleMatrix(s); + for (int a = 0; a < likelihoods[s].length; a++) { + likelihoods[s][a] = new double[result.sampleReadCount(s)]; + for (int r = 0; r < likelihoods[s][a].length; r++) + sampleLikelihoods.set(a,r,likelihoods[s][a][r] = -Math.abs(rnd.nextGaussian())); + } + } + return likelihoods; + } + + @Test(dataProvider = "dataSets") + public void testBestAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + fillWithRandomLikelihoods(samples,alleles,original); + final int alleleCount = alleles.length; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); + final double[] bestLkArray = new double[sampleReadCount]; + final int[] bestIndexArray = new int[sampleReadCount]; + final double[] confidenceArray = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + int bestAlleleIndex = -1; + double bestAlleleLk = Double.NEGATIVE_INFINITY; + double secondBestAlleleLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + final double lk = sampleMatrix.get(a,r); + if (lk > bestAlleleLk) { + secondBestAlleleLk = bestAlleleLk; + bestAlleleLk = lk; + bestAlleleIndex = a; + } else if (lk > secondBestAlleleLk) { + secondBestAlleleLk = lk; + } + } + bestLkArray[r] = bestAlleleLk; + confidenceArray[r] = bestAlleleLk - secondBestAlleleLk; + 
bestIndexArray[r] = bestAlleleIndex; + } + final Collection.BestAllele> bestAlleles = original.bestAlleles(); + for (final ReadLikelihoods.BestAllele bestAllele : bestAlleles) { + final int readIndex = original.readIndex(s,bestAllele.read); + if (readIndex == -1) continue; + Assert.assertEquals(bestLkArray[readIndex],bestAllele.likelihood); + Assert.assertEquals(bestAllele.allele,alleles[bestIndexArray[readIndex]]); + Assert.assertEquals(bestAllele.confidence,confidenceArray[readIndex],EPSILON); + } + } + } + + @Test(dataProvider = "dataSets") + public void testBestAlleleMap(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + fillWithRandomLikelihoods(samples,alleles,original); + final Map> expected = new HashMap<>(alleles.length); + for (final Allele allele : alleles) + expected.put(allele,new ArrayList()); + + final int alleleCount = alleles.length; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); + for (int r = 0; r < sampleReadCount; r++) { + int bestAlleleIndex = -1; + double bestAlleleLk = Double.NEGATIVE_INFINITY; + double secondBestAlleleLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + final double lk = sampleMatrix.get(a,r); + if (lk > bestAlleleLk) { + secondBestAlleleLk = bestAlleleLk; + bestAlleleLk = lk; + bestAlleleIndex = a; + } else if (lk > secondBestAlleleLk) { + secondBestAlleleLk = lk; + } + } + if ((bestAlleleLk - secondBestAlleleLk) > ReadLikelihoods.BestAllele.INFORMATIVE_THRESHOLD) + expected.get(alleles[bestAlleleIndex]).add(sampleMatrix.readAt(r)); + } + } + + final Map> actual = original.readsByBestAlleleMap(); + + Assert.assertEquals(actual.size(),alleles.length); + for (final Allele allele : alleles) { + final List expectedList = 
expected.get(allele); + final List actualList = actual.get(allele); + final Set expectedSet = new HashSet<>(expectedList); + final Set actualSet = new HashSet<>(actualList); + Assert.assertEquals(actualSet,expectedSet); + } + } + + @Test(dataProvider = "dataSets") + public void testFilterPoorlyModeledReads(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int r = 0; r < sampleReadCount; r++) { + if ((r & 1) == 0) continue; + for (int a = 0; a < alleles.length; a++) + original.sampleMatrix(s).set(a,r,-10000); + } + } + + final ReadLikelihoods result = original.clone(); + result.filterPoorlyModeledReads(2.0); + + for (int s = 0; s < samples.length; s++) { + final int oldSampleReadCount = original.sampleReadCount(s); + final int newSampleReadCount = result.sampleReadCount(s); + Assert.assertEquals(newSampleReadCount,(oldSampleReadCount + 1) / 2); + final ReadLikelihoods.Matrix newSampleMatrix = result.sampleMatrix(s); + final ReadLikelihoods.Matrix oldSampleMatrix = original.sampleMatrix(s); + for (int r = 0 ; r < newSampleReadCount; r++) { + Assert.assertEquals(original.readIndex(s, result.sampleReads(s).get(r)), r * 2); + for (int a = 0; a < alleles.length; a++) { + Assert.assertEquals(newSampleMatrix.get(a,r),oldSampleMatrix.get(a,r*2)); + } + } + } + } + + @Test(dataProvider = "dataSets") + public void testFilterReadsToOverlap(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); + 
fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result = original.clone(); + result.filterToOnlyOverlappingUnclippedReads(evenReadOverlap); + final double[][][] newLikelihoods = new double[samples.length][alleles.length][]; + for (int s = 0; s < samples.length ; s++) + for (int a = 0; a < alleles.length; a++) { + newLikelihoods[s][a] = new double[(original.sampleReadCount(s) + 1) / 2]; + final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); + for (int r = 0; r < newLikelihoods[s][a].length; r++) { + Assert.assertEquals(result.readIndex(s,sampleMatrix.readAt(r << 1)),r); + newLikelihoods[s][a][r] = sampleMatrix.get(a, r << 1); + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + @Test(dataProvider = "marginalizationDataSets") + public void testMarginalizationWithOverlap(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); + fillWithRandomLikelihoods(samples, alleles, original); + final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping,evenReadOverlap); + Assert.assertNotNull(marginalized); + Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); + for (int a = 0; a < marginalized.alleleCount(); a++) { + final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a)); + Assert.assertNotNull(oldAlleles); + for (int s = 0; s < samples.length; s++) { + final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); + final ReadLikelihoods.Matrix sampleLikelihoods = marginalized.sampleMatrix(s); + final int sampleReadCount = sampleLikelihoods.readCount(); + final int 
oldSampleReadCount = oldSmapleLikelihoods.readCount(); + Assert.assertEquals(sampleReadCount,(oldSampleReadCount + 1) / 2); + for (int r = 0; r < sampleReadCount; r++) { + double oldBestLk = Double.NEGATIVE_INFINITY; + for (final Allele oldAllele : oldAlleles) { + oldBestLk = Math.max(oldSmapleLikelihoods.get(original.alleleIndex(oldAllele),r << 1), oldBestLk); + } + Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk); + } + } + } + } + + @Test(dataProvider = "marginalizationDataSets") + public void testMarginalization(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + fillWithRandomLikelihoods(samples, alleles, original); + final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping); + Assert.assertNotNull(marginalized); + Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); + for (int a = 0; a < marginalized.alleleCount(); a++) { + final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a)); + Assert.assertNotNull(oldAlleles); + for (int s = 0; s < samples.length; s++) { + final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); + final ReadLikelihoods.Matrix sampleLikelihoods = marginalized.sampleMatrix(s); + final int sampleReadCount = sampleLikelihoods.readCount(); + final int oldSampleReadCount = oldSmapleLikelihoods.readCount(); + Assert.assertEquals(oldSampleReadCount,sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) { + double oldBestLk = Double.NEGATIVE_INFINITY; + for (final Allele oldAllele : oldAlleles) { + oldBestLk = Math.max(oldSmapleLikelihoods.get(original.alleleIndex(oldAllele),r), oldBestLk); + } + Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk); + } + } + } + } + + @Test(dataProvider = "dataSets") + public void testNormalizeBestToZero(final String[] 
samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result= original.clone(); + result.normalizeLikelihoods(true, Double.NEGATIVE_INFINITY); + testAlleleQueries(alleles,result); + final int alleleCount = alleles.length; + final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int a = 0; a < alleleCount; a++) + newLikelihoods[s][a] = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + double bestLk = originalLikelihoods[s][0][r]; + for (int a = 1; a < alleleCount; a++) { + bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]); + } + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = originalLikelihoods[s][a][r] - bestLk; + } + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + @Test(dataProvider = "dataSets") + public void testNormalizeCapWorstLK(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result= original.clone(); + result.normalizeLikelihoods(false, - 0.001); + testAlleleQueries(alleles,result); + final int alleleCount = alleles.length; + final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int a = 0; a < alleleCount; a++) + newLikelihoods[s][a] = new double[sampleReadCount]; + for 
(int r = 0; r < sampleReadCount; r++) { + double bestAltLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + if (alleles[a].isReference()) + continue; + bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]); + } + if (bestAltLk == Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = originalLikelihoods[s][a][r]; + } + else + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001); + } + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + @Test(dataProvider = "dataSets") + public void testNormalizeCapWorstLKAndBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result= original.clone(); + result.normalizeLikelihoods(true, - 0.001); + testAlleleQueries(alleles,result); + final int alleleCount = alleles.length; + final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; + for (int s = 0; s < samples.length; s++) { + final int sampleReadCount = original.sampleReadCount(s); + for (int a = 0; a < alleleCount; a++) + newLikelihoods[s][a] = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + double bestAltLk = Double.NEGATIVE_INFINITY; + double bestLk = Double.NEGATIVE_INFINITY; + for (int a = 0; a < alleleCount; a++) { + bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]); + if (alleles[a].isReference()) + continue; + bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]); + } + if (bestAltLk == Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleleCount; a++) { + newLikelihoods[s][a][r] = originalLikelihoods[s][a][r] - bestLk; + } + else + for (int a = 0; a < 
alleleCount; a++) { + newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001) - bestLk; + } + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + + @Test(dataProvider = "dataSets") + public void testAddMissingAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result = original.clone(); + + // If all the alleles pass are present in the read-likelihoods collection there is no change. + result.addMissingAlleles(result.alleles(),Double.NEGATIVE_INFINITY); + testLikelihoodMatrixQueries(samples,result,originalLikelihoods); + + // If the allele list passed is empty there is no effect. + result.addMissingAlleles(Collections.EMPTY_LIST,Double.NEGATIVE_INFINITY); + testLikelihoodMatrixQueries(samples,result,originalLikelihoods); + + final Allele newOne; + final Allele newTwo; + final Allele newThree; + + // We add a single missing. 
+ result.addMissingAlleles(Arrays.asList(newOne = Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-12345.6); + Assert.assertEquals(result.alleleCount(), original.alleleCount() + 1); + + // We add too more amongst exisisting alleles: + result.addMissingAlleles(Arrays.asList(newTwo = Allele.create("ATATATTATATTAATATT".getBytes(), false),result.alleleAt(1), + result.alleleAt(0),newThree = Allele.create("TGTGTGTATTG".getBytes(),false),Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-6.54321); + + Assert.assertEquals(original.alleleCount()+3,result.alleleCount()); + + final List expectedAlleles = new ArrayList<>(original.alleles()); + expectedAlleles.add(newOne); expectedAlleles.add(newTwo); expectedAlleles.add(newThree); + + Assert.assertEquals(result.alleles(),expectedAlleles); + + final double[][][] newLikelihoods = new double[originalLikelihoods.length][][]; + for (int s = 0; s < samples.length; s++) { + newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 3); + final int sampleReadCount = original.sampleReadCount(s); + final int originalAlleleCount = originalLikelihoods[s].length; + newLikelihoods[s][originalAlleleCount] = new double[sampleReadCount]; + Arrays.fill(newLikelihoods[s][originalAlleleCount],-12345.6); + newLikelihoods[s][originalAlleleCount+1] = new double[sampleReadCount]; + Arrays.fill(newLikelihoods[s][originalAlleleCount+1],-6.54321); + newLikelihoods[s][originalAlleleCount+2] = new double[sampleReadCount]; + Arrays.fill(newLikelihoods[s][originalAlleleCount+2],-6.54321); + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + + @Test(dataProvider = "dataSets") + public void testAddNonRefAllele(final String[] samples, final Allele[] alleles, final Map> reads) { + final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); + final double[][][] originalLikelihoods = 
fillWithRandomLikelihoods(samples,alleles,original); + final ReadLikelihoods result = original.clone(); + result.addNonReferenceAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + Assert.assertEquals(result.alleleCount(),original.alleleCount() + 1); + Assert.assertEquals(result.alleleIndex(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE),result.alleleCount() - 1); + final double[][][] newLikelihoods = new double[originalLikelihoods.length][][]; + for (int s = 0; s < samples.length; s++) { + newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 1); + final int sampleReadCount = original.sampleReadCount(s); + final int ordinaryAlleleCount = originalLikelihoods[s].length; + newLikelihoods[s][ordinaryAlleleCount] = new double[sampleReadCount]; + for (int r = 0; r < sampleReadCount; r++) { + double bestLk = newLikelihoods[s][0][r]; + double secondBestLk = Double.NEGATIVE_INFINITY; + for (int a = 1; a < ordinaryAlleleCount; a++) { + final double lk = originalLikelihoods[s][a][r]; + if (lk > bestLk) { + secondBestLk = bestLk; + bestLk = lk; + } else if (lk > secondBestLk) { + secondBestLk = lk; + } + } + final double expectedNonRefLk = Double.isInfinite(secondBestLk) ? 
bestLk : secondBestLk; + newLikelihoods[s][ordinaryAlleleCount][r] = expectedNonRefLk; + } + } + testLikelihoodMatrixQueries(samples,result,newLikelihoods); + } + + private void testLikelihoodMatrixQueries(String[] samples, ReadLikelihoods result, final double[][][] likelihoods) { + for (final String sample : samples) { + final int sampleIndex = result.sampleIndex(sample); + final int sampleReadCount = result.sampleReadCount(sampleIndex); + final int alleleCount = result.alleleCount(); + Assert.assertEquals(result.alleleCount(), alleleCount); + for (int a = 0; a < alleleCount; a++) { + Assert.assertEquals(result.sampleReadCount(sampleIndex),sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) + Assert.assertEquals(result.sampleMatrix(sampleIndex).get(a,r), + likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON); + } + } + } + + private void testAlleleQueries(Allele[] alleles, ReadLikelihoods result) { + final Set alleleIndices = new HashSet<>(); + for (final Allele allele : alleles) { + final int alleleIndex = result.alleleIndex(allele); + Assert.assertTrue(alleleIndex >= 0); + Assert.assertFalse(alleleIndices.contains(alleleIndex)); + alleleIndices.add(alleleIndex); + Assert.assertSame(allele,alleles[alleleIndex]); + } + } + + private void testSampleQueries(String[] samples, Map> reads, ReadLikelihoods result) { + final Set sampleIds = new HashSet<>(samples.length); + for (final String sample : samples) { + final int sampleIndex = result.sampleIndex(sample); + Assert.assertTrue(sampleIndex >= 0); + Assert.assertFalse(sampleIds.contains(sampleIndex)); + sampleIds.add(sampleIndex); + + final List sampleReads = result.sampleReads(sampleIndex); + final Set sampleReadsSet = new HashSet<>(sampleReads); + final List expectedSampleReadArray = reads.get(sample); + final Set expectedSampleReadsSet = new HashSet<>(expectedSampleReadArray); + Assert.assertEquals(sampleReadsSet,expectedSampleReadsSet); + + final int sampleReadCount = sampleReads.size(); 
+ for (int r = 0; r < sampleReadCount; r++) { + Assert.assertSame(sampleReads.get(r), expectedSampleReadArray.get(r)); + final int readIndex = result.readIndex(sampleIndex, sampleReads.get(r)); + Assert.assertEquals(readIndex,r); + } + } + } + + private String[][] SAMPLE_SETS = new String[][] { + {"A","B","C"}, + {"A"}, + {"C","A","D","E","Salsa","Gazpacho"}, + }; + + private Allele[][] ALLELE_SETS = new Allele[][] { + {Allele.create("A",true), Allele.create("T"), Allele.create("C")}, + {Allele.create("A",true)}, + {Allele.create("ATTTA"), Allele.create("A",true)}, + {Allele.create("A"), Allele.create("AT",true)}, + {Allele.create("A",false), Allele.create("AT",false)}, + }; + + @DataProvider(name="marginalizationDataSets") + public Object[][] marginalizationDataSets() { + try { + final Random rnd = Utils.getRandomGenerator(); + final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length * ALLELE_SETS.length][]; + int nextIndex = 0; + for (int s = 0; s < SAMPLE_SETS.length; s++) { + for (int a = 0; a < ALLELE_SETS.length; a++) { + for (int b = 0; b < ALLELE_SETS.length; b++) { + if (ALLELE_SETS[b].length < ALLELE_SETS[a].length) + result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a], + dataSetReads(SAMPLE_SETS[s], rnd), randomAlleleMap(ALLELE_SETS[a], ALLELE_SETS[b]) + }; + } + } + } + return Arrays.copyOf(result,nextIndex); + }catch (final Throwable e) { + throw new RuntimeException(e); + } + } + + private Map> randomAlleleMap(final Allele[] fromAlleles, final Allele[] toAlleles) { + final Map> result = new HashMap<>(toAlleles.length); + for (final Allele toAllele : toAlleles ) + result.put(toAllele,new ArrayList(fromAlleles.length)); + final ArrayList remaining = new ArrayList<>(Arrays.asList(fromAlleles)); + int nextToIndex = 0; + final Random rnd = Utils.getRandomGenerator(); + for (int i = 0; i < fromAlleles.length; i++) { + final int fromAlleleIndex = rnd.nextInt(remaining.size()); + 
result.get(toAlleles[nextToIndex]).add(remaining.remove(fromAlleleIndex)); + nextToIndex = (nextToIndex + 1) % toAlleles.length; + } + return result; + } + + + @DataProvider(name="dataSets") + public Object[][] dataSets() { + try { + final Random rnd = Utils.getRandomGenerator(); + final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length][]; + int nextIndex = 0; + for (int s = 0; s < SAMPLE_SETS.length; s++) + for (int a = 0; a < ALLELE_SETS.length; a++) { + result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a], + dataSetReads(SAMPLE_SETS[s], rnd) + }; + } + return result; + }catch (final Throwable e) { + throw new RuntimeException(e); + } + } + + private Map> dataSetReads(final String[] samples, + final Random rnd) { + final Map> result = new HashMap<>(samples.length); + for (final String sample : samples) { + final int readCount = rnd.nextInt(100); + final List reads = new ArrayList<>(readCount); + for (int r = 0; r < readCount; r++) { + final int alignmentStart = (r & 1) == 0 ? 
EVEN_READ_START : ODD_READ_START; + reads.add(ArtificialSAMUtils.createArtificialRead(SAM_HEADER, + "RRR" + sample + "00" + r, 0, alignmentStart ,"AAAAA".getBytes(), new byte[] {30,30,30,30,30}, "5M")); + } + result.put(sample,reads); + } + return result; + } + + @Test(dataProvider="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") + public void testInstantiationAndBasicQueries(final int[] readCounts, final int alleleCount, final boolean hasReference) { + final SampleList sampleList = sampleList(readCounts); + + final AlleleList alleleList = alleleList(alleleCount,hasReference); + final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList, readCounts); + final ReadLikelihoods subject = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); + + AlleleListUnitTester.assertAlleleList(subject, AlleleListUtils.asList(alleleList)); + SampleListUnitTester.assertSampleList(subject,SampleListUtils.asList(sampleList)); + + if (hasReference) { + final int referenceIndex = AlleleListUtils.indexOfReference(alleleList); + Assert.assertTrue(referenceIndex >= 0); + Assert.assertEquals(AlleleListUtils.indexOfReference(alleleList),referenceIndex); + } else { + Assert.assertEquals(AlleleListUtils.indexOfReference(subject), -1); + } + + testLikelihoodMatrixQueries(alleleList, sampleList, sampleToReads, subject); + testAlleleQueries(alleleList, subject); + testSampleQueries(sampleList, sampleToReads, subject); + } + + @Test(dataProvider="readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference") + public void testLikelihoodWriting(final int[] readCounts, final int alleleCount, final boolean hasReference) { + final SampleList sampleList = sampleList(readCounts); + + final AlleleList alleleList = alleleList(alleleCount,hasReference); + final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList,readCounts); + final ReadLikelihoods subject = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); + + final int 
sampleCount = readCounts.length; + int totalLikelihoodsSet = 0; + int expectedLikelihoodsSet = 0; + for (int s = 0; s < sampleCount; s++) { + expectedLikelihoodsSet += readCounts[s] * alleleCount; + final ReadLikelihoods.Matrix matrix = subject.sampleMatrix(s); + final int readCount = matrix.readCount(); + for (int a = 0; a < alleleCount; a++) + for (int r = 0; r < readCount; r++) { + final double likelihood = testLikelihood(s, a, r); + Assert.assertNotEquals(likelihood,0); //Paranoia + totalLikelihoodsSet++; + matrix.set(a,r,likelihood); + Assert.assertEquals(matrix.get(a, r),likelihood); + } + + } + Assert.assertEquals(totalLikelihoodsSet,expectedLikelihoodsSet); + } + + @Test(dependsOnMethods={"testLikelihoodWriting","testInstantiationAndBasicQueries"}, + dataProvider="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") + public void testMapConversion(final int[] readCounts, final int alleleCount, final boolean hasReference) { + final SampleList sampleList = sampleList(readCounts); + + final AlleleList alleleList = alleleList(alleleCount,hasReference); + final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList,readCounts); + + final Set alleleWithLikelihoodsSet = new HashSet<>(); + final Set readsWithLikelihoodsSet = new HashSet<>(); + final Map map = new HashMap<>(sampleList.sampleCount()); + final int sampleCount = sampleList.sampleCount(); + for (int s = 0; s < sampleCount; s++) { + final String sample = sampleList.sampleAt(s); + final PerReadAlleleLikelihoodMap perSampleMap = new PerReadAlleleLikelihoodMap(); + final List reads = sampleToReads.get(sample); + for (int a = 0; a < alleleCount; a++) + for (int r = 0; r < reads.size(); r++) { + perSampleMap.add(reads.get(r), alleleList.alleleAt(a), testLikelihood(s, a, r)); + alleleWithLikelihoodsSet.add(alleleList.alleleAt(a)); + readsWithLikelihoodsSet.add(reads.get(r)); + } + map.put(sample,perSampleMap); + + } + + ReadLikelihoods subject = 
ReadLikelihoods.fromPerAlleleReadLikelihoodsMap(map); + + for (int s = 0; s < sampleCount; s++) { + final String sample = sampleList.sampleAt(s); + final int sIndex = subject.sampleIndex(sample); + Assert.assertTrue(sIndex >= 0); + Assert.assertTrue(sIndex < sampleCount); + final int sampleReadCount = sampleToReads.get(sample).size(); + final ReadLikelihoods.Matrix sampleLikelihoods = subject.sampleMatrix(sIndex); + for (int a = 0; a < alleleCount; a++) { + final Allele allele = alleleList.alleleAt(a); + final int aIndex = subject.alleleIndex(allele); + Assert.assertEquals(aIndex >= 0,alleleWithLikelihoodsSet.contains(allele)); + Assert.assertTrue(aIndex < alleleCount); + if (aIndex == -1) continue; + for (int r = 0; r < sampleReadCount; r++) { + final GATKSAMRecord read = sampleToReads.get(sample).get(r); + final int rIndex = subject.readIndex(sIndex,read); + final int rIndex2 = sampleLikelihoods.readIndex(read); + Assert.assertEquals(rIndex,rIndex2); + Assert.assertEquals(rIndex >= 0,readsWithLikelihoodsSet.contains(read)); + Assert.assertTrue(rIndex < sampleReadCount); + if (rIndex == -1) + continue; + final double likelihood = sampleLikelihoods.get(aIndex,rIndex); + Assert.assertEquals(likelihood,testLikelihood(s,a,r)); + } + } + } + } + + private double testLikelihood(final int sampleIndex, final int alleleIndex, final int readIndex) { + return - Math.abs(31 * (sampleIndex + 1) + 101 * alleleIndex + 1009 * readIndex); + } + + + private final Random rnd = Utils.getRandomGenerator(); + + private void testLikelihoodMatrixQueries(final AlleleList alleles, final SampleList samples, + final Map> sampleToReads, ReadLikelihoods result) { + for (final String sample : SampleListUtils.asList(samples)) { + final int sampleIndex = result.sampleIndex(sample); + final ReadLikelihoods.Matrix likelihoodMatrix = result.sampleMatrix(sampleIndex); + final int sampleReadCount = sampleToReads.get(sample).size(); + final List reads = sampleToReads.get(sample); + 
Assert.assertEquals(likelihoodMatrix.alleleCount(), alleles.alleleCount()); + Assert.assertEquals(likelihoodMatrix.readCount(), sampleReadCount); + for (int a = 0; a < likelihoodMatrix.alleleCount(); a++) { + Assert.assertEquals(likelihoodMatrix.alleleAt(a),alleles.alleleAt(a)); + for (int r = 0; r < sampleReadCount; r++) { + Assert.assertEquals(likelihoodMatrix.readAt(r),reads.get(r)); + Assert.assertEquals(likelihoodMatrix.get(a, r), 0.0); + } + } + } + } + + private void testAlleleQueries(final AlleleList alleles, ReadLikelihoods result) { + final Set alleleIndices = new HashSet<>(); + for (final Allele allele : AlleleListUtils.asList(alleles)) { + final int alleleIndex = result.alleleIndex(allele); + Assert.assertTrue(alleleIndex >= 0); + Assert.assertFalse(alleleIndices.contains(alleleIndex)); + alleleIndices.add(alleleIndex); + Assert.assertSame(allele,alleles.alleleAt(alleleIndex)); + } + } + + private void testSampleQueries(final SampleList samples, Map> reads, + final ReadLikelihoods result) { + final Set sampleIds = new HashSet<>(samples.sampleCount()); + for (final String sample : SampleListUtils.asList(samples)) { + final int sampleIndex = result.sampleIndex(sample); + Assert.assertTrue(sampleIndex >= 0); + Assert.assertFalse(sampleIds.contains(sampleIndex)); + sampleIds.add(sampleIndex); + + final List sampleReads = result.sampleReads(sampleIndex); + final Set sampleReadsSet = new HashSet<>(sampleReads); + final List expectedSampleReadArray = reads.get(sample); + final Set expectedSampleReadsSet = new HashSet<>(expectedSampleReadArray); + Assert.assertEquals(sampleReadsSet,expectedSampleReadsSet); + + final int sampleReadCount = sampleReads.size(); + for (int r = 0; r < sampleReadCount; r++) { + Assert.assertSame(sampleReads.get(r), expectedSampleReadArray.get(r)); + final int readIndex = result.readIndex(sampleIndex, sampleReads.get(r)); + Assert.assertEquals(readIndex,r); + } + } + } + + private AlleleList alleleList(final int alleleCount, final 
boolean hasReference) { + final Allele[] alleles = AlleleListUnitTester.generateRandomAlleles(alleleCount,100); + if (hasReference) { + final int referenceIndex = rnd.nextInt(alleleCount); + alleles[referenceIndex] = Allele.create(alleles[referenceIndex].getBases(),true); + } + final AlleleList alleleList = new IndexedAlleleList<>(alleles); + if (alleleList.alleleCount() != alleles.length) + throw new SkipException("repeated alleles, should be infrequent"); + return alleleList; + } + + private SAMFileHeader SAM_HEADER = ArtificialSAMUtils.createArtificialSamHeader(10, 0, 1000); + final GenomeLocParser locParser = new GenomeLocParser(SAM_HEADER.getSequenceDictionary()); + + + private int[][] READ_COUNTS = new int[][] { + {}, + { 100 }, + { 0 }, + { 0, 0, 0 }, + { 1, 0, 1 }, + { 100, 10 , 100}, + { 1000, 10, 100, 20, 23 } + }; + + private int[] ALLELE_COUNTS = new int[] { 0, 1, 2, 3, 10, 20 }; + + @DataProvider(name="readCountsAndAlleleCountData") + public Object[][] readCountsAndAlleleCountData() { + final Object[][] result = new Object[READ_COUNTS.length * ALLELE_COUNTS.length * 2][]; + int index = 0; + for (final int[] readCounts : READ_COUNTS) + for (final int alleleCount : ALLELE_COUNTS) { + result[index++] = new Object[]{ readCounts, alleleCount, false}; + result[index++] = new Object[]{ readCounts, alleleCount, true}; + } + return result; + } + + @DataProvider(name="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") + public Object[][] readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference() { + final Object[][] raw = readCountsAndAlleleCountData(); + final List result = new ArrayList<>(raw.length); + for (final Object[] paramSet : raw) + if (!paramSet[2].equals(true) || !paramSet[1].equals(0)) + result.add(paramSet); + return result.toArray(new Object[result.size()][]); + } + + @DataProvider(name="readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference") + public Object[][] 
readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference() { + final Object[][] raw = readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference(); + final List result = new ArrayList<>(raw.length); + for (final Object[] paramSet : raw) { + final int[] readCounts = (int[]) paramSet[0]; + final long totalReadCount = MathUtils.sum(readCounts); + if (totalReadCount > 0) + result.add(paramSet); + } + return result.toArray(new Object[result.size()][]); + } + + private SampleList sampleList(final int[] readCounts) { + final List samples = new ArrayList<>(readCounts.length); + for (int i = 0; i < readCounts.length; i++) + samples.add("SAMPLE_" + i); + return new IndexedSampleList(samples); + } + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTester.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTester.java index 44f8279e1..0aededd99 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTester.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/ReadLikelihoodsUnitTester.java @@ -54,7 +54,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.samtools.SAMFileHeader; import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.genotyper.AlleleList; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUnitTester.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUnitTester.java index 
f66990d75..c79acccbc 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUnitTester.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUnitTester.java @@ -51,8 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; import org.testng.Assert; import java.util.*; @@ -68,8 +68,8 @@ public class SampleListUnitTester { * Test that the contents of a sample-list are the ones expected. * *

- * This method perform various consistency check involving all the {@link org.broadinstitute.gatk.tools.walkers.genotyper.SampleList} interface methods. - * Therefore calling this method is equivalent to a thorough check of the {@link org.broadinstitute.gatk.tools.walkers.genotyper.SampleList} aspect of + * This method perform various consistency check involving all the {@link org.broadinstitute.gatk.utils.genotyper.SampleList} interface methods. + * Therefore calling this method is equivalent to a thorough check of the {@link org.broadinstitute.gatk.utils.genotyper.SampleList} aspect of * the {@code actual} argument. *

* diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUtilsUnitTest.java index 565d0cc47..4575e62c1 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUtilsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUtilsUnitTest.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -64,7 +64,7 @@ import java.util.Arrays; import java.util.List; /** - * Test {@link org.broadinstitute.gatk.tools.walkers.genotyper.AlleleListUtils}. + * Test {@link org.broadinstitute.gatk.utils.genotyper.AlleleListUtils}. 
* * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java index 780ab3e0d..4781488a9 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/StandardCallerArgumentCollectionUnitTest.java @@ -51,9 +51,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.gatk.tools.walkers.genotyper.StandardCallerArgumentCollection; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCallerArgumentCollection; +import org.broadinstitute.gatk.utils.Utils; import org.testng.SkipException; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -151,7 +151,7 @@ public class StandardCallerArgumentCollectionUnitTest { public T randomArgumentCollection(final Class clazz) throws IllegalAccessException, InstantiationException { final T result = clazz.newInstance(); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (final Field field : clazz.getFields()) { final int fieldModifiers = field.getModifiers(); if (!Modifier.isPublic(fieldModifiers)) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java index 
3f38d20e9..4d7b1568b 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperEngineUnitTest.java @@ -65,6 +65,8 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.afcalc.FixedAFCalculatorP import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index e975c11d0..a80bac9a8 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -54,9 +54,9 @@ package org.broadinstitute.gatk.tools.walkers.genotyper; import htsjdk.samtools.util.BlockCompressedInputStream; import htsjdk.tribble.readers.AsciiLineReader; import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import org.testng.Assert; 
@@ -246,14 +246,14 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { Arrays.asList(md5)); executeTest("test parallelization (single thread)", spec1); - GenomeAnalysisEngine.resetRandomGenerator(); + Utils.resetRandomGenerator(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, Arrays.asList(md5)); executeTest("test parallelization (2 threads)", spec2); - GenomeAnalysisEngine.resetRandomGenerator(); + Utils.resetRandomGenerator(); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( myCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProviderUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProviderUnitTest.java index 325fbf962..9fddd1722 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProviderUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/FixedAFCalculatorProviderUnitTest.java @@ -54,7 +54,7 @@ package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; import org.broadinstitute.gatk.engine.arguments.GenotypeCalculationArgumentCollection; -import org.broadinstitute.gatk.engine.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.gatk.tools.walkers.genotyper.StandardCallerArgumentCollection; 
import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AFPriorProviderUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AFPriorProviderUnitTest.java index 629bcbbf9..366b90bb7 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AFPriorProviderUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AFPriorProviderUnitTest.java @@ -51,12 +51,12 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.tools.walkers.genotyper.AFPriorProvider; import org.broadinstitute.gatk.tools.walkers.genotyper.CustomAFPriorProvider; import org.broadinstitute.gatk.tools.walkers.genotyper.HeterozygosityAFPriorProvider; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -77,7 +77,7 @@ public class AFPriorProviderUnitTest extends BaseTest { @Test(dataProvider="HeterozygosityProviderData") public void testHeterozygosityProvider(final double h, final int useCount, final int minPloidy, final int maxPloidy) { final double het = h / maxPloidy; - final Random rdn = GenomeAnalysisEngine.getRandomGenerator(); + final Random rdn = Utils.getRandomGenerator(); final int[] plodies = new int[useCount]; for (int i = 0; i < useCount; i++) plodies[i] = rdn.nextInt(maxPloidy - minPloidy + 1) + minPloidy; @@ -100,7 +100,7 @@ public class AFPriorProviderUnitTest extends BaseTest { @Test(dataProvider="CustomProviderData") public void testCustomProvider(final int 
ploidy) { final double[] priors = new double[ploidy]; - final Random rdn = GenomeAnalysisEngine.getRandomGenerator(); + final Random rdn = Utils.getRandomGenerator(); double remaining = 1; final List priorsList = new ArrayList(); for (int i = 0; i < priors.length; i++) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSet.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSet.java new file mode 100644 index 000000000..81afc1816 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSet.java @@ -0,0 +1,590 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. 
LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.apache.commons.math.distribution.ExponentialDistribution; +import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingGraph; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** +* Mock-up active region data used in testing. 
+* +* @author Valentin Ruano-Rubio <valentin@broadinstitute.org> +*/ +public class ActiveRegionTestDataSet { + + private final byte[] referenceBytes; + protected String reference; + protected String[] haplotypeCigars; + protected List haplotypeStrings; + protected String[] readCigars; + protected byte[] bq; + protected byte[] dq; + protected byte[] iq; + protected int kmerSize; + private List haplotypeList; + private List readList; + private AssemblyResultSet assemblyResultSet; + private Map readBySequence; + private String stringRepresentation; + private List> readEventOffsetList; + private GenomeLocParser genomeLocParser; + + /** Create a new active region data test set */ + public ActiveRegionTestDataSet(final int kmerSize, final String reference, final String[] haplotypes, + final String[] readCigars, final byte[] bq, final byte[] dq, final byte[] iq) { + this.reference = reference; + this.referenceBytes = reference.getBytes(); + this.haplotypeCigars = haplotypes; + this.readCigars = readCigars; + this.bq = bq; + this.dq = dq; + this.iq = iq; + this.kmerSize = kmerSize; + this.genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1,1,reference.length()).getSequenceDictionary()); + } + + public String getReference() { + return reference; + } + + public String toString() { + if (stringRepresentation == null) + return super.toString(); + else return stringRepresentation; + } + + public AssemblyResultSet assemblyResultSet() { + if (assemblyResultSet == null) { + final ReadThreadingGraph rtg = new ReadThreadingGraph(kmerSize); + rtg.addSequence("anonymous", this.getReference().getBytes(), true); + for (final String haplotype : this.haplotypesStrings()) { + rtg.addSequence("anonymous", haplotype.getBytes(), false); + } + rtg.buildGraphIfNecessary(); + if (rtg.hasCycles()) + throw new RuntimeException("there is cycles in the reference with kmer size " + kmerSize + ". 
Don't use this size for the benchmark or change the reference"); + + List haplotypeList = this.haplotypeList(); + + assemblyResultSet = new AssemblyResultSet(); + final AssemblyResult ar = new AssemblyResult((haplotypeList.size() > 1 ? + AssemblyResult.Status.ASSEMBLED_SOME_VARIATION : AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE),rtg.convertToSequenceGraph()); + ar.setThreadingGraph(rtg); + + for (final Haplotype h : haplotypeList) + assemblyResultSet.add(h, ar); + } + return assemblyResultSet; + } + + public List haplotypesStrings() { + if (haplotypeStrings != null) { + return haplotypeStrings; + } + final List result = new ArrayList<>(haplotypeCigars.length); + String reference = this.reference; + for (final String cigar : haplotypeCigars) { + if (cigar.matches("^Civar:.*$")) { + stringRepresentation = cigar.substring(6); + result.addAll(expandAllCombinations(cigar.substring(6),reference)); + } else if (cigar.matches("^.*\\d+.*$")) { + result.add(applyCigar(reference, cigar,0,true)); + } else { + result.add(cigar); + } + } + haplotypeStrings = result; + return result; + } + + private List expandAllCombinations(final String cigarString, final String reference) { + final Civar civar = Civar.fromCharSequence(cigarString); + final List unrolledCivars = civar.optionalizeAll().unroll(); + List result = new ArrayList<>(unrolledCivars.size()); + for (final Civar c : unrolledCivars) { + result.add(c.applyTo(reference)); + } + return result; + } + + private List expandAllHaplotypeCombinations(final String civarString, final String reference) { + final Civar civar = Civar.fromCharSequence(civarString); + final List unrolledCivars = civar.optionalizeAll().unroll(); + List result = new ArrayList<>(unrolledCivars.size()); + for (final Civar c : unrolledCivars) { + final String baseString = c.applyTo(reference); + final Haplotype haplotype = new Haplotype(baseString.getBytes(),baseString.equals(reference)); + 
haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + try { + haplotype.setCigar(c.toCigar(reference.length())); + } catch (final RuntimeException ex) { + c.applyTo(reference); + c.toCigar(reference.length()); + throw new RuntimeException("" + c + " " + ex.getMessage(),ex); + } + result.add(haplotype); + } + return result; + } + + + public List haplotypeList() { + if (haplotypeList == null) { + + final List result = new ArrayList<>(haplotypeCigars.length); + final String reference = this.reference; + for (final String cigar : haplotypeCigars) { + if (cigar.matches("^Civar:.*$")) { + stringRepresentation = cigar.substring(6); + result.addAll(expandAllHaplotypeCombinations(cigar.substring(6), reference)); + } else if (cigar.matches("^.*\\d+.*$")) { + result.add(cigarToHaplotype(reference, cigar, 0, true)); + } else { + final Haplotype h = new Haplotype(cigar.getBytes()); + h.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + result.add(h); + } + } + haplotypeList = result; + } + return haplotypeList; + } + + + protected SAMSequenceDictionary artificialSAMSequenceDictionary() { + return new SAMSequenceDictionary(Collections.singletonList(new SAMSequenceRecord("00",reference.length()))); + } + + protected SAMFileHeader artificialSAMFileHeader() { + return ArtificialSAMUtils.createArtificialSamHeader(artificialSAMSequenceDictionary()); + } + + public List readList() { + if (readList == null) { + final SAMFileHeader header = artificialSAMFileHeader(); + readList = new ArrayList<>(readCigars.length); + final List haplotypes = haplotypesStrings(); + int count = 0; + for (final String descr : readCigars) { + String sequence; + if (descr.matches("^\\d+:\\d+:.+$")) { + final String[] parts = descr.split(":"); + int allele = Integer.valueOf(parts[0]); + int offset = Integer.valueOf(parts[1]); + final String cigar = parts[2]; + final String base = allele == 0 ? 
reference : haplotypes.get(allele - 1); + sequence = applyCigar(base, cigar, offset, false); + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); + readList.add(new MyGATKSAMRecord(samRecord)); + } else if (descr.matches("^\\*:\\d+:\\d+$")) { + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + readList.addAll(generateSamRecords(haplotypes, readCount, readLength, header, count)); + } else { + sequence = descr; + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); + readList.add(new MyGATKSAMRecord(samRecord)); + } + count = readList.size(); + } + } + return readList; + } + + public List> readEventOffsetList() { + if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) + throw new UnsupportedOperationException(); + if (readEventOffsetList == null) { + final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); + final List unrolledCivars = civar.optionalizeAll().unroll(); + + readEventOffsetList = new ArrayList<>(readCigars.length); + int count = 0; + for (final String descr : readCigars) { + if (descr.matches("^\\d+:\\d+:.+$")) { + throw new UnsupportedOperationException(); + } else if (descr.matches("^\\*:\\d+:\\d+$")) { + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + readEventOffsetList.addAll(generateElementOffsetRecords(haplotypesStrings(), unrolledCivars, readCount, readLength, count)); + } else { + throw new UnsupportedOperationException(); + } + count = readEventOffsetList.size(); + } + readEventOffsetList = Collections.unmodifiableList(readEventOffsetList); + } + return readEventOffsetList; + } + + + + + @SuppressWarnings("unused") + public String cigarToSequence(final 
String cigar) { + String reference = this.reference; + return applyCigar(reference, cigar,0,true); + } + + @SuppressWarnings("unused") + public GATKSAMRecord readFromString(final String readSequence) { + if (readBySequence == null) { + final List readList = readList(); + readBySequence = new HashMap<>(readList.size()); + for (final GATKSAMRecord r : readList) + readBySequence.put(r.getReadString(),r); + } + return readBySequence.get(readSequence); + } + + public List unrolledCivars() { + if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) + throw new UnsupportedOperationException(); + final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); + return civar.optionalizeAll().unroll(); + } + + public void introduceErrors(final Random rnd) { + final List reads = readList(); + final ArrayList result = new ArrayList<>(reads.size()); + for (final GATKSAMRecord read : reads) { + result.add(new MyGATKSAMRecord(read,rnd)); + } + readList = result; + } + + private class MyGATKSAMRecord extends GATKSAMRecord { + protected MyGATKSAMRecord(final GATKSAMRecord r) { + super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), + (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), + r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), + new byte[0]); + this.setReadBases(r.getReadBases()); + this.setBaseQualities(r.getBaseQualities()); + this.setReadName(r.getReadName()); + } + + ExponentialDistribution indelLengthDist = MathUtils.exponentialDistribution(1.0 / 0.9); + + public MyGATKSAMRecord(final GATKSAMRecord r, final Random rnd) { + super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), + (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), + r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), + new byte[0]); + final byte[] bases = new byte[r.getReadBases().length]; 
+ + final byte[] readBases = r.getReadBases(); + final byte[] bq = r.getBaseQualities(); + final byte[] iq = r.getBaseInsertionQualities(); + final byte[] dq = r.getBaseDeletionQualities(); + int refOffset = r.getAlignmentStart() - 1; + int readOffset = 0; + for (int i = 0; i < r.getReadBases().length;) { + double p = rnd.nextDouble(); + double iqp = QualityUtils.qualToErrorProb(iq[i]); + if (p < iqp) { // insertion + final int length = Math.min(generateIndelLength(rnd),r.getReadBases().length - i); + final int refStart = rnd.nextInt(reference.length() - length); + System.arraycopy(referenceBytes,refStart,bases,i,length); + i += length; + continue; + } + p -= iqp; + double dqp = QualityUtils.qualToErrorProb(dq[i]); + if (p < dqp) { + final int length = generateIndelLength(rnd); + refOffset += length; + refOffset = refOffset % referenceBytes.length; + readOffset += length; + continue; + } + p -= dqp; + double bqp = QualityUtils.qualToErrorProb(bq[i]); + byte b = readOffset < readBases.length ? 
readBases[readOffset] : referenceBytes[refOffset]; + byte nb; + if (p < bqp) { + switch (b) { + case 'A': nb = 'C'; break; + case 'T': nb = 'A'; break; + case 'C': nb = 'G'; break; + case 'G': nb = 'B'; break; + default: nb = 'A'; + } + } else + nb = b; + + bases[i++] = nb; + refOffset++; + refOffset = refOffset % referenceBytes.length; + readOffset++; + } + this.setReadBases(bases); + this.setBaseQualities(r.getBaseQualities()); + this.setReadName(r.getReadName()); + + + } + + private int generateIndelLength(final Random rnd) { + final int length; + try { + length = (int) Math.round(indelLengthDist.inverseCumulativeProbability(rnd.nextDouble()) + 1); + } catch (Exception e) { + throw new RuntimeException(e); + } + return length; + } + + @Override + public byte[] getBaseDeletionQualities() { + return Arrays.copyOf(dq,getReadLength()); + } + + @Override + public byte[] getBaseInsertionQualities() { + return Arrays.copyOf(iq,getReadLength()); + } + + @Override + public int getMappingQuality() { + return 100; + } + + @Override + public int hashCode() { + return getReadName().hashCode(); + } + + @Override + public boolean equals(Object o) { + if (o instanceof GATKSAMRecord) { + return getReadName().equals(((GATKSAMRecord)o).getReadName()); + } else { + return false; + } + } + + public String toString() { + return super.toString() + " " + this.getReadString(); + } + } + + + public List readStrings() { + final List result = new ArrayList<>(readCigars.length); + final List haplotypes = haplotypesStrings(); + for (final String descr : readCigars) { + String sequence; + if (descr.matches("^\\d+:\\d+:.+$")) { + final String[] parts = descr.split(":"); + int allele = Integer.valueOf(parts[0]); + int offset = Integer.valueOf(parts[1]); + final String cigar = parts[2]; + final String base = allele == 0 ? 
reference : haplotypes.get(allele - 1); + sequence = applyCigar(base, cigar, offset, false); + result.add(sequence); + } else if (descr.matches("\\*:^\\d+:\\d+")) { + int readCount = Integer.valueOf(descr.split(":")[1]); + int readLength = Integer.valueOf(descr.split(":")[2]); + result.addAll(generateReads(haplotypes, readCount, readLength)); + } else { + sequence = descr; + result.add(sequence); + } + } + return result; + } + + private List generateReads(final List haplotypes, final int readCount, final int readLength) { + final List result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % haplotypes.size(); + final String h = haplotypes.get(hi); + int offset = i % h.length() - readLength; + result.add(h.substring(offset,offset + readLength)); + } + return result; + } + + private List generateSamRecords(final List haplotypes, final int readCount, final int readLength, final SAMFileHeader header, final int idStart) { + int id = idStart; + final List result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % haplotypes.size(); + final String h = haplotypes.get(hi); + int offset = h.length() <= readLength ? 0 : i % (h.length() - readLength); + int to = Math.min(h.length(),offset + readLength); + byte[] bases = h.substring(offset,to).getBytes(); + byte[] quals = Arrays.copyOf(bq,to - offset); + final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header,"read_" + id++,0,offset + 1,bases, quals); + result.add(new MyGATKSAMRecord(samRecord)); + } + return result; + } + + + private List> generateElementOffsetRecords(final List haplotypes, final List unrolledCivars, final int readCount, final int readLength, final int count) { + + final List> result = new ArrayList<>(readCount); + for (int i = 0; i < readCount; i++) { + int hi = i % unrolledCivars.size(); + final Civar c = unrolledCivars.get(hi); + final String h = haplotypes.get(hi); + int offset = h.length() <= readLength ? 
0 : i % (h.length() - readLength); + int to = Math.min(h.length(),offset + readLength); + result.add(c.eventOffsets(reference,offset,to)); + } + return result; + } + + private static final Pattern cigarPattern = Pattern.compile("(\\d+)([=A-Z])"); + + + private Haplotype cigarToHaplotype(final String reference, final String cigar, final int offset, final boolean global) { + final String sequence = applyCigar(reference,cigar,offset,global); + final Haplotype haplotype = new Haplotype(sequence.getBytes(),reference.equals(sequence)); + haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); + haplotype.setCigar(Civar.fromCharSequence(cigar).toCigar(reference.length())); + return haplotype; + } + + private String applyCigar(final String reference, final String cigar, final int offset, final boolean global) { + final Matcher pm = cigarPattern.matcher(cigar); + StringBuffer sb = new StringBuffer(); + int index = offset; + while (pm.find()) { + int length = Integer.valueOf(pm.group(1)); + char operator = pm.group(2).charAt(0); + switch (operator) { + case '=' : + try { + sb.append(reference.substring(index, index + length)); + } catch (Exception e) { + throw new RuntimeException(" " + index + " " + (index + length) + " " + reference.length() + " " + cigar,e); + } + index += length; break; + case 'D' : + index += length; break; + case 'I' : + String insert = cigar.substring(pm.end(),pm.end() + length).toUpperCase(); + sb.append(insert); break; + case 'V' : + sb.append(transversionV(reference.charAt(index))); index++; break; + case 'W' : + sb.append(transversionW(reference.charAt(index))); index++; break; + case 'T' : + sb.append(transition(reference.charAt(index))); index++; break; + default: + throw new UnsupportedOperationException("cigar operator " + operator + " not supported."); + } + } + if (global && index != reference.length()) { + throw new RuntimeException(" haplotype cigar does not explain reference length (" + index + " != " + 
reference.length() + ") on cigar " + cigar); + } else if (index > reference.length()) { + throw new RuntimeException(" index beyond end "); + } + return sb.toString(); + } + + protected int kmerSize() { + return kmerSize; + } + + private char transversionV(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'C'; + case 'G': return 'T'; + case 'C': return 'A'; + case 'T': return 'G'; + default: + return c; + } + + } + + private char transversionW(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'T'; + case 'G': return 'C'; + case 'T': return 'A'; + case 'C': return 'G'; + default: + return c; + } + + } + + private char transition(final char c) { + switch (Character.toUpperCase(c)) { + case 'A': return 'G'; + case 'G': return 'A'; + case 'T': return 'C'; + case 'C': return 'T'; + default: + return c; + } + + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java index 25aedb149..3f0cb94f3 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ActiveRegionTestDataSetUnitTest.java @@ -55,7 +55,6 @@ import com.google.caliper.Param; import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.pairhmm.ActiveRegionTestDataSet; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.DataProvider; diff --git 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AssemblyResultSetUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AssemblyResultSetUnitTest.java index a8a90f37a..b1174e22a 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AssemblyResultSetUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/AssemblyResultSetUnitTest.java @@ -60,7 +60,6 @@ import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.RandomDNA; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.pairhmm.ActiveRegionTestDataSet; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/FastLoglessPairHMMUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/FastLoglessPairHMMUnitTest.java new file mode 100644 index 000000000..8773bbc63 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/FastLoglessPairHMMUnitTest.java @@ -0,0 +1,183 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.pairhmm.FastLoglessPairHMM; +import org.broadinstitute.gatk.utils.pairhmm.PairHMMReadyHaplotypes; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.*; + + +/** + * Created with IntelliJ IDEA. 
+ * User: valentin + * Date: 10/13/13 + * Time: 12:55 PM + * To change this template use File | Settings | File Templates. + */ +public class FastLoglessPairHMMUnitTest extends ActiveRegionTestDataSetUnitTest { + + private FastLoglessPairHMM unsorted = new FastLoglessPairHMM((byte)10); + private FastLoglessPairHMM sorted = new FastLoglessPairHMM((byte)10); + + @Test(enabled=false,dataProvider="activeRegionTestDataSets") + public void testActiveRegionsDataSet(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { + + } + + @Test(enabled=true,dataProvider="activeRegionTestDataSets") + public void testHaplotypeGrouped(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { + final List reads = as.readList(); + final List haplotypes = as.haplotypeList(); + PairHMMReadyHaplotypes haplotypeCollection = new PairHMMReadyHaplotypes(haplotypes.size()); + final List sortedHaplotypes = new ArrayList<>(haplotypes); + Collections.sort(sortedHaplotypes, HAPLOTYPE_COMPARATOR); + Map basesToPos = new HashMap<>(sortedHaplotypes.size()); + int nextIdx = 0; + + for (final Haplotype h : sortedHaplotypes) { + final byte[] bases = h.getBases(); + haplotypeCollection.add(bases); + basesToPos.put(bases,nextIdx++); + } + for (GATKSAMRecord read : reads) { + final double[] unsortedLikelihoods = new double[sortedHaplotypes.size()]; + final double[] sortedLikelihoods = new double[sortedHaplotypes.size()]; + unsorted.loadRead(read); + sorted.loadRead(read); + final Map unsortedResults = new HashMap<>(haplotypes.size()); + for (int i = 0; i < sortedHaplotypes.size(); i++) { + final Haplotype h = sortedHaplotypes.get(i); + final byte[] haplotypeBases = h.getBases().clone(); + unsorted.loadHaplotypeBases(haplotypeBases); + double lk 
= unsorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); + unsortedLikelihoods[i] = lk; + } + sorted.calculateLocalLikelihoods(0, read.getReadLength(), haplotypeCollection); + for (final PairHMMReadyHaplotypes.Entry entry : haplotypeCollection) { + final byte[] bases = entry.getBases(); + final double lk = entry.getLikelihood(); + final int haplotypePos = basesToPos.get(bases); + sortedLikelihoods[haplotypePos] = lk; + } + for (int i = 0; i < unsortedLikelihoods.length; i++) + Assert.assertEquals(unsortedLikelihoods[i],sortedLikelihoods[i],0.00000001,Arrays.toString(unsortedLikelihoods) + Arrays.toString(sortedLikelihoods)); + } + } + + @Test(enabled=true,dataProvider="activeRegionTestDataSets") + public void testSortedVsUnsorted(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { + final List reads = as.readList(); + final List haplotypes = as.haplotypeList(); + final List sortedHaplotypes = new ArrayList<>(haplotypes); + Collections.sort(sortedHaplotypes, HAPLOTYPE_COMPARATOR); + + byte[] lastHaplotypeBases = null; + for (GATKSAMRecord read : reads) { + final double[] unsortedLikelihoods = new double[sortedHaplotypes.size()]; + final double[] sortedLikelihoods = new double[sortedHaplotypes.size()]; + unsorted.loadRead(read); + sorted.loadRead(read); + for (int i = 0; i < sortedHaplotypes.size(); i++) { + final Haplotype h = sortedHaplotypes.get(i); + final byte[] haplotypeBases = h.getBases().clone(); + final byte[] haplotypeBases2 = haplotypeBases.clone(); + int commonPrefixEnd = 0; + + + if (lastHaplotypeBases != null) { + final int prefixEndLimit = Math.min(lastHaplotypeBases.length,haplotypeBases.length); + for (commonPrefixEnd = 0; commonPrefixEnd < prefixEndLimit; commonPrefixEnd++) + if (lastHaplotypeBases[commonPrefixEnd] != haplotypeBases[commonPrefixEnd]) + break; + } + + 
unsorted.loadHaplotypeBases(haplotypeBases); + sorted.changeHaplotypeSuffix(commonPrefixEnd, haplotypeBases, commonPrefixEnd, haplotypeBases.length); + Assert.assertTrue(Arrays.equals(haplotypeBases2, unsorted.getHaplotypeBases())); + Assert.assertTrue(Arrays.equals(haplotypeBases2, sorted.getHaplotypeBases())); + unsortedLikelihoods[i] = unsorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); + sortedLikelihoods[i] = sorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); + Assert.assertTrue(Arrays.equals(haplotypeBases2,unsorted.getHaplotypeBases())); + Assert.assertTrue(Arrays.equals(haplotypeBases2,sorted.getHaplotypeBases())); + Assert.assertEquals((double)unsortedLikelihoods[i], (double) sortedLikelihoods[i],0.00000001); + lastHaplotypeBases = haplotypeBases; + } + } + } + + public static final Comparator HAPLOTYPE_COMPARATOR = new Comparator() { + + @Override + public int compare(final Haplotype o1, final Haplotype o2) { + if (o1 == o2) + return 0; + final byte[] bases1 = o1.getBases(); + final byte[] bases2 = o2.getBases(); + final int ilimit = Math.min(bases1.length,bases2.length); + for (int i = 0; i < ilimit; i++) { + final int cmp = Byte.compare(bases1[i],bases2[i]); + if (cmp != 0) return cmp; + } + if (bases1.length == bases2.length) return 0; + return (bases1.length > bases2.length) ? -1 : 1; // is a bit better to get the longest haplotypes first. 
+ } + }; + + + + +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java index e898829ad..258db39e9 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HCLikelihoodCalculationEnginesBenchmark.java @@ -53,8 +53,7 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import com.google.caliper.Param; import com.google.caliper.SimpleBenchmark; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils; -import org.broadinstitute.gatk.utils.pairhmm.ActiveRegionTestDataSet; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.broadinstitute.gatk.utils.pairhmm.FastLoglessPairHMM; import org.broadinstitute.gatk.utils.pairhmm.PairHMM; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeBaseComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeBaseComparatorUnitTest.java new file mode 100644 index 000000000..dfd91ab24 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeBaseComparatorUnitTest.java @@ -0,0 +1,84 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
package org.broadinstitute.gatk.tools.walkers.haplotypecaller;

import org.broadinstitute.gatk.utils.BaseTest;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.haplotype.Haplotype;
import org.broadinstitute.gatk.utils.haplotype.HaplotypeBaseComparator;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * Verifies that {@link HaplotypeBaseComparator} orders haplotypes exactly like
 * lexicographic sorting of their base strings, starting from every possible
 * permutation of the input.
 */
public class HaplotypeBaseComparatorUnitTest extends BaseTest {
    @Test
    public void testComparison() {
        final List<String> rawStrings = Arrays.asList("A", "C", "AC", "CT", "GTC", "ACGT");
        // Expected order: plain lexicographic sort of the base strings.
        final List<String> lexStrings = new ArrayList<>(rawStrings);
        Collections.sort(lexStrings);

        // Every permutation of the inputs must sort back to lexicographic order.
        for ( final List<String> seqs : Utils.makePermutations(lexStrings, lexStrings.size(), false) ) {
            final List<Haplotype> haps = new ArrayList<>(seqs.size());
            for ( final String seq : seqs ) {
                haps.add(new Haplotype(seq.getBytes(), false));
            }

            Collections.sort(haps, new HaplotypeBaseComparator());
            for ( int i = 0; i < lexStrings.size(); i++ )
                Assert.assertEquals(haps.get(i).getBaseString(), lexStrings.get(i), "Failed sort " + haps + " expected " + lexStrings);
        }
    }
}
import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculatorUnitTest.java new file mode 100644 index 000000000..2ef8b7332 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeLDCalculatorUnitTest.java @@ -0,0 +1,123 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class HaplotypeLDCalculatorUnitTest extends BaseTest { + HaplotypeLDCalculator calculator; + + @BeforeMethod + public void setUp() throws Exception { + calculator = new HaplotypeLDCalculator(); + } + + /** + * Tests that we get the right values from the R^2 calculation + */ + @Test + public void computeProbOfBeingPhased() { + logger.warn("Executing testCalculateR2LD"); + + // See AA, AB, and BA in population + Assert.assertEquals(calculator.pPhasedTest(0, 0, 0, -100), 0, 0.00001); + + // See AA, AB, BB in population + Assert.assertTrue(calculator.pPhasedTest(0, 0, -100, 0) < 0.5); + + // See AA and BB in population + Assert.assertEquals(calculator.pPhasedTest(0, -100, -100, 0), 1, 0.00001); + + // See AA, AB, and BA but no BBs in population + Assert.assertEquals(calculator.pPhasedTest(0, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001); + + // See BB, AB, and BA but no AAs in population, so BB is the best explanation + Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, 0), 1, 0.00001); + + // See only AB and BA but no 
AAs nor BBs in population + Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001); + + // Previously bad input + Assert.assertEquals(calculator.pPhasedTest(-400, -600, -1200, Double.NEGATIVE_INFINITY), 0, 0.00001); + + // first variant is just bad, so BA and BB are both very bad, shouldn't be phased + Assert.assertEquals(calculator.pPhasedTest(0, -1000, -100, -10000), 0, 0.00001); + + // second variant is just bad, so AB and BB are both very bad, shouldn't be phased + Assert.assertEquals(calculator.pPhasedTest(0, -100, -1000, -10000), 0, 0.00001); + + // AA is very good, all all others are quite poor. Shouldn't be phased + Assert.assertEquals(calculator.pPhasedTest(0, -1000, -1000, -10000), 0, 0.00001); + + + for ( int i = -10; i > -10000; i -= 10 ) { + // only bad het states + Assert.assertTrue(calculator.pPhasedTest(0, i, i, 0) > 0.99, "Failed for " + i); + + // BB state is terrible + Assert.assertTrue(calculator.pPhasedTest(0, 0, 0, i) < 0.5, "Failed for " + i); + + // truth is AB, BA, and BB + Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, 0) < 0.5, "Failed for " + i); + + // truth is AB, BA + Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, i) < 0.5, "Failed for " + i); + + // Only good signal is AB, so we shouldn't be phased + Assert.assertTrue(calculator.pPhasedTest(i, i, 0, i) < 0.5, "Failed for " + i); + Assert.assertTrue(calculator.pPhasedTest(i, 0, i, i) < 0.5, "Failed for " + i); + } + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeScoreComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeScoreComparatorUnitTest.java new file mode 100644 index 000000000..b137c3c20 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeScoreComparatorUnitTest.java @@ -0,0 
+1,83 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. 
Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.haplotype.HaplotypeScoreComparator; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class HaplotypeScoreComparatorUnitTest extends BaseTest { + @Test + public void testComparison() { + final List scores = Arrays.asList(3.0, 2.0, 1.0); + for ( final List myScores : Utils.makePermutations(scores, scores.size(), false) ) { + final List haps = new ArrayList(myScores.size()); + for ( final double score : myScores ) { + final Haplotype h = new Haplotype("ACT".getBytes(), false); + h.setScore(score); + haps.add(h); + } + + Collections.sort(haps, new HaplotypeScoreComparator()); + for ( int i = 0; i < myScores.size(); i++ ) + Assert.assertEquals(haps.get(i).getScore(), scores.get(i)); + } + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeSizeAndBaseComparatorUnitTest.java 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeSizeAndBaseComparatorUnitTest.java new file mode 100644 index 000000000..593b3a833 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeSizeAndBaseComparatorUnitTest.java @@ -0,0 +1,89 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.haplotype.HaplotypeSizeAndBaseComparator; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +/** + * User: btaylor + * Date: 8/1/13 + * Time: 11:09 AM + */ +public class HaplotypeSizeAndBaseComparatorUnitTest extends BaseTest { + @Test + public void testComparison() { + // desired ordering is by size first, subordered by lexacographic relationship between bases + final List rawStrings = Arrays.asList("A", "C", "AC", "CC", "CT", "AAT", "ACT", "GAT", "ACGT"); + final List lexStrings = new ArrayList<>(rawStrings); + + for ( final List seqs : Utils.makePermutations(lexStrings, lexStrings.size(), false) ) { + final List haps = new ArrayList<>(seqs.size()); + for ( final String seq : seqs ) { + haps.add(new Haplotype(seq.getBytes(), false)); + } + + Collections.sort(haps, new HaplotypeSizeAndBaseComparator()); + for ( int i = 0; i < lexStrings.size(); i++ ) + 
Assert.assertEquals(haps.get(i).getBaseString(), lexStrings.get(i), "Failed sort " + haps + " expected " + lexStrings); + } + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMergerUnitTest.java new file mode 100644 index 000000000..68c6bea73 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/LDMergerUnitTest.java @@ -0,0 +1,341 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import htsjdk.samtools.TextCigarCodec; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import org.broadinstitute.gatk.utils.haplotype.EventMap; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.TreeSet; + +public class LDMergerUnitTest extends BaseTest { + LDMerger merger; + GenomeLocParser genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); + } + + @BeforeMethod + public void setUp() throws Exception { + merger = new LDMerger(); + } + + @Test + public void 
testCreateMergedVariantContext() { + logger.warn("Executing testCreateMergedVariantContext"); + + final byte[] ref = "AATTCCGGAATTCCGGAATT".getBytes(); + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); + + // SNP + SNP = simple MNP + VariantContext thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + VariantContext nextVC = new VariantContextBuilder().loc("2", 1704, 1704).alleles("C","G").make(); + VariantContext truthVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","GG").source("merged").make(); + VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // SNP + ref + SNP = MNP with ref base gap + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCG").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // insertion + SNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + 
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // SNP + insertion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CAAAAA").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // deletion + SNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","T").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // SNP + deletion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + 
// insertion + deletion = MNP + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // insertion + deletion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // insertion + insertion + thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); + nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CA").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // deletion + deletion + thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); + nextVC = new 
VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); + truthVC = new VariantContextBuilder().loc("2", 1701, 1706).alleles("ATTCCG","ATCC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // deletion + insertion (abutting) + thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); + nextVC = new VariantContextBuilder().loc("2", 1702, 1702).alleles("T","GCGCGC").make(); + truthVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","AGCGCGC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + + // complex + complex + thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make(); + nextVC = new VariantContextBuilder().loc("2", 1706, 1707).alleles("GG","AC").make(); + truthVC = new VariantContextBuilder().loc("2", 1703, 1707).alleles("TCCGG","AAACAC").source("merged").make(); + mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + logger.warn(truthVC + " == " + mergedVC); + Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); + Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); + Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); + } + + @Test + public void testInsertionDeletionBecomingNullAllele() { + final byte[] ref = "CAAA".getBytes(); + final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); + + // insertion + deletion results in a null allele, should return 
false + final VariantContext thisVC = new VariantContextBuilder().loc("2", 1700, 1701).alleles("CA","C").make(); + final VariantContext nextVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("A","AA").make(); + final VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); + Assert.assertNull(mergedVC, "Insertion deletion becoming a null allele should return a null variant context"); + } + + /** + * Just returns a given R2 value for testing + */ + private static class MockLDCalculator extends HaplotypeLDCalculator { + private final double R2; + + private MockLDCalculator(double r2) { + R2 = r2; + } + + @Override + protected double computeProbOfBeingPhased(VariantContext first, VariantContext second) { + return R2; + } + } + + @DataProvider(name = "R2MergerData") + public Object[][] makeR2MergerData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + final double thres = LDMerger.MERGE_EVENTS_PROB_PHASED_THRESHOLD; + for ( final double r2 : Arrays.asList(0.0, thres - 0.01, thres + 0.01, 1.0) ) { + tests.add(new Object[]{"ACGT", "CCGC", 2, "4M", "ACGT", "CCGC", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "AGGC", 2, "4M", "CGT", "GGC", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "ACCC", 2, "4M", "GT", "CC", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "ACCGTT", 2, "2M1I1M1I1M", "CG", "CCGT", r2, r2 >= thres}); + tests.add(new Object[]{"ACGT", "AGCT", 2, "4M", "CG", "GC", r2, r2 >= thres}); + tests.add(new Object[]{"ACAGT", "AAGC", 2, "1M1D3M", "ACAGT", "AAGC", r2, r2 >= thres}); + tests.add(new Object[]{"ACAGT", "AAT", 2, "1M1D1M1D1M", "ACAG", "AA", r2, r2 >= thres}); + + // cannot be merged -- only 1 event + tests.add(new Object[]{"AAA", "ACA", 1, "3M", null, null, r2, false}); + + final int dist = LDMerger.MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE + 2; + tests.add(new Object[]{Utils.dupString("A", dist), "C" + 
Utils.dupString("A", dist - 2) + "C", 2, dist + "M", null, null, r2, false}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "R2MergerData") + public void testR2Merger(final String refS, final String hapS, int nEvents, final String cigar, final String expectedMergedRef, final String expectedMergedAlt, final double r2, final boolean expectMerge) { + final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M")); + final Haplotype hap = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); + + final List haplotypes = Arrays.asList(ref, hap); + final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); + final MockLDCalculator r2Calc = new MockLDCalculator(r2); + + Assert.assertEquals(vcStarts.size(), nEvents); + final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); + Assert.assertEquals(merged, expectMerge); + Assert.assertEquals(vcStarts.size(), expectMerge ? 
1 : nEvents); + if ( expectMerge ) { + final VariantContext vc = hap.getEventMap().getVariantContexts().iterator().next(); + Assert.assertTrue(vc.isBiallelic()); + Assert.assertEquals(vc.getReference().getDisplayString(), expectedMergedRef); + Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), expectedMergedAlt); + } + } + + @Test + public void testR2MergerWithThirdHapWithoutEvent() { + final String refS = "ACGT"; + final String hapS = "CCGA"; + final String cigar = "4M"; + final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M")); + final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + final Haplotype hap2 = new Haplotype("ACGA".getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); + + final List haplotypes = Arrays.asList(ref, hap1, hap2); + final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); + final MockLDCalculator r2Calc = new MockLDCalculator(1.0); + + Assert.assertEquals(vcStarts.size(), 2); + final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); + Assert.assertEquals(merged, true); + Assert.assertEquals(vcStarts.size(), 1); + + final VariantContext vc = hap1.getEventMap().getVariantContexts().iterator().next(); + Assert.assertTrue(vc.isBiallelic()); + Assert.assertEquals(vc.getReference().getDisplayString(), "ACGT"); + Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), "CCGA"); + + Assert.assertEquals(hap2.getEventMap().size(), 0); + } + + @Test + public void testR2MergerWithMultipleAllelesAtSites() { + final String refS = "ACGT"; + final String hapS = "TCGA"; + final String cigar = "4M"; + final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + 
"M")); + final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + + final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); + for (final String hap2S : Arrays.asList("GCGA", "TCGG")) { + final Haplotype hap2 = new Haplotype(hap2S.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); + + final List haplotypes = Arrays.asList(ref, hap1, hap2); + final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); + final MockLDCalculator r2Calc = new MockLDCalculator(1.0); + + Assert.assertEquals(vcStarts.size(), 2); + final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); + Assert.assertEquals(merged, false); + Assert.assertEquals(vcStarts.size(), 2); + } + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java index cb1f31a84..0040dd7bf 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngineUnitTest.java @@ -61,8 +61,8 @@ import org.broadinstitute.gatk.utils.BaseTest; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.pairhmm.PairHMM; -import org.broadinstitute.gatk.utils.recalibration.covariates.RepeatCovariate; -import org.broadinstitute.gatk.utils.recalibration.covariates.RepeatLengthCovariate; +import org.broadinstitute.gatk.engine.recalibration.covariates.RepeatCovariate; +import 
org.broadinstitute.gatk.engine.recalibration.covariates.RepeatLengthCovariate; import htsjdk.variant.variantcontext.*; import org.testng.Assert; import org.testng.annotations.BeforeSuite; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMProbabilityBugIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMProbabilityBugIntegrationTest.java new file mode 100644 index 000000000..4ec4c4cbf --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/PairHMMProbabilityBugIntegrationTest.java @@ -0,0 +1,92 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.haplotypecaller; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; + +/** + * Test for the Prob > 1 bug in PairHMM using callers. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class PairHMMProbabilityBugIntegrationTest extends WalkerTest { + + private static final File REFERENCE = new File("/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta").getAbsoluteFile(); + private static final File BAM = new File (privateTestDir, "pairhmm_prob_bug.bam").getAbsoluteFile(); + private static final File INTERVAL = new File (privateTestDir, "pairhmm_prob_bug.interval.bed").getAbsoluteFile(); + + private static final File UG_BAM = new File(privateTestDir, "pairhmm_prob_bug.ug.bam").getAbsoluteFile(); + private static final File UG_INTERVAL = new File(privateTestDir, "pairhmm_prob_bug.ug.intervals.bed").getAbsoluteFile(); + + + @Test + public void testHaplotypeCaller() { + final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s", + REFERENCE,BAM,INTERVAL); + final String name = getClass().getSimpleName() + ".testHaplotypeCaller"; + final WalkerTestSpec spec = new 
WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); + executeTest(name, spec); + } + + @Test + public void testUnifiedGenotyper() { + final String commandLine = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -dcov 200 -glm INDEL", + REFERENCE,UG_BAM,UG_INTERVAL); + final String name = getClass().getSimpleName() + ".testUnifiedGenotyper"; + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); + executeTest(name, spec); + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java index 2c702d0b1..2ce464dd2 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReadThreadingLikelihoodCalculationEngineUnitTest.java @@ -52,15 +52,12 @@ package org.broadinstitute.gatk.tools.walkers.haplotypecaller; import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleListUtils; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.HaplotypeGraph; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.pairhmm.ActiveRegionTestDataSet; -import org.broadinstitute.gatk.utils.pairhmm.FastLoglessPairHMM; -import org.broadinstitute.gatk.utils.pairhmm.FlexibleHMM; -import org.broadinstitute.gatk.utils.pairhmm.PairHMM; +import org.broadinstitute.gatk.utils.pairhmm.*; 
import org.broadinstitute.gatk.utils.sam.ClippedGATKSAMRecord; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.testng.Assert; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java index 6eb207bf1..f7a4059e0 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -60,6 +60,8 @@ import org.broadinstitute.gatk.tools.walkers.genotyper.*; import org.broadinstitute.gatk.utils.*; import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleListUtils; import org.broadinstitute.gatk.utils.haplotype.Haplotype; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalWalkerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalWalkerUnitTest.java index 13538d304..964d6c151 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalWalkerUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalWalkerUnitTest.java @@ -55,9 +55,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval; // the imports for unit testing. 
import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.VariantEvaluator; import org.broadinstitute.gatk.tools.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.gatk.tools.walkers.varianteval.stratifications.manager.StratificationManager; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index e47b26758..3d5463d0f 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -51,9 +51,8 @@ package org.broadinstitute.gatk.tools.walkers.variantrecalibration; +import org.broadinstitute.gatk.utils.variant.VCIterable; import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; -import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFCodec; import org.testng.Assert; @@ -340,7 +339,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { final List outputFiles = executeTest("testApplyRecalibrationSnpAndIndelTogether", 
spec).getFirst(); setPDFsForDeletion(outputFiles); final File VCF = outputFiles.get(0); - for( final VariantContext VC : GATKVCFUtils.readAllVCs(VCF, new VCFCodec()).getSecond() ) { + for( final VariantContext VC : VCIterable.readAllVCs(VCF, new VCFCodec()).getSecond() ) { if( VC != null ) { Assert.assertTrue(VC.isNotFiltered()); // there should only be unfiltered records in the output VCF file } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java index 891ad8d38..8e176089d 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineGVCFsIntegrationTest.java @@ -52,7 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.variantcontext.VariantContext; import org.testng.Assert; import org.testng.annotations.Test; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsUnitTest.java index a301adc02..d3a145be6 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariantsUnitTest.java @@ -52,7 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import 
htsjdk.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import htsjdk.variant.vcf.VCFCodec; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java index f72bfc415..b9cfc0949 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariantsUnitTest.java @@ -51,29 +51,22 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMReadGroupRecord; import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import 
org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.io.File; -import java.io.FileNotFoundException; import java.util.*; /** diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java new file mode 100644 index 000000000..08bb0b4a2 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantContextMergerUnitTest.java @@ -0,0 +1,280 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.variantutils; + +import htsjdk.variant.variantcontext.*; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Tests {@link org.broadinstitute.gatk.tools.walkers.variantutils.ReferenceConfidenceVariantContextMerger}. 
+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class VariantContextMergerUnitTest extends BaseTest { + Allele Aref, T, C, G, Cref, ATC, ATCATC; + Allele ATCATCT; + Allele ATref; + Allele Anoref; + Allele GT; + + private GenomeLocParser genomeLocParser; + + @BeforeSuite + public void setup() throws IOException { + // alleles + Aref = Allele.create("A", true); + Cref = Allele.create("C", true); + T = Allele.create("T"); + C = Allele.create("C"); + G = Allele.create("G"); + ATC = Allele.create("ATC"); + ATCATC = Allele.create("ATCATC"); + ATCATCT = Allele.create("ATCATCT"); + ATref = Allele.create("AT",true); + Anoref = Allele.create("A",false); + GT = Allele.create("GT",false); + genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference))); + } + + @Test(dataProvider = "referenceConfidenceMergeData") + public void testReferenceConfidenceMerge(final String testID, final List toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) { + final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(toMerge, loc, returnSiteEvenIfMonomorphic ? 
(byte) 'A' : null, true); + if ( result == null ) { + Assert.assertTrue(expectedResult == null); + return; + } + Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles(),testID); + Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples(),testID); + for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) { + Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName()); + // use string comparisons to test equality for now + Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); + } + } + + @Test + public void testGenerateADWithNewAlleles() { + + final int[] originalAD = new int[] {1,2,0}; + final int[] indexesOfRelevantAlleles = new int[] {0,1,2,2}; + + final int[] newAD = ReferenceConfidenceVariantContextMerger.generateAD(originalAD, indexesOfRelevantAlleles); + Assert.assertEquals(newAD, new int[]{1,2,0,0}); + } + + + @Test(expectedExceptions = UserException.class) + public void testGetIndexesOfRelevantAllelesWithNoALT() { + + final List alleles1 = new ArrayList<>(1); + alleles1.add(Allele.create("A", true)); + final List alleles2 = new ArrayList<>(1); + alleles2.add(Allele.create("A", true)); + ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1); + Assert.fail("We should have thrown an exception because the allele was not present"); + } + + @Test(dataProvider = "getIndexesOfRelevantAllelesData") + public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List allAlleles) { + final List myAlleles = new ArrayList<>(3); + + // always add the reference and alleles + myAlleles.add(allAlleles.get(0)); + myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + // optionally add another alternate allele + if ( allelesIndex > 0 ) + myAlleles.add(allAlleles.get(allelesIndex)); + + final int[] indexes = 
ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1); + + Assert.assertEquals(indexes.length, allAlleles.size()); + + for ( int i = 0; i < allAlleles.size(); i++ ) { + if ( i == 0 ) + Assert.assertEquals(indexes[i], 0); // ref should always match + else if ( i == allelesIndex ) + Assert.assertEquals(indexes[i], 2); // allele + else + Assert.assertEquals(indexes[i], 1); // + } + } + + + @DataProvider(name = "referenceConfidenceMergeData") + public Object[][] makeReferenceConfidenceMergeData() { + final List tests = new ArrayList<>(); + final int start = 10; + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); + final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); + final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); + + final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; + final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10}; + + final List noCalls = new ArrayList<>(2); + noCalls.add(Allele.NO_CALL); + noCalls.add(Allele.NO_CALL); + + final List A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make(); + final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); + final Allele AAref = Allele.create("AA", true); + final List AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make(); + final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); + final List A_C = Arrays.asList(Aref, C); + final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make(); + final List A_C_ALT = 
Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); + final List A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); + final List A_C_G = Arrays.asList(Aref, C, G); + final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(); + final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make(); + final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); + final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); + final Allele A = Allele.create("A", false); + final List AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make(); + final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); + + // first test the case of a single record + tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()}); + + // 
now, test pairs: + // a SNP with another SNP + tests.add(new Object[]{"test01",Arrays.asList(vcA_C_ALT, vcA_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); + // a SNP with an indel + tests.add(new Object[]{"test02",Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); + // a SNP with 2 SNPs + tests.add(new Object[]{"test03",Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); + // a SNP with a ref record + tests.add(new Object[]{"test04",Arrays.asList(vcA_C_ALT, vcA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); + + // spanning records: + // a SNP with a spanning ref record + tests.add(new Object[]{"test05",Arrays.asList(vcA_C_ALT, vcAA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); + // a SNP with a spanning deletion + tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()}); + + // combination of all + tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(), + new GenotypeBuilder("A_ATC").PL(new 
int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(), + new GenotypeBuilder("A_C_G").PL(new int[]{40,20,30,20,10,30,71,72,73,74}).alleles(noCalls).make(), + new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).alleles(noCalls).make(), + new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).alleles(noCalls).make(), + new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).alleles(noCalls).make()).make()}); + + // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts + tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT), + + loc, false, + null}); + tests.add(new Object[]{"test09", Arrays.asList(vcAA_ALT), + loc, true, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).alleles(noCalls).make()).make()}); + + final Object[][] result = tests.toArray(new Object[][]{}); + return result; + } + @DataProvider(name = "getIndexesOfRelevantAllelesData") + public Object[][] makeGetIndexesOfRelevantAllelesData() { + final int totalAlleles = 5; + final List alleles = new ArrayList<>(totalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < totalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + + final List tests = new ArrayList<>(); + + for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) { + tests.add(new Object[]{alleleIndex, alleles}); + } + + return tests.toArray(new Object[][]{}); + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IndexedSetUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IndexedSetUnitTest.java index 56a07058f..2dafcb70d 100644 --- 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IndexedSetUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IndexedSetUnitTest.java @@ -51,7 +51,7 @@ package org.broadinstitute.gatk.utils.collections; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -68,7 +68,7 @@ public class IndexedSetUnitTest { @Test(dataProvider = "initialCapacityElementCountMaxElementData") public void testCompositionBySingleElementAddition(final int initialCapacity, final int elementCount, final int maxElement) { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final IndexedSet subject = new IndexedSet<>(initialCapacity); final Set elementSet = new LinkedHashSet<>(); @@ -111,7 +111,7 @@ public class IndexedSetUnitTest { } private List generateElementCollection(final int elementCount, final int maxElement) { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final List elementList = new ArrayList<>(elementCount); for (int i = 0; i < elementCount; i++) @@ -163,7 +163,7 @@ public class IndexedSetUnitTest { final IndexedSet subject = new IndexedSet<>(elementList); final Set elementSet = new LinkedHashSet<>(elementList); final int removeCount = (subject.size() + 1) / 2; - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (int i = 0; i < removeCount; i++) { final int removeIndex = rnd.nextInt(subject.size()); final int removeElement = subject.get(removeIndex); @@ -181,7 +181,7 @@ public class IndexedSetUnitTest { final IndexedSet subject = new IndexedSet<>(elementList); final Set elementSet = new LinkedHashSet<>(elementList); final int removeCount = 
subject.size(); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (int i = 0; i < removeCount; i++) { final int removeIndex = rnd.nextInt(subject.size()); final int removeElement = subject.get(removeIndex); @@ -211,7 +211,7 @@ public class IndexedSetUnitTest { final IndexedSet subject = new IndexedSet<>(elementList); final Set elementSet = new LinkedHashSet<>(elementList); final int removeCount = subject.size(); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (int i = 0; i < removeCount; i++) { final int removeIndex = rnd.nextInt(subject.size()); final int removeElement = subject.get(removeIndex); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IntMaxHeapUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IntMaxHeapUnitTest.java index 03f19491f..aeab35ad4 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IntMaxHeapUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/collections/IntMaxHeapUnitTest.java @@ -51,7 +51,7 @@ package org.broadinstitute.gatk.utils.collections; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.Utils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -73,7 +73,7 @@ public class IntMaxHeapUnitTest { final IntMaxHeap heap = new IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (int i = 0; i < elementCount; i++) { final int v = rnd.nextInt(); @@ -85,7 +85,7 @@ public class IntMaxHeapUnitTest { public void testEmptynessAndSize(final int initialCapacity, final int elementCount) { final IntMaxHeap heap = new 
IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); Assert.assertEquals(heap.size(),0); Assert.assertTrue(heap.isEmpty()); @@ -101,7 +101,7 @@ public class IntMaxHeapUnitTest { public void testClear(final int initialCapacity, final int elementCount) { final IntMaxHeap heap = new IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); for (int i = 0; i < elementCount; i++) { final int v = rnd.nextInt(); @@ -118,7 +118,7 @@ public class IntMaxHeapUnitTest { final IntMaxHeap addHeap = new IntMaxHeap(initialCapacity); final IntMaxHeap arrayAddHeap = new IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final int[] values = new int[elementCount]; for (int i = 0; i < elementCount; i++) { final int v = rnd.nextInt(); @@ -135,7 +135,7 @@ public class IntMaxHeapUnitTest { public void testRemove(final int initialCapacity, final int elementCount) { final IntMaxHeap heap = new IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); final List values = new ArrayList<>(elementCount); for (int i = 0; i < elementCount; i++) { final int v = rnd.nextInt(); @@ -154,7 +154,7 @@ public class IntMaxHeapUnitTest { public void testPeek(final int initialCapacity, final int elementCount) { final IntMaxHeap heap = new IntMaxHeap(initialCapacity); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); int top = rnd.nextInt(); heap.add(top); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffEngineUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffEngineUnitTest.java new file mode 
100644 index 000000000..e00ac7e8e --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffEngineUnitTest.java @@ -0,0 +1,259 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. 
LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. 
Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + + +// the imports for unit testing. + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.diffengine.DiffElement; +import org.broadinstitute.gatk.utils.diffengine.DiffEngine; +import org.broadinstitute.gatk.utils.diffengine.DiffNode; +import org.broadinstitute.gatk.utils.diffengine.Difference; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DiffEngineUnitTest extends BaseTest { + DiffEngine engine; + + @BeforeClass(enabled = true) + public void createDiffEngine() { + engine = new DiffEngine(); + } + + // -------------------------------------------------------------------------------- + // + // Difference testing routines + // + // -------------------------------------------------------------------------------- + + private class DifferenceTest extends TestDataProvider { + public DiffElement tree1, tree2; + public List differences; + + private DifferenceTest(String tree1, String tree2) { + this(tree1, tree2, Collections.emptyList()); + } + + private DifferenceTest(String tree1, String 
tree2, String difference) { + this(tree1, tree2, Arrays.asList(difference)); + } + + private DifferenceTest(String tree1, String tree2, List differences) { + super(DifferenceTest.class); + this.tree1 = DiffNode.fromString(tree1); + this.tree2 = DiffNode.fromString(tree2); + this.differences = differences; + } + + public String toString() { + return String.format("tree1=%s tree2=%s diff=%s", + tree1.toOneLineString(), tree2.toOneLineString(), differences); + } + } + + @DataProvider(name = "trees") + public Object[][] createTrees() { + new DifferenceTest("A=X", "A=X"); + new DifferenceTest("A=X", "A=Y", "A:X!=Y"); + new DifferenceTest("A=X", "B=X", Arrays.asList("A:X!=MISSING", "B:MISSING!=X")); + new DifferenceTest("A=(X=1)", "B=(X=1)", Arrays.asList("A:(X=1)!=MISSING", "B:MISSING!=(X=1)")); + new DifferenceTest("A=(X=1)", "A=(X=1)"); + new DifferenceTest("A=(X=1 Y=2)", "A=(X=1 Y=2)"); + new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2 B=(Z=3))"); + new DifferenceTest("A=(X=1)", "A=(X=2)", "A.X:1!=2"); + new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2 B=(Z=4))", "A.B.Z:3!=4"); + new DifferenceTest("A=(X=1)", "A=(X=1 Y=2)", "A.Y:MISSING!=2"); + new DifferenceTest("A=(X=1 Y=2 B=(Z=3))", "A=(X=1 Y=2)", "A.B:(Z=3)!=MISSING"); + return DifferenceTest.getTests(DifferenceTest.class); + } + + @Test(enabled = true, dataProvider = "trees") + public void testDiffs(DifferenceTest test) { + logger.warn("Test tree1: " + test.tree1.toOneLineString()); + logger.warn("Test tree2: " + test.tree2.toOneLineString()); + + List diffs = engine.diff(test.tree1, test.tree2); + logger.warn("Test expected diff : " + test.differences); + logger.warn("Observed diffs : " + diffs); + } + + // -------------------------------------------------------------------------------- + // + // Low-level routines for summarizing differences + // + // -------------------------------------------------------------------------------- + + @Test(enabled = true) + public void testLongestCommonPostfix() { 
+ testLongestCommonPostfixHelper("A", "A", 1); + testLongestCommonPostfixHelper("A", "B", 0); + testLongestCommonPostfixHelper("A.B", "A.B", 2); + testLongestCommonPostfixHelper("A.B.C", "A.B.C", 3); + testLongestCommonPostfixHelper("A.B.C", "X.B.C", 2); + testLongestCommonPostfixHelper("A.B.C", "X.Y.C", 1); + testLongestCommonPostfixHelper("A.B.C", "X.Y.Z", 0); + testLongestCommonPostfixHelper("A.B.C", "A.X.C", 1); + testLongestCommonPostfixHelper("A.B.C", "A.X.Z", 0); + testLongestCommonPostfixHelper("A.B.C", "A.B.Z", 0); + } + + public void testLongestCommonPostfixHelper(String p1, String p2, int expected) { + String[] parts1 = p1.split("\\."); + String[] parts2 = p2.split("\\."); + int obs = DiffEngine.longestCommonPostfix(parts1, parts2); + Assert.assertEquals(obs, expected, "p1=" + p1 + " p2=" + p2 + " failed"); + } + + @Test(enabled = true, dependsOnMethods = "testLongestCommonPostfix") + public void testSummarizePath() { + testSummarizePathHelper("A", "A", "A"); + testSummarizePathHelper("A", "B", "*"); + testSummarizePathHelper("A.B", "A.B", "A.B"); + testSummarizePathHelper("A.B", "X.B", "*.B"); + testSummarizePathHelper("A.B", "X.Y", "*.*"); + testSummarizePathHelper("A.B.C", "A.B.C", "A.B.C"); + testSummarizePathHelper("A.B.C", "X.B.C", "*.B.C"); + testSummarizePathHelper("A.B.C", "X.Y.C", "*.*.C"); + testSummarizePathHelper("A.B.C", "X.Y.Z", "*.*.*"); + testSummarizePathHelper("A.B.C", "A.X.C", "*.*.C"); + testSummarizePathHelper("A.B.C", "A.X.Z", "*.*.*"); + testSummarizePathHelper("A.B.C", "A.B.Z", "*.*.*"); + } + + public void testSummarizePathHelper(String p1, String p2, String expected) { + String[] parts1 = DiffEngine.diffNameToPath(p1); + String[] parts2 = DiffEngine.diffNameToPath(p2); + int obs = DiffEngine.longestCommonPostfix(parts1, parts2); + String path = DiffEngine.summarizedPath(parts2, obs); + Assert.assertEquals(path, expected, "p1=" + p1 + " p2=" + p2 + " failed"); + } + + // 
-------------------------------------------------------------------------------- + // + // High-level difference summary + // + // -------------------------------------------------------------------------------- + + private class SummarizeDifferenceTest extends TestDataProvider { + List diffs = new ArrayList(); + List expecteds = new ArrayList(); + + public SummarizeDifferenceTest() { super(SummarizeDifferenceTest.class); } + + public SummarizeDifferenceTest addDiff(String... diffsToAdd) { + diffs.addAll(Arrays.asList(diffsToAdd)); + return this; + } + + public SummarizeDifferenceTest addSummary(String... expectedSummary) { + expecteds.addAll(Arrays.asList(expectedSummary)); + return this; + } + + public String toString() { + return String.format("diffs=%s => expected=%s", diffs, expecteds); + } + + public void test() { + List diffPaths = new ArrayList(diffs.size()); + for ( String diff : diffs ) { diffPaths.add(DiffEngine.diffNameToPath(diff)); } + + List sumDiffs = engine.summarizedDifferencesOfPathsFromString(diffs); + + Assert.assertEquals(sumDiffs.size(), expecteds.size(), "Unexpected number of summarized differences: " + sumDiffs); + + for ( int i = 0; i < sumDiffs.size(); i++ ) { + Difference sumDiff = sumDiffs.get(i); + String expected = expecteds.get(i); + String[] pathCount = expected.split(":"); + String path = pathCount[0]; + int count = Integer.valueOf(pathCount[1]); + Assert.assertEquals(sumDiff.getPath(), path, "Unexpected path at: " + expected + " obs=" + sumDiff + " all=" + sumDiffs); + Assert.assertEquals(sumDiff.getCount(), count, "Unexpected counts at: " + expected + " obs=" + sumDiff + " all=" + sumDiffs); + } + } + } + + @DataProvider(name = "summaries") + public Object[][] createSummaries() { + new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); + new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); + new SummarizeDifferenceTest().addDiff("A", "A", "A").addSummary("A:3"); + new 
SummarizeDifferenceTest().addDiff("A", "A", "A", "B").addSummary("A:3", "B:1"); + new SummarizeDifferenceTest().addDiff("A", "A", "A", "B", "B").addSummary("A:3", "B:2"); + new SummarizeDifferenceTest().addDiff("A", "A", "A", "B", "B", "C").addSummary("A:3", "B:2", "C:1"); + new SummarizeDifferenceTest().addDiff("A.X", "A.X").addSummary("A.X:2"); + new SummarizeDifferenceTest().addDiff("A.X", "A.X", "B.X").addSummary("*.X:3", "A.X:2", "B.X:1"); + new SummarizeDifferenceTest().addDiff("A.X", "A.X", "B.X", "B.X").addSummary("*.X:4", "A.X:2", "B.X:2"); + new SummarizeDifferenceTest().addDiff("A.B.C", "X.B.C").addSummary("*.B.C:2", "A.B.C:1", "X.B.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "X.Y.C", "X.Y.C").addSummary("*.*.C:3", "X.Y.C:2", "A.B.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "X.Y.C").addSummary("*.*.C:3", "A.B.C:1", "A.X.C:1", "X.Y.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "B.X.C").addSummary("*.*.C:3", "*.X.C:2", "A.B.C:1", "A.X.C:1", "B.X.C:1"); + new SummarizeDifferenceTest().addDiff("A.B.C", "A.X.C", "B.X.C", "B.X.C").addSummary("*.*.C:4", "*.X.C:3", "B.X.C:2", "A.B.C:1", "A.X.C:1"); + + return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); + } + + + @Test(enabled = true, dependsOnMethods = "testSummarizePath", dataProvider = "summaries") + public void testSummarizeDifferences(SummarizeDifferenceTest test) { + test.test(); + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffNodeUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffNodeUnitTest.java new file mode 100644 index 000000000..38252223a --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffNodeUnitTest.java @@ -0,0 +1,278 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* 
SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. 
Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + + +// the imports for unit testing. 
+ + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.diffengine.DiffElement; +import org.broadinstitute.gatk.utils.diffengine.DiffNode; +import org.broadinstitute.gatk.utils.diffengine.DiffValue; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DiffNodeUnitTest extends BaseTest { + // Data is: + // MY_ROOT + // fields: A=A, B=B + // nodes: C, D + // C: fields: E=E, nodes: none + // D: fields: F=F, G=G, nodes: none + static DiffNode MY_ROOT = DiffNode.rooted("MY_ROOT"); + static DiffValue Value_A = new DiffValue("A", MY_ROOT, "A"); + static DiffValue Value_B = new DiffValue("B", MY_ROOT, "B"); + static DiffNode NODE_C = DiffNode.empty("C", MY_ROOT); + static DiffNode NODE_D = DiffNode.empty("D", MY_ROOT); + static DiffValue Value_E = new DiffValue("E", NODE_C, "E"); + static DiffValue Value_F = new DiffValue("F", NODE_D, "F"); + static DiffValue Value_G = new DiffValue("G", NODE_D, "G"); + + static { + MY_ROOT.add(Value_A); + MY_ROOT.add(Value_B); + MY_ROOT.add(NODE_C); + MY_ROOT.add(NODE_D); + NODE_C.add(Value_E); + NODE_D.add(Value_F); + NODE_D.add(Value_G); + } + + + // -------------------------------------------------------------------------------- + // + // Element testing routines + // + // -------------------------------------------------------------------------------- + + private class ElementTest extends TestDataProvider { + public DiffElement elt; + public String name; + public String fullName; + public DiffElement parent; + + private ElementTest(DiffValue elt, DiffValue parent, String name, String fullName) { + this(elt.getBinding(), parent.getBinding(), name, fullName); + } + + private ElementTest(DiffElement elt, DiffElement parent, String name, String fullName) { + super(ElementTest.class); + this.elt = elt; + this.name = name; + this.fullName = 
fullName; + this.parent = parent; + } + + public String toString() { + return String.format("ElementTest elt=%s name=%s fullName=%s parent=%s", + elt.toOneLineString(), name, fullName, parent.getName()); + } + } + + @DataProvider(name = "elementdata") + public Object[][] createElementData() { + new ElementTest(MY_ROOT.getBinding(), DiffElement.ROOT, "MY_ROOT", "MY_ROOT"); + new ElementTest(NODE_C, MY_ROOT, "C", "MY_ROOT.C"); + new ElementTest(NODE_D, MY_ROOT, "D", "MY_ROOT.D"); + new ElementTest(Value_A, MY_ROOT, "A", "MY_ROOT.A"); + new ElementTest(Value_B, MY_ROOT, "B", "MY_ROOT.B"); + new ElementTest(Value_E, NODE_C, "E", "MY_ROOT.C.E"); + new ElementTest(Value_F, NODE_D, "F", "MY_ROOT.D.F"); + new ElementTest(Value_G, NODE_D, "G", "MY_ROOT.D.G"); + return TestDataProvider.getTests(ElementTest.class); + } + + @Test(enabled = true, dataProvider = "elementdata") + public void testElementMethods(ElementTest test) { + Assert.assertNotNull(test.elt.getName()); + Assert.assertNotNull(test.elt.getParent()); + Assert.assertEquals(test.elt.getName(), test.name); + Assert.assertEquals(test.elt.getParent(), test.parent); + Assert.assertEquals(test.elt.fullyQualifiedName(), test.fullName); + } + + // -------------------------------------------------------------------------------- + // + // DiffValue testing routines + // + // -------------------------------------------------------------------------------- + + private class LeafTest extends TestDataProvider { + public DiffValue diffvalue; + public Object value; + + private LeafTest(DiffValue diffvalue, Object value) { + super(LeafTest.class); + this.diffvalue = diffvalue; + this.value = value; + } + + public String toString() { + return String.format("LeafTest diffvalue=%s value=%s", diffvalue.toOneLineString(), value); + } + } + + @DataProvider(name = "leafdata") + public Object[][] createLeafData() { + new LeafTest(Value_A, "A"); + new LeafTest(Value_B, "B"); + new LeafTest(Value_E, "E"); + new LeafTest(Value_F, "F"); + 
new LeafTest(Value_G, "G"); + return TestDataProvider.getTests(LeafTest.class); + } + + @Test(enabled = true, dataProvider = "leafdata") + public void testLeafMethods(LeafTest test) { + Assert.assertNotNull(test.diffvalue.getValue()); + Assert.assertEquals(test.diffvalue.getValue(), test.value); + } + + // -------------------------------------------------------------------------------- + // + // Node testing routines + // + // -------------------------------------------------------------------------------- + + private class NodeTest extends TestDataProvider { + public DiffNode node; + public Set fields; + public Set subnodes; + public Set allNames; + + private NodeTest(DiffNode node, List fields, List subnodes) { + super(NodeTest.class); + this.node = node; + this.fields = new HashSet(fields); + this.subnodes = new HashSet(subnodes); + this.allNames = new HashSet(fields); + allNames.addAll(subnodes); + } + + public String toString() { + return String.format("NodeTest node=%s fields=%s subnodes=%s", + node.toOneLineString(), fields, subnodes); + } + } + + @DataProvider(name = "nodedata") + public Object[][] createData1() { + new NodeTest(MY_ROOT, Arrays.asList("A", "B"), Arrays.asList("C", "D")); + new NodeTest(NODE_C, Arrays.asList("E"), Collections.emptyList()); + new NodeTest(NODE_D, Arrays.asList("F", "G"), Collections.emptyList()); + return TestDataProvider.getTests(NodeTest.class); + } + + @Test(enabled = true, dataProvider = "nodedata") + public void testNodeAccessors(NodeTest test) { + Assert.assertNotNull(test.node.getElements()); + + for ( String name : test.allNames ) { + DiffElement elt = test.node.getElement(name); + Assert.assertNotNull(elt, "Failed to find field " + elt + " in " + test.node); + Assert.assertEquals(elt.getName(), name); + Assert.assertEquals(elt.getValue().isAtomic(), test.fields.contains(name), "Failed atomic/compound expectation: " + test.node); + } + } + + // NOTE: add routines are being implicitly tested by the creation of the data 
structures + + @Test(enabled = true, dataProvider = "nodedata") + public void testCounts(NodeTest test) { + Assert.assertEquals(test.node.getElements().size(), test.allNames.size()); + Assert.assertEquals(test.node.getElementNames(), test.allNames); + } + + // -------------------------------------------------------------------------------- + // + // fromString testing routines + // + // -------------------------------------------------------------------------------- + + private class FromStringTest extends TestDataProvider { + public String string; + public DiffElement expected; + + private FromStringTest(String string, DiffElement expected) { + super(FromStringTest.class); + this.string = string; + this.expected = expected; + } + + public String toString() { + return String.format("FromStringTest string=%s expected=%s", string, expected.toOneLineString()); + } + } + + @DataProvider(name = "fromstringdata") + public Object[][] createFromData() { + new FromStringTest("A=A", Value_A.getBinding()); + new FromStringTest("B=B", Value_B.getBinding()); + new FromStringTest("C=(E=E)", NODE_C.getBinding()); + new FromStringTest("D=(F=F G=G)", NODE_D.getBinding()); + return TestDataProvider.getTests(FromStringTest.class); + } + + @Test(enabled = true, dataProvider = "fromstringdata") + public void parseFromString(FromStringTest test) { + logger.warn("Testing from string: " + test.string); + DiffElement elt = DiffNode.fromString(test.string); + Assert.assertEquals(elt.toOneLineString(), test.expected.toOneLineString()); + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffableReaderUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffableReaderUnitTest.java new file mode 100644 index 000000000..e20ba1625 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DiffableReaderUnitTest.java @@ 
-0,0 +1,173 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. 
Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + + +// the imports for unit testing. + + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.diffengine.DiffElement; +import org.broadinstitute.gatk.utils.diffengine.DiffEngine; +import org.broadinstitute.gatk.utils.diffengine.DiffNode; +import org.broadinstitute.gatk.utils.diffengine.DiffableReader; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.variantcontext.Allele; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DiffableReaderUnitTest extends BaseTest { + DiffEngine engine; + + File vcfFile = new File(privateTestDir + "diffTestMaster.vcf"); + File bamFile = new File(publicTestDir + "exampleBAM.bam"); + + @BeforeClass(enabled = true) + public void createDiffEngine() { + engine = new DiffEngine(); + } + + @Test(enabled = true) + public void testPluggableDiffableReaders() { + logger.warn("testPluggableDiffableReaders"); + Map readers = engine.getReaders(); + Assert.assertNotNull(readers); + Assert.assertTrue(readers.size() > 0); + Assert.assertNotNull(readers.get("VCF")); + for ( 
Map.Entry e : engine.getReaders().entrySet() ) { + logger.warn("Found diffable reader: " + e.getKey()); + Assert.assertEquals(e.getValue().getName(), e.getKey()); + Assert.assertEquals(e.getValue(), engine.getReader(e.getKey())); + } + } + + private static void testLeaf(DiffNode rec, String field, Object expected) { + DiffElement value = rec.getElement(field); + Assert.assertNotNull(value, "Expected to see leaf named " + field + " in rec " + rec); + Assert.assertEquals(value.getValue().getValue(), expected, "Expected to see leaf named " + field + " to have value " + expected + " in rec " + rec + " but got instead " + value.getValue().getValue()); + } + + @Test(enabled = true, dependsOnMethods = "testPluggableDiffableReaders") + public void testVCF1() { + logger.warn("testVCF1"); + DiffableReader vcfReader = engine.getReader("VCF"); + Assert.assertTrue(vcfReader.canRead(vcfFile)); + Assert.assertFalse(vcfReader.canRead(bamFile)); + + DiffElement diff = vcfReader.readFromFile(vcfFile, -1); + Assert.assertNotNull(diff); + + Assert.assertEquals(diff.getName(), vcfFile.getName()); + Assert.assertSame(diff.getParent(), DiffElement.ROOT); + + DiffNode node = diff.getValueAsNode(); + Assert.assertEquals(node.getElements().size(), 11); + + // chr1 2646 rs62635284 G A 0.15 PASS AC=2;AF=1.00;AN=2 GT:AD:DP:GL:GQ 1/1:53,75:3:-12.40,-0.90,-0.00:9.03 + DiffNode rec1 = node.getElement("chr1:2646").getValueAsNode(); + testLeaf(rec1, "CHROM", "chr1"); + testLeaf(rec1, "POS", 2646); + testLeaf(rec1, "ID", "rs62635284"); + testLeaf(rec1, "REF", Allele.create("G", true)); + testLeaf(rec1, "ALT", Arrays.asList(Allele.create("A"))); + testLeaf(rec1, "QUAL", 0.15); + testLeaf(rec1, "FILTER", VCFConstants.PASSES_FILTERS_v4); + testLeaf(rec1, "AC", "2"); + testLeaf(rec1, "AF", "1.00"); + testLeaf(rec1, "AN", "2"); + } + + @Test(enabled = true, dependsOnMethods = "testPluggableDiffableReaders") + public void testBAM() { + logger.warn("testBAM"); + DiffableReader bamReader = 
engine.getReader("BAM"); + Assert.assertTrue(bamReader.canRead(bamFile)); + Assert.assertFalse(bamReader.canRead(vcfFile)); + + DiffElement diff = bamReader.readFromFile(bamFile, -1); + Assert.assertNotNull(diff); + + Assert.assertEquals(diff.getName(), bamFile.getName()); + Assert.assertSame(diff.getParent(), DiffElement.ROOT); + + DiffNode node = diff.getValueAsNode(); + Assert.assertEquals(node.getElements().size(), 33); + + // 30PPJAAXX090125:1:42:512:1817#0 99 chr1 200 0 76M = + // 255 -130 ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC + // BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3: + // PG:Z:0 RG:Z:exampleBAM.bam SM:Z:exampleBAM.bam + + DiffNode rec1 = node.getElement("30PPJAAXX090125:1:42:512:1817#0_1").getValueAsNode(); + testLeaf(rec1, "NAME", "30PPJAAXX090125:1:42:512:1817#0"); + testLeaf(rec1, "FLAGS", 99); + testLeaf(rec1, "RNAME", "chr1"); + testLeaf(rec1, "POS", 200); + testLeaf(rec1, "MAPQ", 0); + testLeaf(rec1, "CIGAR", "76M"); + testLeaf(rec1, "RNEXT", "chr1"); + testLeaf(rec1, "PNEXT", 255); + testLeaf(rec1, "TLEN", -130); + testLeaf(rec1, "SEQ", "ACCCTAACCCTAACCCTAACCCTAACCATAACCCTAAGACTAACCCTAAACCTAACCCTCATAATCGAAATACAAC"); + testLeaf(rec1, "QUAL", "BBBBC@C?AABCBB<63>=B@>+B9-9+)2B8,+@327B5A>90((>-+''3?(/'''A)(''19('7.,**%)3:"); + testLeaf(rec1, "PG", "0"); + testLeaf(rec1, "RG", "exampleBAM.bam"); + testLeaf(rec1, "SM", "exampleBAM.bam"); + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DifferenceUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DifferenceUnitTest.java new file mode 100644 index 000000000..cee923476 --- /dev/null +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/diffengine/DifferenceUnitTest.java @@ -0,0 +1,118 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD 
INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. 
For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. 
Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2014 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + + +// the imports for unit testing. 
+ + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.diffengine.DiffElement; +import org.broadinstitute.gatk.utils.diffengine.DiffNode; +import org.broadinstitute.gatk.utils.diffengine.Difference; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +/** + * Basic unit test for DifferableReaders in reduced reads + */ +public class DifferenceUnitTest extends BaseTest { + // -------------------------------------------------------------------------------- + // + // testing routines + // + // -------------------------------------------------------------------------------- + + private class DifferenceTest extends TestDataProvider { + public DiffElement tree1, tree2; + public String difference; + + private DifferenceTest(String tree1, String tree2, String difference) { + this(DiffNode.fromString(tree1), DiffNode.fromString(tree2), difference); + } + + private DifferenceTest(DiffElement tree1, DiffElement tree2, String difference) { + super(DifferenceTest.class); + this.tree1 = tree1; + this.tree2 = tree2; + this.difference = difference; + } + + public String toString() { + return String.format("tree1=%s tree2=%s diff=%s", + tree1 == null ? "null" : tree1.toOneLineString(), + tree2 == null ? "null" : tree2.toOneLineString(), + difference); + } + } + + @DataProvider(name = "data") + public Object[][] createTrees() { + new DifferenceTest("A=X", "A=Y", "A:1:X!=Y"); + new DifferenceTest("A=Y", "A=X", "A:1:Y!=X"); + new DifferenceTest(DiffNode.fromString("A=X"), null, "A:1:X!=MISSING"); + new DifferenceTest(null, DiffNode.fromString("A=X"), "A:1:MISSING!=X"); + return DifferenceTest.getTests(DifferenceTest.class); + } + + @Test(enabled = true, dataProvider = "data") + public void testDiffToString(DifferenceTest test) { + logger.warn("Test tree1: " + (test.tree1 == null ? "null" : test.tree1.toOneLineString())); + logger.warn("Test tree2: " + (test.tree2 == null ? 
"null" : test.tree2.toOneLineString())); + logger.warn("Test expected diff : " + test.difference); + Difference diff = new Difference(test.tree1, test.tree2); + logger.warn("Observed diffs : " + diff); + Assert.assertEquals(diff.toString(), test.difference, "Observed diff string " + diff + " not equal to expected difference string " + test.difference ); + + } +} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java deleted file mode 100644 index 619e96654..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoodsUnitTest.java +++ /dev/null @@ -1,857 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.genotyper; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.tools.walkers.genotyper.*; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import org.testng.Assert; -import org.testng.SkipException; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * Test code for {@link ReadLikelihoods} - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class ReadLikelihoodsUnitTest -{ - private static final double EPSILON = 1e-6; - private static final int ODD_READ_START = 101; - private static final int EVEN_READ_START = 1; 
- - @Test(dataProvider = "dataSets") - public void testInstantiationAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - - Assert.assertEquals(result.sampleCount(), samples.length); - Assert.assertEquals(result.alleleCount(), alleles.length); - - - testSampleQueries(samples, reads, result); - testAlleleQueries(alleles, result); - testLikelihoodMatrixQueries(samples, result, null); - } - - @Test(dataProvider = "dataSets") - public void testLikelihoodFillingAndQuery(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods result = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] likelihoods = fillWithRandomLikelihoods(samples, alleles, result); - testLikelihoodMatrixQueries(samples, result, likelihoods); - } - - private double[][][] fillWithRandomLikelihoods(final String[] samples, final Allele[] alleles, final ReadLikelihoods result) { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - final double[][][] likelihoods = new double[samples.length][alleles.length][]; - for (int s = 0; s < likelihoods.length; s++) { - final ReadLikelihoods.Matrix sampleLikelihoods = result.sampleMatrix(s); - for (int a = 0; a < likelihoods[s].length; a++) { - likelihoods[s][a] = new double[result.sampleReadCount(s)]; - for (int r = 0; r < likelihoods[s][a].length; r++) - sampleLikelihoods.set(a,r,likelihoods[s][a][r] = -Math.abs(rnd.nextGaussian())); - } - } - return likelihoods; - } - - @Test(dataProvider = "dataSets") - public void testBestAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - fillWithRandomLikelihoods(samples,alleles,original); - final int 
alleleCount = alleles.length; - for (int s = 0; s < samples.length; s++) { - final int sampleReadCount = original.sampleReadCount(s); - final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); - final double[] bestLkArray = new double[sampleReadCount]; - final int[] bestIndexArray = new int[sampleReadCount]; - final double[] confidenceArray = new double[sampleReadCount]; - for (int r = 0; r < sampleReadCount; r++) { - int bestAlleleIndex = -1; - double bestAlleleLk = Double.NEGATIVE_INFINITY; - double secondBestAlleleLk = Double.NEGATIVE_INFINITY; - for (int a = 0; a < alleleCount; a++) { - final double lk = sampleMatrix.get(a,r); - if (lk > bestAlleleLk) { - secondBestAlleleLk = bestAlleleLk; - bestAlleleLk = lk; - bestAlleleIndex = a; - } else if (lk > secondBestAlleleLk) { - secondBestAlleleLk = lk; - } - } - bestLkArray[r] = bestAlleleLk; - confidenceArray[r] = bestAlleleLk - secondBestAlleleLk; - bestIndexArray[r] = bestAlleleIndex; - } - final Collection.BestAllele> bestAlleles = original.bestAlleles(); - for (final ReadLikelihoods.BestAllele bestAllele : bestAlleles) { - final int readIndex = original.readIndex(s,bestAllele.read); - if (readIndex == -1) continue; - Assert.assertEquals(bestLkArray[readIndex],bestAllele.likelihood); - Assert.assertEquals(bestAllele.allele,alleles[bestIndexArray[readIndex]]); - Assert.assertEquals(bestAllele.confidence,confidenceArray[readIndex],EPSILON); - } - } - } - - @Test(dataProvider = "dataSets") - public void testBestAlleleMap(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - fillWithRandomLikelihoods(samples,alleles,original); - final Map> expected = new HashMap<>(alleles.length); - for (final Allele allele : alleles) - expected.put(allele,new ArrayList()); - - final int alleleCount = alleles.length; - for (int s = 0; s < samples.length; s++) { - final int 
sampleReadCount = original.sampleReadCount(s); - final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); - for (int r = 0; r < sampleReadCount; r++) { - int bestAlleleIndex = -1; - double bestAlleleLk = Double.NEGATIVE_INFINITY; - double secondBestAlleleLk = Double.NEGATIVE_INFINITY; - for (int a = 0; a < alleleCount; a++) { - final double lk = sampleMatrix.get(a,r); - if (lk > bestAlleleLk) { - secondBestAlleleLk = bestAlleleLk; - bestAlleleLk = lk; - bestAlleleIndex = a; - } else if (lk > secondBestAlleleLk) { - secondBestAlleleLk = lk; - } - } - if ((bestAlleleLk - secondBestAlleleLk) > ReadLikelihoods.BestAllele.INFORMATIVE_THRESHOLD) - expected.get(alleles[bestAlleleIndex]).add(sampleMatrix.readAt(r)); - } - } - - final Map> actual = original.readsByBestAlleleMap(); - - Assert.assertEquals(actual.size(),alleles.length); - for (final Allele allele : alleles) { - final List expectedList = expected.get(allele); - final List actualList = actual.get(allele); - final Set expectedSet = new HashSet<>(expectedList); - final Set actualSet = new HashSet<>(actualList); - Assert.assertEquals(actualSet,expectedSet); - } - } - - @Test(dataProvider = "dataSets") - public void testFilterPoorlyModeledReads(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - - for (int s = 0; s < samples.length; s++) { - final int sampleReadCount = original.sampleReadCount(s); - for (int r = 0; r < sampleReadCount; r++) { - if ((r & 1) == 0) continue; - for (int a = 0; a < alleles.length; a++) - original.sampleMatrix(s).set(a,r,-10000); - } - } - - final ReadLikelihoods result = original.clone(); - result.filterPoorlyModeledReads(2.0); - - for (int s = 0; s < samples.length; s++) { - final int oldSampleReadCount = original.sampleReadCount(s); - final int newSampleReadCount = result.sampleReadCount(s); - 
Assert.assertEquals(newSampleReadCount,(oldSampleReadCount + 1) / 2); - final ReadLikelihoods.Matrix newSampleMatrix = result.sampleMatrix(s); - final ReadLikelihoods.Matrix oldSampleMatrix = original.sampleMatrix(s); - for (int r = 0 ; r < newSampleReadCount; r++) { - Assert.assertEquals(original.readIndex(s, result.sampleReads(s).get(r)), r * 2); - for (int a = 0; a < alleles.length; a++) { - Assert.assertEquals(newSampleMatrix.get(a,r),oldSampleMatrix.get(a,r*2)); - } - } - } - } - - @Test(dataProvider = "dataSets") - public void testFilterReadsToOverlap(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); - fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result = original.clone(); - result.filterToOnlyOverlappingUnclippedReads(evenReadOverlap); - final double[][][] newLikelihoods = new double[samples.length][alleles.length][]; - for (int s = 0; s < samples.length ; s++) - for (int a = 0; a < alleles.length; a++) { - newLikelihoods[s][a] = new double[(original.sampleReadCount(s) + 1) / 2]; - final ReadLikelihoods.Matrix sampleMatrix = original.sampleMatrix(s); - for (int r = 0; r < newLikelihoods[s][a].length; r++) { - Assert.assertEquals(result.readIndex(s,sampleMatrix.readAt(r << 1)),r); - newLikelihoods[s][a][r] = sampleMatrix.get(a, r << 1); - } - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - @Test(dataProvider = "marginalizationDataSets") - public void testMarginalizationWithOverlap(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new 
IndexedAlleleList<>(alleles), reads); - final GenomeLoc evenReadOverlap = locParser.createGenomeLoc(SAM_HEADER.getSequenceDictionary().getSequences().get(0).getSequenceName(),EVEN_READ_START ,EVEN_READ_START ); - fillWithRandomLikelihoods(samples, alleles, original); - final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping,evenReadOverlap); - Assert.assertNotNull(marginalized); - Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); - for (int a = 0; a < marginalized.alleleCount(); a++) { - final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a)); - Assert.assertNotNull(oldAlleles); - for (int s = 0; s < samples.length; s++) { - final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); - final ReadLikelihoods.Matrix sampleLikelihoods = marginalized.sampleMatrix(s); - final int sampleReadCount = sampleLikelihoods.readCount(); - final int oldSampleReadCount = oldSmapleLikelihoods.readCount(); - Assert.assertEquals(sampleReadCount,(oldSampleReadCount + 1) / 2); - for (int r = 0; r < sampleReadCount; r++) { - double oldBestLk = Double.NEGATIVE_INFINITY; - for (final Allele oldAllele : oldAlleles) { - oldBestLk = Math.max(oldSmapleLikelihoods.get(original.alleleIndex(oldAllele),r << 1), oldBestLk); - } - Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk); - } - } - } - } - - @Test(dataProvider = "marginalizationDataSets") - public void testMarginalization(final String[] samples, final Allele[] alleles, final Map> reads, final Map> newToOldAlleleMapping) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - fillWithRandomLikelihoods(samples, alleles, original); - final ReadLikelihoods marginalized = original.marginalize(newToOldAlleleMapping); - Assert.assertNotNull(marginalized); - Assert.assertEquals(newToOldAlleleMapping.size(),marginalized.alleleCount()); - for (int a = 0; a < 
marginalized.alleleCount(); a++) { - final List oldAlleles = newToOldAlleleMapping.get(marginalized.alleleAt(a)); - Assert.assertNotNull(oldAlleles); - for (int s = 0; s < samples.length; s++) { - final ReadLikelihoods.Matrix oldSmapleLikelihoods = original.sampleMatrix(s); - final ReadLikelihoods.Matrix sampleLikelihoods = marginalized.sampleMatrix(s); - final int sampleReadCount = sampleLikelihoods.readCount(); - final int oldSampleReadCount = oldSmapleLikelihoods.readCount(); - Assert.assertEquals(oldSampleReadCount,sampleReadCount); - for (int r = 0; r < sampleReadCount; r++) { - double oldBestLk = Double.NEGATIVE_INFINITY; - for (final Allele oldAllele : oldAlleles) { - oldBestLk = Math.max(oldSmapleLikelihoods.get(original.alleleIndex(oldAllele),r), oldBestLk); - } - Assert.assertEquals(sampleLikelihoods.get(a,r),oldBestLk); - } - } - } - } - - @Test(dataProvider = "dataSets") - public void testNormalizeBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result= original.clone(); - result.normalizeLikelihoods(true, Double.NEGATIVE_INFINITY); - testAlleleQueries(alleles,result); - final int alleleCount = alleles.length; - final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; - for (int s = 0; s < samples.length; s++) { - final int sampleReadCount = original.sampleReadCount(s); - for (int a = 0; a < alleleCount; a++) - newLikelihoods[s][a] = new double[sampleReadCount]; - for (int r = 0; r < sampleReadCount; r++) { - double bestLk = originalLikelihoods[s][0][r]; - for (int a = 1; a < alleleCount; a++) { - bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]); - } - for (int a = 0; a < alleleCount; a++) { - newLikelihoods[s][a][r] = 
originalLikelihoods[s][a][r] - bestLk; - } - } - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - @Test(dataProvider = "dataSets") - public void testNormalizeCapWorstLK(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result= original.clone(); - result.normalizeLikelihoods(false, - 0.001); - testAlleleQueries(alleles,result); - final int alleleCount = alleles.length; - final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; - for (int s = 0; s < samples.length; s++) { - final int sampleReadCount = original.sampleReadCount(s); - for (int a = 0; a < alleleCount; a++) - newLikelihoods[s][a] = new double[sampleReadCount]; - for (int r = 0; r < sampleReadCount; r++) { - double bestAltLk = Double.NEGATIVE_INFINITY; - for (int a = 0; a < alleleCount; a++) { - if (alleles[a].isReference()) - continue; - bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]); - } - if (bestAltLk == Double.NEGATIVE_INFINITY) - for (int a = 0; a < alleleCount; a++) { - newLikelihoods[s][a][r] = originalLikelihoods[s][a][r]; - } - else - for (int a = 0; a < alleleCount; a++) { - newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001); - } - } - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - @Test(dataProvider = "dataSets") - public void testNormalizeCapWorstLKAndBestToZero(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result= original.clone(); 
- result.normalizeLikelihoods(true, - 0.001); - testAlleleQueries(alleles,result); - final int alleleCount = alleles.length; - final double[][][] newLikelihoods = new double[originalLikelihoods.length][alleles.length][]; - for (int s = 0; s < samples.length; s++) { - final int sampleReadCount = original.sampleReadCount(s); - for (int a = 0; a < alleleCount; a++) - newLikelihoods[s][a] = new double[sampleReadCount]; - for (int r = 0; r < sampleReadCount; r++) { - double bestAltLk = Double.NEGATIVE_INFINITY; - double bestLk = Double.NEGATIVE_INFINITY; - for (int a = 0; a < alleleCount; a++) { - bestLk = Math.max(bestLk,originalLikelihoods[s][a][r]); - if (alleles[a].isReference()) - continue; - bestAltLk = Math.max(bestAltLk,originalLikelihoods[s][a][r]); - } - if (bestAltLk == Double.NEGATIVE_INFINITY) - for (int a = 0; a < alleleCount; a++) { - newLikelihoods[s][a][r] = originalLikelihoods[s][a][r] - bestLk; - } - else - for (int a = 0; a < alleleCount; a++) { - newLikelihoods[s][a][r] = Math.max(originalLikelihoods[s][a][r],bestAltLk - 0.001) - bestLk; - } - } - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - - @Test(dataProvider = "dataSets") - public void testAddMissingAlleles(final String[] samples, final Allele[] alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result = original.clone(); - - // If all the alleles pass are present in the read-likelihoods collection there is no change. - result.addMissingAlleles(result.alleles(),Double.NEGATIVE_INFINITY); - testLikelihoodMatrixQueries(samples,result,originalLikelihoods); - - // If the allele list passed is empty there is no effect. 
- result.addMissingAlleles(Collections.EMPTY_LIST,Double.NEGATIVE_INFINITY); - testLikelihoodMatrixQueries(samples,result,originalLikelihoods); - - final Allele newOne; - final Allele newTwo; - final Allele newThree; - - // We add a single missing. - result.addMissingAlleles(Arrays.asList(newOne = Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-12345.6); - Assert.assertEquals(result.alleleCount(), original.alleleCount() + 1); - - // We add too more amongst exisisting alleles: - result.addMissingAlleles(Arrays.asList(newTwo = Allele.create("ATATATTATATTAATATT".getBytes(), false),result.alleleAt(1), - result.alleleAt(0),newThree = Allele.create("TGTGTGTATTG".getBytes(),false),Allele.create("ACCCCCAAAATTTAAAGGG".getBytes(),false)),-6.54321); - - Assert.assertEquals(original.alleleCount()+3,result.alleleCount()); - - final List expectedAlleles = new ArrayList<>(original.alleles()); - expectedAlleles.add(newOne); expectedAlleles.add(newTwo); expectedAlleles.add(newThree); - - Assert.assertEquals(result.alleles(),expectedAlleles); - - final double[][][] newLikelihoods = new double[originalLikelihoods.length][][]; - for (int s = 0; s < samples.length; s++) { - newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 3); - final int sampleReadCount = original.sampleReadCount(s); - final int originalAlleleCount = originalLikelihoods[s].length; - newLikelihoods[s][originalAlleleCount] = new double[sampleReadCount]; - Arrays.fill(newLikelihoods[s][originalAlleleCount],-12345.6); - newLikelihoods[s][originalAlleleCount+1] = new double[sampleReadCount]; - Arrays.fill(newLikelihoods[s][originalAlleleCount+1],-6.54321); - newLikelihoods[s][originalAlleleCount+2] = new double[sampleReadCount]; - Arrays.fill(newLikelihoods[s][originalAlleleCount+2],-6.54321); - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - - @Test(dataProvider = "dataSets") - public void testAddNonRefAllele(final String[] samples, final Allele[] 
alleles, final Map> reads) { - final ReadLikelihoods original = new ReadLikelihoods<>(new IndexedSampleList(samples), new IndexedAlleleList<>(alleles), reads); - final double[][][] originalLikelihoods = fillWithRandomLikelihoods(samples,alleles,original); - final ReadLikelihoods result = original.clone(); - result.addNonReferenceAllele(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - Assert.assertEquals(result.alleleCount(),original.alleleCount() + 1); - Assert.assertEquals(result.alleleIndex(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE),result.alleleCount() - 1); - final double[][][] newLikelihoods = new double[originalLikelihoods.length][][]; - for (int s = 0; s < samples.length; s++) { - newLikelihoods[s] = Arrays.copyOf(originalLikelihoods[s],originalLikelihoods[s].length + 1); - final int sampleReadCount = original.sampleReadCount(s); - final int ordinaryAlleleCount = originalLikelihoods[s].length; - newLikelihoods[s][ordinaryAlleleCount] = new double[sampleReadCount]; - for (int r = 0; r < sampleReadCount; r++) { - double bestLk = newLikelihoods[s][0][r]; - double secondBestLk = Double.NEGATIVE_INFINITY; - for (int a = 1; a < ordinaryAlleleCount; a++) { - final double lk = originalLikelihoods[s][a][r]; - if (lk > bestLk) { - secondBestLk = bestLk; - bestLk = lk; - } else if (lk > secondBestLk) { - secondBestLk = lk; - } - } - final double expectedNonRefLk = Double.isInfinite(secondBestLk) ? 
bestLk : secondBestLk; - newLikelihoods[s][ordinaryAlleleCount][r] = expectedNonRefLk; - } - } - testLikelihoodMatrixQueries(samples,result,newLikelihoods); - } - - private void testLikelihoodMatrixQueries(String[] samples, ReadLikelihoods result, final double[][][] likelihoods) { - for (final String sample : samples) { - final int sampleIndex = result.sampleIndex(sample); - final int sampleReadCount = result.sampleReadCount(sampleIndex); - final int alleleCount = result.alleleCount(); - Assert.assertEquals(result.alleleCount(), alleleCount); - for (int a = 0; a < alleleCount; a++) { - Assert.assertEquals(result.sampleReadCount(sampleIndex),sampleReadCount); - for (int r = 0; r < sampleReadCount; r++) - Assert.assertEquals(result.sampleMatrix(sampleIndex).get(a,r), - likelihoods == null ? 0.0 : likelihoods[sampleIndex][a][r], EPSILON); - } - } - } - - private void testAlleleQueries(Allele[] alleles, ReadLikelihoods result) { - final Set alleleIndices = new HashSet<>(); - for (final Allele allele : alleles) { - final int alleleIndex = result.alleleIndex(allele); - Assert.assertTrue(alleleIndex >= 0); - Assert.assertFalse(alleleIndices.contains(alleleIndex)); - alleleIndices.add(alleleIndex); - Assert.assertSame(allele,alleles[alleleIndex]); - } - } - - private void testSampleQueries(String[] samples, Map> reads, ReadLikelihoods result) { - final Set sampleIds = new HashSet<>(samples.length); - for (final String sample : samples) { - final int sampleIndex = result.sampleIndex(sample); - Assert.assertTrue(sampleIndex >= 0); - Assert.assertFalse(sampleIds.contains(sampleIndex)); - sampleIds.add(sampleIndex); - - final List sampleReads = result.sampleReads(sampleIndex); - final Set sampleReadsSet = new HashSet<>(sampleReads); - final List expectedSampleReadArray = reads.get(sample); - final Set expectedSampleReadsSet = new HashSet<>(expectedSampleReadArray); - Assert.assertEquals(sampleReadsSet,expectedSampleReadsSet); - - final int sampleReadCount = sampleReads.size(); 
- for (int r = 0; r < sampleReadCount; r++) { - Assert.assertSame(sampleReads.get(r), expectedSampleReadArray.get(r)); - final int readIndex = result.readIndex(sampleIndex, sampleReads.get(r)); - Assert.assertEquals(readIndex,r); - } - } - } - - private String[][] SAMPLE_SETS = new String[][] { - {"A","B","C"}, - {"A"}, - {"C","A","D","E","Salsa","Gazpacho"}, - }; - - private Allele[][] ALLELE_SETS = new Allele[][] { - {Allele.create("A",true), Allele.create("T"), Allele.create("C")}, - {Allele.create("A",true)}, - {Allele.create("ATTTA"), Allele.create("A",true)}, - {Allele.create("A"), Allele.create("AT",true)}, - {Allele.create("A",false), Allele.create("AT",false)}, - }; - - @DataProvider(name="marginalizationDataSets") - public Object[][] marginalizationDataSets() { - try { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length * ALLELE_SETS.length][]; - int nextIndex = 0; - for (int s = 0; s < SAMPLE_SETS.length; s++) { - for (int a = 0; a < ALLELE_SETS.length; a++) { - for (int b = 0; b < ALLELE_SETS.length; b++) { - if (ALLELE_SETS[b].length < ALLELE_SETS[a].length) - result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a], - dataSetReads(SAMPLE_SETS[s], rnd), randomAlleleMap(ALLELE_SETS[a], ALLELE_SETS[b]) - }; - } - } - } - return Arrays.copyOf(result,nextIndex); - }catch (final Throwable e) { - throw new RuntimeException(e); - } - } - - private Map> randomAlleleMap(final Allele[] fromAlleles, final Allele[] toAlleles) { - final Map> result = new HashMap<>(toAlleles.length); - for (final Allele toAllele : toAlleles ) - result.put(toAllele,new ArrayList(fromAlleles.length)); - final ArrayList remaining = new ArrayList<>(Arrays.asList(fromAlleles)); - int nextToIndex = 0; - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - for (int i = 0; i < fromAlleles.length; i++) { - final int fromAlleleIndex = rnd.nextInt(remaining.size()); - 
result.get(toAlleles[nextToIndex]).add(remaining.remove(fromAlleleIndex)); - nextToIndex = (nextToIndex + 1) % toAlleles.length; - } - return result; - } - - - @DataProvider(name="dataSets") - public Object[][] dataSets() { - try { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - final Object[][] result = new Object[SAMPLE_SETS.length * ALLELE_SETS.length][]; - int nextIndex = 0; - for (int s = 0; s < SAMPLE_SETS.length; s++) - for (int a = 0; a < ALLELE_SETS.length; a++) { - result[nextIndex++] = new Object[]{SAMPLE_SETS[s], ALLELE_SETS[a], - dataSetReads(SAMPLE_SETS[s], rnd) - }; - } - return result; - }catch (final Throwable e) { - throw new RuntimeException(e); - } - } - - private Map> dataSetReads(final String[] samples, - final Random rnd) { - final Map> result = new HashMap<>(samples.length); - for (final String sample : samples) { - final int readCount = rnd.nextInt(100); - final List reads = new ArrayList<>(readCount); - for (int r = 0; r < readCount; r++) { - final int alignmentStart = (r & 1) == 0 ? 
EVEN_READ_START : ODD_READ_START; - reads.add(ArtificialSAMUtils.createArtificialRead(SAM_HEADER, - "RRR" + sample + "00" + r, 0, alignmentStart ,"AAAAA".getBytes(), new byte[] {30,30,30,30,30}, "5M")); - } - result.put(sample,reads); - } - return result; - } - - @Test(dataProvider="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") - public void testInstantiationAndBasicQueries(final int[] readCounts, final int alleleCount, final boolean hasReference) { - final SampleList sampleList = sampleList(readCounts); - - final AlleleList alleleList = alleleList(alleleCount,hasReference); - final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList, readCounts); - final ReadLikelihoods subject = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); - - AlleleListUnitTester.assertAlleleList(subject, AlleleListUtils.asList(alleleList)); - SampleListUnitTester.assertSampleList(subject,SampleListUtils.asList(sampleList)); - - if (hasReference) { - final int referenceIndex = AlleleListUtils.indexOfReference(alleleList); - Assert.assertTrue(referenceIndex >= 0); - Assert.assertEquals(AlleleListUtils.indexOfReference(alleleList),referenceIndex); - } else { - Assert.assertEquals(AlleleListUtils.indexOfReference(subject), -1); - } - - testLikelihoodMatrixQueries(alleleList, sampleList, sampleToReads, subject); - testAlleleQueries(alleleList, subject); - testSampleQueries(sampleList, sampleToReads, subject); - } - - @Test(dataProvider="readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference") - public void testLikelihoodWriting(final int[] readCounts, final int alleleCount, final boolean hasReference) { - final SampleList sampleList = sampleList(readCounts); - - final AlleleList alleleList = alleleList(alleleCount,hasReference); - final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList,readCounts); - final ReadLikelihoods subject = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); - - final int 
sampleCount = readCounts.length; - int totalLikelihoodsSet = 0; - int expectedLikelihoodsSet = 0; - for (int s = 0; s < sampleCount; s++) { - expectedLikelihoodsSet += readCounts[s] * alleleCount; - final ReadLikelihoods.Matrix matrix = subject.sampleMatrix(s); - final int readCount = matrix.readCount(); - for (int a = 0; a < alleleCount; a++) - for (int r = 0; r < readCount; r++) { - final double likelihood = testLikelihood(s, a, r); - Assert.assertNotEquals(likelihood,0); //Paranoia - totalLikelihoodsSet++; - matrix.set(a,r,likelihood); - Assert.assertEquals(matrix.get(a, r),likelihood); - } - - } - Assert.assertEquals(totalLikelihoodsSet,expectedLikelihoodsSet); - } - - @Test(dependsOnMethods={"testLikelihoodWriting","testInstantiationAndBasicQueries"}, - dataProvider="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") - public void testMapConversion(final int[] readCounts, final int alleleCount, final boolean hasReference) { - final SampleList sampleList = sampleList(readCounts); - - final AlleleList alleleList = alleleList(alleleCount,hasReference); - final Map> sampleToReads = ReadLikelihoodsUnitTester.sampleToReads(sampleList,readCounts); - - final Set alleleWithLikelihoodsSet = new HashSet<>(); - final Set readsWithLikelihoodsSet = new HashSet<>(); - final Map map = new HashMap<>(sampleList.sampleCount()); - final int sampleCount = sampleList.sampleCount(); - for (int s = 0; s < sampleCount; s++) { - final String sample = sampleList.sampleAt(s); - final PerReadAlleleLikelihoodMap perSampleMap = new PerReadAlleleLikelihoodMap(); - final List reads = sampleToReads.get(sample); - for (int a = 0; a < alleleCount; a++) - for (int r = 0; r < reads.size(); r++) { - perSampleMap.add(reads.get(r), alleleList.alleleAt(a), testLikelihood(s, a, r)); - alleleWithLikelihoodsSet.add(alleleList.alleleAt(a)); - readsWithLikelihoodsSet.add(reads.get(r)); - } - map.put(sample,perSampleMap); - - } - - ReadLikelihoods subject = 
ReadLikelihoods.fromPerAlleleReadLikelihoodsMap(map); - - for (int s = 0; s < sampleCount; s++) { - final String sample = sampleList.sampleAt(s); - final int sIndex = subject.sampleIndex(sample); - Assert.assertTrue(sIndex >= 0); - Assert.assertTrue(sIndex < sampleCount); - final int sampleReadCount = sampleToReads.get(sample).size(); - final ReadLikelihoods.Matrix sampleLikelihoods = subject.sampleMatrix(sIndex); - for (int a = 0; a < alleleCount; a++) { - final Allele allele = alleleList.alleleAt(a); - final int aIndex = subject.alleleIndex(allele); - Assert.assertEquals(aIndex >= 0,alleleWithLikelihoodsSet.contains(allele)); - Assert.assertTrue(aIndex < alleleCount); - if (aIndex == -1) continue; - for (int r = 0; r < sampleReadCount; r++) { - final GATKSAMRecord read = sampleToReads.get(sample).get(r); - final int rIndex = subject.readIndex(sIndex,read); - final int rIndex2 = sampleLikelihoods.readIndex(read); - Assert.assertEquals(rIndex,rIndex2); - Assert.assertEquals(rIndex >= 0,readsWithLikelihoodsSet.contains(read)); - Assert.assertTrue(rIndex < sampleReadCount); - if (rIndex == -1) - continue; - final double likelihood = sampleLikelihoods.get(aIndex,rIndex); - Assert.assertEquals(likelihood,testLikelihood(s,a,r)); - } - } - } - } - - private double testLikelihood(final int sampleIndex, final int alleleIndex, final int readIndex) { - return - Math.abs(31 * (sampleIndex + 1) + 101 * alleleIndex + 1009 * readIndex); - } - - - private final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - private void testLikelihoodMatrixQueries(final AlleleList alleles, final SampleList samples, - final Map> sampleToReads, ReadLikelihoods result) { - for (final String sample : SampleListUtils.asList(samples)) { - final int sampleIndex = result.sampleIndex(sample); - final ReadLikelihoods.Matrix likelihoodMatrix = result.sampleMatrix(sampleIndex); - final int sampleReadCount = sampleToReads.get(sample).size(); - final List reads = sampleToReads.get(sample); - 
Assert.assertEquals(likelihoodMatrix.alleleCount(), alleles.alleleCount()); - Assert.assertEquals(likelihoodMatrix.readCount(), sampleReadCount); - for (int a = 0; a < likelihoodMatrix.alleleCount(); a++) { - Assert.assertEquals(likelihoodMatrix.alleleAt(a),alleles.alleleAt(a)); - for (int r = 0; r < sampleReadCount; r++) { - Assert.assertEquals(likelihoodMatrix.readAt(r),reads.get(r)); - Assert.assertEquals(likelihoodMatrix.get(a, r), 0.0); - } - } - } - } - - private void testAlleleQueries(final AlleleList alleles, ReadLikelihoods result) { - final Set alleleIndices = new HashSet<>(); - for (final Allele allele : AlleleListUtils.asList(alleles)) { - final int alleleIndex = result.alleleIndex(allele); - Assert.assertTrue(alleleIndex >= 0); - Assert.assertFalse(alleleIndices.contains(alleleIndex)); - alleleIndices.add(alleleIndex); - Assert.assertSame(allele,alleles.alleleAt(alleleIndex)); - } - } - - private void testSampleQueries(final SampleList samples, Map> reads, - final ReadLikelihoods result) { - final Set sampleIds = new HashSet<>(samples.sampleCount()); - for (final String sample : SampleListUtils.asList(samples)) { - final int sampleIndex = result.sampleIndex(sample); - Assert.assertTrue(sampleIndex >= 0); - Assert.assertFalse(sampleIds.contains(sampleIndex)); - sampleIds.add(sampleIndex); - - final List sampleReads = result.sampleReads(sampleIndex); - final Set sampleReadsSet = new HashSet<>(sampleReads); - final List expectedSampleReadArray = reads.get(sample); - final Set expectedSampleReadsSet = new HashSet<>(expectedSampleReadArray); - Assert.assertEquals(sampleReadsSet,expectedSampleReadsSet); - - final int sampleReadCount = sampleReads.size(); - for (int r = 0; r < sampleReadCount; r++) { - Assert.assertSame(sampleReads.get(r), expectedSampleReadArray.get(r)); - final int readIndex = result.readIndex(sampleIndex, sampleReads.get(r)); - Assert.assertEquals(readIndex,r); - } - } - } - - private AlleleList alleleList(final int alleleCount, final 
boolean hasReference) { - final Allele[] alleles = AlleleListUnitTester.generateRandomAlleles(alleleCount,100); - if (hasReference) { - final int referenceIndex = rnd.nextInt(alleleCount); - alleles[referenceIndex] = Allele.create(alleles[referenceIndex].getBases(),true); - } - final AlleleList alleleList = new IndexedAlleleList<>(alleles); - if (alleleList.alleleCount() != alleles.length) - throw new SkipException("repeated alleles, should be infrequent"); - return alleleList; - } - - private SAMFileHeader SAM_HEADER = ArtificialSAMUtils.createArtificialSamHeader(10, 0, 1000); - final GenomeLocParser locParser = new GenomeLocParser(SAM_HEADER.getSequenceDictionary()); - - - private int[][] READ_COUNTS = new int[][] { - {}, - { 100 }, - { 0 }, - { 0, 0, 0 }, - { 1, 0, 1 }, - { 100, 10 , 100}, - { 1000, 10, 100, 20, 23 } - }; - - private int[] ALLELE_COUNTS = new int[] { 0, 1, 2, 3, 10, 20 }; - - @DataProvider(name="readCountsAndAlleleCountData") - public Object[][] readCountsAndAlleleCountData() { - final Object[][] result = new Object[READ_COUNTS.length * ALLELE_COUNTS.length * 2][]; - int index = 0; - for (final int[] readCounts : READ_COUNTS) - for (final int alleleCount : ALLELE_COUNTS) { - result[index++] = new Object[]{ readCounts, alleleCount, false}; - result[index++] = new Object[]{ readCounts, alleleCount, true}; - } - return result; - } - - @DataProvider(name="readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference") - public Object[][] readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference() { - final Object[][] raw = readCountsAndAlleleCountData(); - final List result = new ArrayList<>(raw.length); - for (final Object[] paramSet : raw) - if (!paramSet[2].equals(true) || !paramSet[1].equals(0)) - result.add(paramSet); - return result.toArray(new Object[result.size()][]); - } - - @DataProvider(name="readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference") - public Object[][] 
readCountsAndAlleleCountDataSkippingNoLikelihoodsOrNoAlleleAndWithReference() { - final Object[][] raw = readCountsAndAlleleCountDataSkippingNoAlleleAndWithReference(); - final List result = new ArrayList<>(raw.length); - for (final Object[] paramSet : raw) { - final int[] readCounts = (int[]) paramSet[0]; - final long totalReadCount = MathUtils.sum(readCounts); - if (totalReadCount > 0) - result.add(paramSet); - } - return result.toArray(new Object[result.size()][]); - } - - private SampleList sampleList(final int[] readCounts) { - final List samples = new ArrayList<>(readCounts.length); - for (int i = 0; i < readCounts.length; i++) - samples.add("SAMPLE_" + i); - return new IndexedSampleList(samples); - } - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeBaseComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeBaseComparatorUnitTest.java deleted file mode 100644 index 528da1762..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeBaseComparatorUnitTest.java +++ /dev/null @@ -1,82 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.Utils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class HaplotypeBaseComparatorUnitTest extends BaseTest { - @Test - public void testComparison() { - final List rawStrings = Arrays.asList("A", "C", "AC", "CT", "GTC", "ACGT"); - final List lexStrings = new ArrayList(rawStrings); - Collections.sort(lexStrings); - - for ( final List seqs : Utils.makePermutations(lexStrings, lexStrings.size(), false) ) { - final List haps = new ArrayList(seqs.size()); - for ( final String seq : seqs ) { - haps.add(new Haplotype(seq.getBytes(), false)); - } - - Collections.sort(haps, new HaplotypeBaseComparator()); - for ( int i = 0; i < lexStrings.size(); i++ ) - Assert.assertEquals(haps.get(i).getBaseString(), lexStrings.get(i), "Failed sort " + haps + " expected " + lexStrings); - } - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculatorUnitTest.java deleted file mode 100644 index aae88fbfb..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeLDCalculatorUnitTest.java +++ /dev/null @@ -1,123 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -public class HaplotypeLDCalculatorUnitTest extends BaseTest { - HaplotypeLDCalculator calculator; - - @BeforeMethod - public void setUp() throws Exception { - calculator = new HaplotypeLDCalculator(); - } - - /** - * Tests that we get the right values from the R^2 calculation - */ - @Test - public void computeProbOfBeingPhased() { - logger.warn("Executing testCalculateR2LD"); - - // See AA, AB, and BA in population - Assert.assertEquals(calculator.pPhasedTest(0, 0, 0, -100), 0, 0.00001); - - // See AA, AB, BB in population - Assert.assertTrue(calculator.pPhasedTest(0, 0, -100, 0) < 0.5); - - // See AA and BB in population - Assert.assertEquals(calculator.pPhasedTest(0, -100, -100, 0), 1, 0.00001); - - // See AA, AB, and BA but no BBs in population - Assert.assertEquals(calculator.pPhasedTest(0, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001); - - // See BB, AB, and BA but no AAs in population, so BB is the best explanation - Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, 0), 1, 0.00001); - - // See only AB and BA but no AAs nor BBs in population - Assert.assertEquals(calculator.pPhasedTest(Double.NEGATIVE_INFINITY, -20, -40, Double.NEGATIVE_INFINITY), 0, 0.00001); - - // Previously bad input - Assert.assertEquals(calculator.pPhasedTest(-400, -600, -1200, Double.NEGATIVE_INFINITY), 0, 0.00001); - - // first variant is just bad, so BA and BB are both very bad, shouldn't be phased - Assert.assertEquals(calculator.pPhasedTest(0, -1000, -100, -10000), 0, 0.00001); - - // second variant is just bad, so AB and BB are both very bad, shouldn't be phased - Assert.assertEquals(calculator.pPhasedTest(0, -100, -1000, -10000), 0, 0.00001); - - // AA is very good, all all others are quite poor. 
Shouldn't be phased - Assert.assertEquals(calculator.pPhasedTest(0, -1000, -1000, -10000), 0, 0.00001); - - - for ( int i = -10; i > -10000; i -= 10 ) { - // only bad het states - Assert.assertTrue(calculator.pPhasedTest(0, i, i, 0) > 0.99, "Failed for " + i); - - // BB state is terrible - Assert.assertTrue(calculator.pPhasedTest(0, 0, 0, i) < 0.5, "Failed for " + i); - - // truth is AB, BA, and BB - Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, 0) < 0.5, "Failed for " + i); - - // truth is AB, BA - Assert.assertTrue(calculator.pPhasedTest(i, 0, 0, i) < 0.5, "Failed for " + i); - - // Only good signal is AB, so we shouldn't be phased - Assert.assertTrue(calculator.pPhasedTest(i, i, 0, i) < 0.5, "Failed for " + i); - Assert.assertTrue(calculator.pPhasedTest(i, 0, i, i) < 0.5, "Failed for " + i); - } - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeScoreComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeScoreComparatorUnitTest.java deleted file mode 100644 index ea368e631..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeScoreComparatorUnitTest.java +++ /dev/null @@ -1,81 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.Utils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class HaplotypeScoreComparatorUnitTest extends BaseTest { - @Test - public void testComparison() { - final List scores = Arrays.asList(3.0, 2.0, 1.0); - for ( final List myScores : Utils.makePermutations(scores, scores.size(), false) ) { - final List haps = new ArrayList(myScores.size()); - for ( final double score : myScores ) { - final Haplotype h = new Haplotype("ACT".getBytes(), false); - h.setScore(score); - haps.add(h); - } - - Collections.sort(haps, new HaplotypeScoreComparator()); - for ( int i = 0; i < myScores.size(); i++ ) - Assert.assertEquals(haps.get(i).getScore(), scores.get(i)); - } - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java deleted file mode 100644 index 1808ac19a..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeSizeAndBaseComparatorUnitTest.java +++ /dev/null @@ -1,87 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.Utils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -/** - * User: btaylor - * Date: 8/1/13 - * Time: 11:09 AM - */ -public class HaplotypeSizeAndBaseComparatorUnitTest extends BaseTest { - @Test - public void testComparison() { - // desired ordering is by size first, subordered by lexacographic relationship between bases - final List rawStrings = Arrays.asList("A", "C", "AC", "CC", "CT", "AAT", "ACT", "GAT", "ACGT"); - final List lexStrings = new ArrayList<>(rawStrings); - - for ( final List seqs : Utils.makePermutations(lexStrings, lexStrings.size(), false) ) { - final List haps = new ArrayList<>(seqs.size()); - for ( final String seq : seqs ) { - haps.add(new Haplotype(seq.getBytes(), false)); - } - - Collections.sort(haps, new HaplotypeSizeAndBaseComparator()); - for ( int i = 0; i < lexStrings.size(); i++ ) - Assert.assertEquals(haps.get(i).getBaseString(), lexStrings.get(i), "Failed sort " + haps + " expected " + lexStrings); - } - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/LDMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/LDMergerUnitTest.java deleted file mode 100644 index 337a91e44..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/haplotype/LDMergerUnitTest.java +++ /dev/null @@ -1,339 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import htsjdk.samtools.TextCigarCodec; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.*; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.TreeSet; - -public class LDMergerUnitTest extends BaseTest { - LDMerger merger; - GenomeLocParser genomeLocParser; - - @BeforeClass - public void init() throws FileNotFoundException { - genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); - } - - @BeforeMethod - public void setUp() throws Exception { - merger = new LDMerger(); - } - - @Test - public void testCreateMergedVariantContext() { - logger.warn("Executing testCreateMergedVariantContext"); - - final byte[] ref = "AATTCCGGAATTCCGGAATT".getBytes(); - final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); - - // SNP + SNP = simple MNP - VariantContext thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - VariantContext nextVC = new VariantContextBuilder().loc("2", 1704, 1704).alleles("C","G").make(); - VariantContext truthVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","GG").source("merged").make(); - VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - 
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // SNP + ref + SNP = MNP with ref base gap - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCG").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + SNP - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // SNP + insertion - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CAAAAA").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // deletion + SNP - thisVC = new VariantContextBuilder().loc("2", 1703, 
1704).alleles("TC","T").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // SNP + deletion - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + deletion = MNP - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + deletion - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TAAAAA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 
1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // insertion + insertion - thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","TA").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","CA").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // deletion + deletion - thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); - nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("CG","C").make(); - truthVC = new VariantContextBuilder().loc("2", 1701, 1706).alleles("ATTCCG","ATCC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // deletion + insertion (abutting) - thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","A").make(); - nextVC = new VariantContextBuilder().loc("2", 1702, 1702).alleles("T","GCGCGC").make(); - truthVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("AT","AGCGCGC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - 
logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - - // complex + complex - thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make(); - nextVC = new VariantContextBuilder().loc("2", 1706, 1707).alleles("GG","AC").make(); - truthVC = new VariantContextBuilder().loc("2", 1703, 1707).alleles("TCCGG","AAACAC").source("merged").make(); - mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - logger.warn(truthVC + " == " + mergedVC); - Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC)); - Assert.assertEquals(truthVC.getStart(), mergedVC.getStart()); - Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd()); - } - - @Test - public void testInsertionDeletionBecomingNullAllele() { - final byte[] ref = "CAAA".getBytes(); - final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length); - - // insertion + deletion results in a null allele, should return false - final VariantContext thisVC = new VariantContextBuilder().loc("2", 1700, 1701).alleles("CA","C").make(); - final VariantContext nextVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("A","AA").make(); - final VariantContext mergedVC = merger.createMergedVariantContext(thisVC, nextVC, ref, refLoc); - Assert.assertNull(mergedVC, "Insertion deletion becoming a null allele should return a null variant context"); - } - - /** - * Just returns a given R2 value for testing - */ - private static class MockLDCalculator extends HaplotypeLDCalculator { - private final double R2; - - private MockLDCalculator(double r2) { - R2 = r2; - } - - @Override - protected double computeProbOfBeingPhased(VariantContext first, VariantContext second) { - return R2; - } - } - - @DataProvider(name = "R2MergerData") - public Object[][] makeR2MergerData() { - List tests = new ArrayList(); - 
- // this functionality can be adapted to provide input data for whatever you might want in your data - final double thres = LDMerger.MERGE_EVENTS_PROB_PHASED_THRESHOLD; - for ( final double r2 : Arrays.asList(0.0, thres - 0.01, thres + 0.01, 1.0) ) { - tests.add(new Object[]{"ACGT", "CCGC", 2, "4M", "ACGT", "CCGC", r2, r2 >= thres}); - tests.add(new Object[]{"ACGT", "AGGC", 2, "4M", "CGT", "GGC", r2, r2 >= thres}); - tests.add(new Object[]{"ACGT", "ACCC", 2, "4M", "GT", "CC", r2, r2 >= thres}); - tests.add(new Object[]{"ACGT", "ACCGTT", 2, "2M1I1M1I1M", "CG", "CCGT", r2, r2 >= thres}); - tests.add(new Object[]{"ACGT", "AGCT", 2, "4M", "CG", "GC", r2, r2 >= thres}); - tests.add(new Object[]{"ACAGT", "AAGC", 2, "1M1D3M", "ACAGT", "AAGC", r2, r2 >= thres}); - tests.add(new Object[]{"ACAGT", "AAT", 2, "1M1D1M1D1M", "ACAG", "AA", r2, r2 >= thres}); - - // cannot be merged -- only 1 event - tests.add(new Object[]{"AAA", "ACA", 1, "3M", null, null, r2, false}); - - final int dist = LDMerger.MAX_DISTANCE_BETWEEN_SNPS_TO_MERGE + 2; - tests.add(new Object[]{Utils.dupString("A", dist), "C" + Utils.dupString("A", dist - 2) + "C", 2, dist + "M", null, null, r2, false}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "R2MergerData") - public void testR2Merger(final String refS, final String hapS, int nEvents, final String cigar, final String expectedMergedRef, final String expectedMergedAlt, final double r2, final boolean expectMerge) { - final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M")); - final Haplotype hap = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); - final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); - - final List haplotypes = Arrays.asList(ref, hap); - final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); - final MockLDCalculator r2Calc = new 
MockLDCalculator(r2); - - Assert.assertEquals(vcStarts.size(), nEvents); - final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); - Assert.assertEquals(merged, expectMerge); - Assert.assertEquals(vcStarts.size(), expectMerge ? 1 : nEvents); - if ( expectMerge ) { - final VariantContext vc = hap.getEventMap().getVariantContexts().iterator().next(); - Assert.assertTrue(vc.isBiallelic()); - Assert.assertEquals(vc.getReference().getDisplayString(), expectedMergedRef); - Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), expectedMergedAlt); - } - } - - @Test - public void testR2MergerWithThirdHapWithoutEvent() { - final String refS = "ACGT"; - final String hapS = "CCGA"; - final String cigar = "4M"; - final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M")); - final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); - final Haplotype hap2 = new Haplotype("ACGA".getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); - final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); - - final List haplotypes = Arrays.asList(ref, hap1, hap2); - final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); - final MockLDCalculator r2Calc = new MockLDCalculator(1.0); - - Assert.assertEquals(vcStarts.size(), 2); - final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); - Assert.assertEquals(merged, true); - Assert.assertEquals(vcStarts.size(), 1); - - final VariantContext vc = hap1.getEventMap().getVariantContexts().iterator().next(); - Assert.assertTrue(vc.isBiallelic()); - Assert.assertEquals(vc.getReference().getDisplayString(), "ACGT"); - Assert.assertEquals(vc.getAlternateAllele(0).getDisplayString(), "CCGA"); - - 
Assert.assertEquals(hap2.getEventMap().size(), 0); - } - - @Test - public void testR2MergerWithMultipleAllelesAtSites() { - final String refS = "ACGT"; - final String hapS = "TCGA"; - final String cigar = "4M"; - final Haplotype ref = new Haplotype(refS.getBytes(), true, 0, TextCigarCodec.getSingleton().decode(refS.length() + "M")); - final Haplotype hap1 = new Haplotype(hapS.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); - - final GenomeLoc loc = new UnvalidatingGenomeLoc("1", 0, 1, ref.length()); - for (final String hap2S : Arrays.asList("GCGA", "TCGG")) { - final Haplotype hap2 = new Haplotype(hap2S.getBytes(), false, 0, TextCigarCodec.getSingleton().decode(cigar)); - - final List haplotypes = Arrays.asList(ref, hap1, hap2); - final TreeSet vcStarts = EventMap.buildEventMapsForHaplotypes(haplotypes, ref.getBases(), loc, false); - final MockLDCalculator r2Calc = new MockLDCalculator(1.0); - - Assert.assertEquals(vcStarts.size(), 2); - final boolean merged = merger.mergeConsecutiveEventsBasedOnLDOnce(haplotypes, r2Calc, 1, vcStarts, ref.getBases(), loc); - Assert.assertEquals(merged, false); - Assert.assertEquals(vcStarts.size(), 2); - } - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoSchedulerIntegrationTest.java deleted file mode 100644 index f962260e9..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoSchedulerIntegrationTest.java +++ /dev/null @@ -1,101 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.nanoScheduler; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -// ********************************************************************************** // -// Note that this class also serves as an integration test for the VariantAnnotator! 
// -// ********************************************************************************** // - -public class NanoSchedulerIntegrationTest extends WalkerTest { - @DataProvider(name = "NanoSchedulerUGTest") - public Object[][] createNanoSchedulerUGTest() { - List tests = new ArrayList(); - - for ( final int nt : Arrays.asList(1, 2) ) - for ( final int nct : Arrays.asList(1, 2) ) { - tests.add(new Object[]{ "BOTH", "18418ddc2bdbe20c38ece6dd18535be7", nt, nct }); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "NanoSchedulerUGTest") - private void testNanoSchedulerUGTest(final String glm, final String md5, final int nt, final int nct ) { - WalkerTestSpec spec = new WalkerTestSpec( - buildCommandLine( - "-T UnifiedGenotyper -R " + b37KGReference, - "--no_cmdline_in_header -G none", - "-I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", - "-L 20:10,000,000-10,100,000", - "-glm " + glm, - "--contamination_fraction_to_filter 0.0", - "-nt " + nt, - "-nct " + nct, - "-o %s" - ), - 1, - Arrays.asList(md5) - ); - executeTest(String.format("testUG-glm:%s-nt%d-nct%d", glm, nt, nct), spec); - } - - - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/ActiveRegionTestDataSet.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/ActiveRegionTestDataSet.java deleted file mode 100644 index a41db9386..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/ActiveRegionTestDataSet.java +++ /dev/null @@ -1,593 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.pairhmm; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import org.apache.commons.math.distribution.ExponentialDistribution; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyResult; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.AssemblyResultSet; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.Civar; -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.readthreading.ReadThreadingGraph; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** -* Mock-up active region data used in testing. 
-* -* @author Valentin Ruano-Rubio <valentin@broadinstitute.org> -*/ -public class ActiveRegionTestDataSet { - - private final byte[] referenceBytes; - protected String reference; - protected String[] haplotypeCigars; - protected List haplotypeStrings; - protected String[] readCigars; - protected byte[] bq; - protected byte[] dq; - protected byte[] iq; - protected int kmerSize; - private List haplotypeList; - private List readList; - private AssemblyResultSet assemblyResultSet; - private Map readBySequence; - private String stringRepresentation; - private List> readEventOffsetList; - private GenomeLocParser genomeLocParser; - - /** Create a new active region data test set */ - public ActiveRegionTestDataSet(final int kmerSize, final String reference, final String[] haplotypes, - final String[] readCigars, final byte[] bq, final byte[] dq, final byte[] iq) { - this.reference = reference; - this.referenceBytes = reference.getBytes(); - this.haplotypeCigars = haplotypes; - this.readCigars = readCigars; - this.bq = bq; - this.dq = dq; - this.iq = iq; - this.kmerSize = kmerSize; - this.genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1,1,reference.length()).getSequenceDictionary()); - } - - public String getReference() { - return reference; - } - - public String toString() { - if (stringRepresentation == null) - return super.toString(); - else return stringRepresentation; - } - - public AssemblyResultSet assemblyResultSet() { - if (assemblyResultSet == null) { - final ReadThreadingGraph rtg = new ReadThreadingGraph(kmerSize); - rtg.addSequence("anonymous", this.getReference().getBytes(), true); - for (final String haplotype : this.haplotypesStrings()) { - rtg.addSequence("anonymous", haplotype.getBytes(), false); - } - rtg.buildGraphIfNecessary(); - if (rtg.hasCycles()) - throw new RuntimeException("there is cycles in the reference with kmer size " + kmerSize + ". 
Don't use this size for the benchmark or change the reference"); - - List haplotypeList = this.haplotypeList(); - - assemblyResultSet = new AssemblyResultSet(); - final AssemblyResult ar = new AssemblyResult((haplotypeList.size() > 1 ? - AssemblyResult.Status.ASSEMBLED_SOME_VARIATION : AssemblyResult.Status.JUST_ASSEMBLED_REFERENCE),rtg.convertToSequenceGraph()); - ar.setThreadingGraph(rtg); - - for (final Haplotype h : haplotypeList) - assemblyResultSet.add(h, ar); - } - return assemblyResultSet; - } - - public List haplotypesStrings() { - if (haplotypeStrings != null) { - return haplotypeStrings; - } - final List result = new ArrayList<>(haplotypeCigars.length); - String reference = this.reference; - for (final String cigar : haplotypeCigars) { - if (cigar.matches("^Civar:.*$")) { - stringRepresentation = cigar.substring(6); - result.addAll(expandAllCombinations(cigar.substring(6),reference)); - } else if (cigar.matches("^.*\\d+.*$")) { - result.add(applyCigar(reference, cigar,0,true)); - } else { - result.add(cigar); - } - } - haplotypeStrings = result; - return result; - } - - private List expandAllCombinations(final String cigarString, final String reference) { - final Civar civar = Civar.fromCharSequence(cigarString); - final List unrolledCivars = civar.optionalizeAll().unroll(); - List result = new ArrayList<>(unrolledCivars.size()); - for (final Civar c : unrolledCivars) { - result.add(c.applyTo(reference)); - } - return result; - } - - private List expandAllHaplotypeCombinations(final String civarString, final String reference) { - final Civar civar = Civar.fromCharSequence(civarString); - final List unrolledCivars = civar.optionalizeAll().unroll(); - List result = new ArrayList<>(unrolledCivars.size()); - for (final Civar c : unrolledCivars) { - final String baseString = c.applyTo(reference); - final Haplotype haplotype = new Haplotype(baseString.getBytes(),baseString.equals(reference)); - 
haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); - try { - haplotype.setCigar(c.toCigar(reference.length())); - } catch (final RuntimeException ex) { - c.applyTo(reference); - c.toCigar(reference.length()); - throw new RuntimeException("" + c + " " + ex.getMessage(),ex); - } - result.add(haplotype); - } - return result; - } - - - public List haplotypeList() { - if (haplotypeList == null) { - - final List result = new ArrayList<>(haplotypeCigars.length); - final String reference = this.reference; - for (final String cigar : haplotypeCigars) { - if (cigar.matches("^Civar:.*$")) { - stringRepresentation = cigar.substring(6); - result.addAll(expandAllHaplotypeCombinations(cigar.substring(6), reference)); - } else if (cigar.matches("^.*\\d+.*$")) { - result.add(cigarToHaplotype(reference, cigar, 0, true)); - } else { - final Haplotype h = new Haplotype(cigar.getBytes()); - h.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); - result.add(h); - } - } - haplotypeList = result; - } - return haplotypeList; - } - - - protected SAMSequenceDictionary artificialSAMSequenceDictionary() { - return new SAMSequenceDictionary(Collections.singletonList(new SAMSequenceRecord("00",reference.length()))); - } - - protected SAMFileHeader artificialSAMFileHeader() { - return ArtificialSAMUtils.createArtificialSamHeader(artificialSAMSequenceDictionary()); - } - - public List readList() { - if (readList == null) { - final SAMFileHeader header = artificialSAMFileHeader(); - readList = new ArrayList<>(readCigars.length); - final List haplotypes = haplotypesStrings(); - int count = 0; - for (final String descr : readCigars) { - String sequence; - if (descr.matches("^\\d+:\\d+:.+$")) { - final String[] parts = descr.split(":"); - int allele = Integer.valueOf(parts[0]); - int offset = Integer.valueOf(parts[1]); - final String cigar = parts[2]; - final String base = allele == 0 ? 
reference : haplotypes.get(allele - 1); - sequence = applyCigar(base, cigar, offset, false); - final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); - readList.add(new MyGATKSAMRecord(samRecord)); - } else if (descr.matches("^\\*:\\d+:\\d+$")) { - int readCount = Integer.valueOf(descr.split(":")[1]); - int readLength = Integer.valueOf(descr.split(":")[2]); - readList.addAll(generateSamRecords(haplotypes, readCount, readLength, header, count)); - } else { - sequence = descr; - final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header, "read_" + count, 0, 1, sequence.getBytes(), Arrays.copyOf(bq, sequence.length())); - readList.add(new MyGATKSAMRecord(samRecord)); - } - count = readList.size(); - } - } - return readList; - } - - public List> readEventOffsetList() { - if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) - throw new UnsupportedOperationException(); - if (readEventOffsetList == null) { - final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); - final List unrolledCivars = civar.optionalizeAll().unroll(); - - readEventOffsetList = new ArrayList<>(readCigars.length); - int count = 0; - for (final String descr : readCigars) { - if (descr.matches("^\\d+:\\d+:.+$")) { - throw new UnsupportedOperationException(); - } else if (descr.matches("^\\*:\\d+:\\d+$")) { - int readCount = Integer.valueOf(descr.split(":")[1]); - int readLength = Integer.valueOf(descr.split(":")[2]); - readEventOffsetList.addAll(generateElementOffsetRecords(haplotypesStrings(), unrolledCivars, readCount, readLength, count)); - } else { - throw new UnsupportedOperationException(); - } - count = readEventOffsetList.size(); - } - readEventOffsetList = Collections.unmodifiableList(readEventOffsetList); - } - return readEventOffsetList; - } - - - - - @SuppressWarnings("unused") - public String cigarToSequence(final 
String cigar) { - String reference = this.reference; - return applyCigar(reference, cigar,0,true); - } - - @SuppressWarnings("unused") - public GATKSAMRecord readFromString(final String readSequence) { - if (readBySequence == null) { - final List readList = readList(); - readBySequence = new HashMap<>(readList.size()); - for (final GATKSAMRecord r : readList) - readBySequence.put(r.getReadString(),r); - } - return readBySequence.get(readSequence); - } - - public List unrolledCivars() { - if (haplotypeCigars.length != 1 || !haplotypeCigars[0].startsWith("Civar:")) - throw new UnsupportedOperationException(); - final Civar civar = Civar.fromCharSequence(haplotypeCigars[0].substring(6)); - return civar.optionalizeAll().unroll(); - } - - public void introduceErrors(final Random rnd) { - final List reads = readList(); - final ArrayList result = new ArrayList<>(reads.size()); - for (final GATKSAMRecord read : reads) { - result.add(new MyGATKSAMRecord(read,rnd)); - } - readList = result; - } - - private class MyGATKSAMRecord extends GATKSAMRecord { - protected MyGATKSAMRecord(final GATKSAMRecord r) { - super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), - (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), - r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), - new byte[0]); - this.setReadBases(r.getReadBases()); - this.setBaseQualities(r.getBaseQualities()); - this.setReadName(r.getReadName()); - } - - ExponentialDistribution indelLengthDist = MathUtils.exponentialDistribution(1.0 / 0.9); - - public MyGATKSAMRecord(final GATKSAMRecord r, final Random rnd) { - super(r.getHeader(), r.getReferenceIndex(), r.getAlignmentStart(), (short) r.getReadNameLength(), - (short) 100, -1, r.getCigarLength(), r.getFlags(), r.getReadLength(), - r.getMateReferenceIndex(), r.getMateAlignmentStart(), r.getInferredInsertSize(), - new byte[0]); - final byte[] bases = new byte[r.getReadBases().length]; 
- - final byte[] readBases = r.getReadBases(); - final byte[] bq = r.getBaseQualities(); - final byte[] iq = r.getBaseInsertionQualities(); - final byte[] dq = r.getBaseDeletionQualities(); - int refOffset = r.getAlignmentStart() - 1; - int readOffset = 0; - for (int i = 0; i < r.getReadBases().length;) { - double p = rnd.nextDouble(); - double iqp = QualityUtils.qualToErrorProb(iq[i]); - if (p < iqp) { // insertion - final int length = Math.min(generateIndelLength(rnd),r.getReadBases().length - i); - final int refStart = rnd.nextInt(reference.length() - length); - System.arraycopy(referenceBytes,refStart,bases,i,length); - i += length; - continue; - } - p -= iqp; - double dqp = QualityUtils.qualToErrorProb(dq[i]); - if (p < dqp) { - final int length = generateIndelLength(rnd); - refOffset += length; - refOffset = refOffset % referenceBytes.length; - readOffset += length; - continue; - } - p -= dqp; - double bqp = QualityUtils.qualToErrorProb(bq[i]); - byte b = readOffset < readBases.length ? 
readBases[readOffset] : referenceBytes[refOffset]; - byte nb; - if (p < bqp) { - switch (b) { - case 'A': nb = 'C'; break; - case 'T': nb = 'A'; break; - case 'C': nb = 'G'; break; - case 'G': nb = 'B'; break; - default: nb = 'A'; - } - } else - nb = b; - - bases[i++] = nb; - refOffset++; - refOffset = refOffset % referenceBytes.length; - readOffset++; - } - this.setReadBases(bases); - this.setBaseQualities(r.getBaseQualities()); - this.setReadName(r.getReadName()); - - - } - - private int generateIndelLength(final Random rnd) { - final int length; - try { - length = (int) Math.round(indelLengthDist.inverseCumulativeProbability(rnd.nextDouble()) + 1); - } catch (Exception e) { - throw new RuntimeException(e); - } - return length; - } - - @Override - public byte[] getBaseDeletionQualities() { - return Arrays.copyOf(dq,getReadLength()); - } - - @Override - public byte[] getBaseInsertionQualities() { - return Arrays.copyOf(iq,getReadLength()); - } - - @Override - public int getMappingQuality() { - return 100; - } - - @Override - public int hashCode() { - return getReadName().hashCode(); - } - - @Override - public boolean equals(Object o) { - if (o instanceof GATKSAMRecord) { - return getReadName().equals(((GATKSAMRecord)o).getReadName()); - } else { - return false; - } - } - - public String toString() { - return super.toString() + " " + this.getReadString(); - } - } - - - public List readStrings() { - final List result = new ArrayList<>(readCigars.length); - final List haplotypes = haplotypesStrings(); - for (final String descr : readCigars) { - String sequence; - if (descr.matches("^\\d+:\\d+:.+$")) { - final String[] parts = descr.split(":"); - int allele = Integer.valueOf(parts[0]); - int offset = Integer.valueOf(parts[1]); - final String cigar = parts[2]; - final String base = allele == 0 ? 
reference : haplotypes.get(allele - 1); - sequence = applyCigar(base, cigar, offset, false); - result.add(sequence); - } else if (descr.matches("\\*:^\\d+:\\d+")) { - int readCount = Integer.valueOf(descr.split(":")[1]); - int readLength = Integer.valueOf(descr.split(":")[2]); - result.addAll(generateReads(haplotypes, readCount, readLength)); - } else { - sequence = descr; - result.add(sequence); - } - } - return result; - } - - private List generateReads(final List haplotypes, final int readCount, final int readLength) { - final List result = new ArrayList<>(readCount); - for (int i = 0; i < readCount; i++) { - int hi = i % haplotypes.size(); - final String h = haplotypes.get(hi); - int offset = i % h.length() - readLength; - result.add(h.substring(offset,offset + readLength)); - } - return result; - } - - private List generateSamRecords(final List haplotypes, final int readCount, final int readLength, final SAMFileHeader header, final int idStart) { - int id = idStart; - final List result = new ArrayList<>(readCount); - for (int i = 0; i < readCount; i++) { - int hi = i % haplotypes.size(); - final String h = haplotypes.get(hi); - int offset = h.length() <= readLength ? 0 : i % (h.length() - readLength); - int to = Math.min(h.length(),offset + readLength); - byte[] bases = h.substring(offset,to).getBytes(); - byte[] quals = Arrays.copyOf(bq,to - offset); - final GATKSAMRecord samRecord = ArtificialSAMUtils.createArtificialRead(header,"read_" + id++,0,offset + 1,bases, quals); - result.add(new MyGATKSAMRecord(samRecord)); - } - return result; - } - - - private List> generateElementOffsetRecords(final List haplotypes, final List unrolledCivars, final int readCount, final int readLength, final int count) { - - final List> result = new ArrayList<>(readCount); - for (int i = 0; i < readCount; i++) { - int hi = i % unrolledCivars.size(); - final Civar c = unrolledCivars.get(hi); - final String h = haplotypes.get(hi); - int offset = h.length() <= readLength ? 
0 : i % (h.length() - readLength); - int to = Math.min(h.length(),offset + readLength); - result.add(c.eventOffsets(reference,offset,to)); - } - return result; - } - - private static final Pattern cigarPattern = Pattern.compile("(\\d+)([=A-Z])"); - - - private Haplotype cigarToHaplotype(final String reference, final String cigar, final int offset, final boolean global) { - final String sequence = applyCigar(reference,cigar,offset,global); - final Haplotype haplotype = new Haplotype(sequence.getBytes(),reference.equals(sequence)); - haplotype.setGenomeLocation(genomeLocParser.createGenomeLoc("chr1",1,reference.length())); - haplotype.setCigar(Civar.fromCharSequence(cigar).toCigar(reference.length())); - return haplotype; - } - - private String applyCigar(final String reference, final String cigar, final int offset, final boolean global) { - final Matcher pm = cigarPattern.matcher(cigar); - StringBuffer sb = new StringBuffer(); - int index = offset; - while (pm.find()) { - int length = Integer.valueOf(pm.group(1)); - char operator = pm.group(2).charAt(0); - switch (operator) { - case '=' : - try { - sb.append(reference.substring(index, index + length)); - } catch (Exception e) { - throw new RuntimeException(" " + index + " " + (index + length) + " " + reference.length() + " " + cigar,e); - } - index += length; break; - case 'D' : - index += length; break; - case 'I' : - String insert = cigar.substring(pm.end(),pm.end() + length).toUpperCase(); - sb.append(insert); break; - case 'V' : - sb.append(transversionV(reference.charAt(index))); index++; break; - case 'W' : - sb.append(transversionW(reference.charAt(index))); index++; break; - case 'T' : - sb.append(transition(reference.charAt(index))); index++; break; - default: - throw new UnsupportedOperationException("cigar operator " + operator + " not supported."); - } - } - if (global && index != reference.length()) { - throw new RuntimeException(" haplotype cigar does not explain reference length (" + index + " != " + 
reference.length() + ") on cigar " + cigar); - } else if (index > reference.length()) { - throw new RuntimeException(" index beyond end "); - } - return sb.toString(); - } - - protected int kmerSize() { - return kmerSize; - } - - private char transversionV(final char c) { - switch (Character.toUpperCase(c)) { - case 'A': return 'C'; - case 'G': return 'T'; - case 'C': return 'A'; - case 'T': return 'G'; - default: - return c; - } - - } - - private char transversionW(final char c) { - switch (Character.toUpperCase(c)) { - case 'A': return 'T'; - case 'G': return 'C'; - case 'T': return 'A'; - case 'C': return 'G'; - default: - return c; - } - - } - - private char transition(final char c) { - switch (Character.toUpperCase(c)) { - case 'A': return 'G'; - case 'G': return 'A'; - case 'T': return 'C'; - case 'C': return 'T'; - default: - return c; - } - - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMMUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMMUnitTest.java deleted file mode 100644 index 621ef7b1f..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/FastLoglessPairHMMUnitTest.java +++ /dev/null @@ -1,182 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.pairhmm; - -import org.broadinstitute.gatk.tools.walkers.haplotypecaller.ActiveRegionTestDataSetUnitTest; -import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.*; - - -/** - * Created with IntelliJ IDEA. - * User: valentin - * Date: 10/13/13 - * Time: 12:55 PM - * To change this template use File | Settings | File Templates. 
- */ -public class FastLoglessPairHMMUnitTest extends ActiveRegionTestDataSetUnitTest { - - private FastLoglessPairHMM unsorted = new FastLoglessPairHMM((byte)10); - private FastLoglessPairHMM sorted = new FastLoglessPairHMM((byte)10); - - @Test(enabled=false,dataProvider="activeRegionTestDataSets") - public void testActiveRegionsDataSet(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { - - } - - @Test(enabled=true,dataProvider="activeRegionTestDataSets") - public void testHaplotypeGrouped(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { - final List reads = as.readList(); - final List haplotypes = as.haplotypeList(); - PairHMMReadyHaplotypes haplotypeCollection = new PairHMMReadyHaplotypes(haplotypes.size()); - final List sortedHaplotypes = new ArrayList<>(haplotypes); - Collections.sort(sortedHaplotypes, HAPLOTYPE_COMPARATOR); - Map basesToPos = new HashMap<>(sortedHaplotypes.size()); - int nextIdx = 0; - - for (final Haplotype h : sortedHaplotypes) { - final byte[] bases = h.getBases(); - haplotypeCollection.add(bases); - basesToPos.put(bases,nextIdx++); - } - for (GATKSAMRecord read : reads) { - final double[] unsortedLikelihoods = new double[sortedHaplotypes.size()]; - final double[] sortedLikelihoods = new double[sortedHaplotypes.size()]; - unsorted.loadRead(read); - sorted.loadRead(read); - final Map unsortedResults = new HashMap<>(haplotypes.size()); - for (int i = 0; i < sortedHaplotypes.size(); i++) { - final Haplotype h = sortedHaplotypes.get(i); - final byte[] haplotypeBases = h.getBases().clone(); - unsorted.loadHaplotypeBases(haplotypeBases); - double lk = unsorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); - unsortedLikelihoods[i] = 
lk; - } - sorted.calculateLocalLikelihoods(0, read.getReadLength(), haplotypeCollection); - for (final PairHMMReadyHaplotypes.Entry entry : haplotypeCollection) { - final byte[] bases = entry.getBases(); - final double lk = entry.getLikelihood(); - final int haplotypePos = basesToPos.get(bases); - sortedLikelihoods[haplotypePos] = lk; - } - for (int i = 0; i < unsortedLikelihoods.length; i++) - Assert.assertEquals(unsortedLikelihoods[i],sortedLikelihoods[i],0.00000001,Arrays.toString(unsortedLikelihoods) + Arrays.toString(sortedLikelihoods)); - } - } - - @Test(enabled=true,dataProvider="activeRegionTestDataSets") - public void testSortedVsUnsorted(final ActiveRegionTestDataSet as, final int kmerSize, final int readLength, final String variation, final int readCount, final int regionSize, final byte bq, final byte iq, final byte dq) { - final List reads = as.readList(); - final List haplotypes = as.haplotypeList(); - final List sortedHaplotypes = new ArrayList<>(haplotypes); - Collections.sort(sortedHaplotypes, HAPLOTYPE_COMPARATOR); - - byte[] lastHaplotypeBases = null; - for (GATKSAMRecord read : reads) { - final double[] unsortedLikelihoods = new double[sortedHaplotypes.size()]; - final double[] sortedLikelihoods = new double[sortedHaplotypes.size()]; - unsorted.loadRead(read); - sorted.loadRead(read); - for (int i = 0; i < sortedHaplotypes.size(); i++) { - final Haplotype h = sortedHaplotypes.get(i); - final byte[] haplotypeBases = h.getBases().clone(); - final byte[] haplotypeBases2 = haplotypeBases.clone(); - int commonPrefixEnd = 0; - - - if (lastHaplotypeBases != null) { - final int prefixEndLimit = Math.min(lastHaplotypeBases.length,haplotypeBases.length); - for (commonPrefixEnd = 0; commonPrefixEnd < prefixEndLimit; commonPrefixEnd++) - if (lastHaplotypeBases[commonPrefixEnd] != haplotypeBases[commonPrefixEnd]) - break; - } - - unsorted.loadHaplotypeBases(haplotypeBases); - sorted.changeHaplotypeSuffix(commonPrefixEnd, haplotypeBases, commonPrefixEnd, 
haplotypeBases.length); - Assert.assertTrue(Arrays.equals(haplotypeBases2, unsorted.getHaplotypeBases())); - Assert.assertTrue(Arrays.equals(haplotypeBases2, sorted.getHaplotypeBases())); - unsortedLikelihoods[i] = unsorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); - sortedLikelihoods[i] = sorted.calculateLocalLikelihood(0, read.getReadLength(), 0, haplotypeBases.length, false); - Assert.assertTrue(Arrays.equals(haplotypeBases2,unsorted.getHaplotypeBases())); - Assert.assertTrue(Arrays.equals(haplotypeBases2,sorted.getHaplotypeBases())); - Assert.assertEquals((double)unsortedLikelihoods[i], (double) sortedLikelihoods[i],0.00000001); - lastHaplotypeBases = haplotypeBases; - } - } - } - - public static final Comparator HAPLOTYPE_COMPARATOR = new Comparator() { - - @Override - public int compare(final Haplotype o1, final Haplotype o2) { - if (o1 == o2) - return 0; - final byte[] bases1 = o1.getBases(); - final byte[] bases2 = o2.getBases(); - final int ilimit = Math.min(bases1.length,bases2.length); - for (int i = 0; i < ilimit; i++) { - final int cmp = Byte.compare(bases1[i],bases2[i]); - if (cmp != 0) return cmp; - } - if (bases1.length == bases2.length) return 0; - return (bases1.length > bases2.length) ? -1 : 1; // is a bit better to get the longest haplotypes first. 
- } - }; - - - - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java deleted file mode 100644 index b7d0b037e..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMProbabilityBugIntegrationTest.java +++ /dev/null @@ -1,92 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.pairhmm; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; - -/** - * Test for the Prob > 1 bug in PairHMM using callers. - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class PairHMMProbabilityBugIntegrationTest extends WalkerTest { - - private static final File REFERENCE = new File("/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta").getAbsoluteFile(); - private static final File BAM = new File (privateTestDir, "pairhmm_prob_bug.bam").getAbsoluteFile(); - private static final File INTERVAL = new File (privateTestDir, "pairhmm_prob_bug.interval.bed").getAbsoluteFile(); - - private static final File UG_BAM = new File(privateTestDir, "pairhmm_prob_bug.ug.bam").getAbsoluteFile(); - private static final File UG_INTERVAL = new File(privateTestDir, "pairhmm_prob_bug.ug.intervals.bed").getAbsoluteFile(); - - - @Test - public void testHaplotypeCaller() { - final String commandLine = String.format("-T HaplotypeCaller -R %s -I %s -L %s", - REFERENCE,BAM,INTERVAL); - final String name = getClass().getSimpleName() + ".testHaplotypeCaller"; - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o 
%s", Arrays.asList("")); - executeTest(name, spec); - } - - @Test - public void testUnifiedGenotyper() { - final String commandLine = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -dcov 200 -glm INDEL", - REFERENCE,UG_BAM,UG_INTERVAL); - final String name = getClass().getSimpleName() + ".testUnifiedGenotyper"; - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("")); - executeTest(name, spec); - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMUnitTest.java index 5643e3030..1c34405b0 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMUnitTest.java @@ -55,7 +55,6 @@ package org.broadinstitute.gatk.utils.pairhmm; // the imports for unit testing. import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.MathUtils; import org.broadinstitute.gatk.utils.QualityUtils; @@ -252,8 +251,8 @@ public class PairHMMUnitTest extends BaseTest { @DataProvider(name = "OptimizedLikelihoodTestProvider") public Object[][] makeOptimizedLikelihoodTests() { - GenomeAnalysisEngine.resetRandomGenerator(); - final Random random = GenomeAnalysisEngine.getRandomGenerator(); + Utils.resetRandomGenerator(); + final Random random = Utils.getRandomGenerator(); final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30); final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40); final List gcps = EXTENSIVE_TESTING ? 
Arrays.asList(10, 20, 30) : Arrays.asList(10); diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ContextCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ContextCovariateUnitTest.java deleted file mode 100644 index 188902bb5..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ContextCovariateUnitTest.java +++ /dev/null @@ -1,122 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.ContextCovariate; -import org.broadinstitute.gatk.utils.recalibration.covariates.Covariate; -import org.broadinstitute.gatk.utils.clipping.ClippingRepresentation; -import org.broadinstitute.gatk.utils.clipping.ReadClipper; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class ContextCovariateUnitTest { - ContextCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new ContextCovariate(); - covariate.initialize(RAC); - } - - @BeforeMethod - public void initCache() { - ReadCovariates.clearKeysCache(); - } - - @Test(enabled = true) - public void testSimpleContexts() { - GATKSAMRecord read = ReadUtils.createRandomRead(1000); - GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, 
RAC.LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - - verifyCovariateArray(readCovariates.getMismatchesKeySet(), RAC.MISMATCHES_CONTEXT_SIZE, clippedRead, covariate); - verifyCovariateArray(readCovariates.getInsertionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); - verifyCovariateArray(readCovariates.getDeletionsKeySet(), RAC.INDELS_CONTEXT_SIZE, clippedRead, covariate); - } - - public static void verifyCovariateArray(int[][] values, int contextSize, GATKSAMRecord read, Covariate contextCovariate) { - for (int i = 0; i < values.length; i++) - Assert.assertEquals(contextCovariate.formatKey(values[i][0]), expectedContext(read, i, contextSize)); - - } - - public static String expectedContext (GATKSAMRecord read, int offset, int contextSize) { - final String bases = stringFrom(read.getReadBases()); - String expectedContext = null; - if (offset - contextSize + 1 >= 0) { - String context = bases.substring(offset - contextSize + 1, offset + 1); - if (!context.contains("N")) - expectedContext = context; - } - return expectedContext; - } - - private static String stringFrom(byte[] array) { - String s = ""; - for (byte value : array) - s += (char) value; - return s; - } - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/CycleCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/CycleCovariateUnitTest.java deleted file mode 100644 index 316c28374..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/CycleCovariateUnitTest.java +++ /dev/null @@ -1,141 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made 
between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.recalibration.covariates.CycleCovariate; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class CycleCovariateUnitTest { - CycleCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new CycleCovariate(); - covariate.initialize(RAC); - } - - @BeforeMethod - public void initCache() { - ReadCovariates.clearKeysCache(); - } - - @Test(enabled = true) - public void testSimpleCycles() { - short readLength = 10; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), 1, (short) 1); - - read.setReadNegativeStrandFlag(true); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), readLength, -1); - - read.setSecondOfPairFlag(true); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), -readLength, 1); - - read.setReadNegativeStrandFlag(false); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), 
-1, -1); - } - - private void verifyCovariateArray(int[][] values, int init, int increment) { - for (short i = 0; i < values.length; i++) { - short actual = Short.decode(covariate.formatKey(values[i][0])); - int expected = init + (increment * i); - Assert.assertEquals(actual, expected); - } - } - - @Test(enabled = true, expectedExceptions={UserException.class}) - public void testMoreThanMaxCycleFails() { - int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - } - - @Test(enabled = true) - public void testMaxCyclePasses() { - int readLength = RAC.MAXIMUM_CYCLE_VALUE; - GATKSAMRecord read = ReadUtils.createRandomRead(readLength); - read.setReadPairedFlag(true); - read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); - read.getReadGroup().setPlatform("illumina"); - - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizerUnitTest.java deleted file mode 100644 index 90bad890e..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/QualQuantizerUnitTest.java +++ /dev/null @@ -1,195 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - - -// the imports for unit testing. 
- - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - - -public class QualQuantizerUnitTest extends BaseTest { - @BeforeSuite - public void before() { - - } - - // -------------------------------------------------------------------------------- - // - // merge case Provider - // - // -------------------------------------------------------------------------------- - - private class QualIntervalTestProvider extends TestDataProvider { - final QualQuantizer.QualInterval left, right; - int exError, exTotal, exQual; - double exErrorRate; - - private QualIntervalTestProvider(int leftE, int leftN, int rightE, int rightN, int exError, int exTotal) { - super(QualIntervalTestProvider.class); - - QualQuantizer qq = new QualQuantizer(0); - left = qq.new QualInterval(10, 10, leftN, leftE, 0); - right = qq.new QualInterval(11, 11, rightN, rightE, 0); - - this.exError = exError; - this.exTotal = exTotal; - this.exErrorRate = (leftE + rightE + 1) / (1.0 * (leftN + rightN + 1)); - this.exQual = QualityUtils.errorProbToQual(this.exErrorRate); - } - } - - @DataProvider(name = "QualIntervalTestProvider") - public Object[][] makeQualIntervalTestProvider() { - new QualIntervalTestProvider(10, 100, 10, 1000, 20, 1100); - new QualIntervalTestProvider(0, 100, 10, 900, 10, 1000); - new QualIntervalTestProvider(10, 900, 0, 100, 10, 1000); - new QualIntervalTestProvider(0, 0, 10, 100, 10, 100); - new QualIntervalTestProvider(1, 10, 9, 90, 10, 100); - new QualIntervalTestProvider(1, 10, 9, 100000, 10, 100010); - new QualIntervalTestProvider(1, 10, 9, 1000000, 10,1000010); - - return QualIntervalTestProvider.getTests(QualIntervalTestProvider.class); - } - - 
@Test(dataProvider = "QualIntervalTestProvider") - public void testQualInterval(QualIntervalTestProvider cfg) { - QualQuantizer.QualInterval merged = cfg.left.merge(cfg.right); - Assert.assertEquals(merged.nErrors, cfg.exError); - Assert.assertEquals(merged.nObservations, cfg.exTotal); - Assert.assertEquals(merged.getErrorRate(), cfg.exErrorRate); - Assert.assertEquals(merged.getQual(), cfg.exQual); - } - - @Test - public void testMinInterestingQual() { - for ( int q = 0; q < 15; q++ ) { - for ( int minQual = 0; minQual <= 10; minQual ++ ) { - QualQuantizer qq = new QualQuantizer(minQual); - QualQuantizer.QualInterval left = qq.new QualInterval(q, q, 100, 10, 0); - QualQuantizer.QualInterval right = qq.new QualInterval(q+1, q+1, 1000, 100, 0); - - QualQuantizer.QualInterval merged = left.merge(right); - boolean shouldBeFree = q+1 <= minQual; - if ( shouldBeFree ) - Assert.assertEquals(merged.getPenalty(), 0.0); - else - Assert.assertTrue(merged.getPenalty() > 0.0); - } - } - } - - - // -------------------------------------------------------------------------------- - // - // High-level case Provider - // - // -------------------------------------------------------------------------------- - - private class QuantizerTestProvider extends TestDataProvider { - final List nObservationsPerQual = new ArrayList(); - final int nLevels; - final List expectedMap; - - private QuantizerTestProvider(final List nObservationsPerQual, final int nLevels, final List expectedMap) { - super(QuantizerTestProvider.class); - - for ( int x : nObservationsPerQual ) - this.nObservationsPerQual.add((long)x); - this.nLevels = nLevels; - this.expectedMap = expectedMap; - } - - @Override - public String toString() { - return String.format("QQTest nLevels=%d nObs=[%s] map=[%s]", - nLevels, Utils.join(",", nObservationsPerQual), Utils.join(",", expectedMap)); - } - } - - @DataProvider(name = "QuantizerTestProvider") - public Object[][] makeQuantizerTestProvider() { - List allQ2 = Arrays.asList(0, 
0, 1000, 0, 0); - - new QuantizerTestProvider(allQ2, 5, Arrays.asList(0, 1, 2, 3, 4)); - new QuantizerTestProvider(allQ2, 1, Arrays.asList(2, 2, 2, 2, 2)); - - new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 0, 1000), 2, Arrays.asList(2, 2, 2, 2, 4)); - new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 1, 1000), 2, Arrays.asList(2, 2, 2, 4, 4)); - new QuantizerTestProvider(Arrays.asList(0, 0, 1000, 10, 1000), 2, Arrays.asList(2, 2, 2, 2, 4)); - - return QuantizerTestProvider.getTests(QuantizerTestProvider.class); - } - - @Test(dataProvider = "QuantizerTestProvider", enabled = true) - public void testQuantizer(QuantizerTestProvider cfg) { - QualQuantizer qq = new QualQuantizer(cfg.nObservationsPerQual, cfg.nLevels, 0); - logger.warn("cfg: " + cfg); - for ( int i = 0; i < cfg.expectedMap.size(); i++) { - int expected = cfg.expectedMap.get(i); - int observed = qq.originalToQuantizedMap.get(i); - //logger.warn(String.format(" qq map: %s : %d => %d", i, expected, observed)); - Assert.assertEquals(observed, expected); - } - } -} \ No newline at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariatesUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariatesUnitTest.java deleted file mode 100644 index b765e4d5b..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadCovariatesUnitTest.java +++ /dev/null @@ -1,148 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. 
LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.util.Random; - -/** - * @author carneiro - * @since 4/21/12 - */ -public class ReadCovariatesUnitTest { - - @BeforeMethod - public void init() { - ReadCovariates.clearKeysCache(); - } - - @Test(enabled = false) - public void testCovariateGeneration() { - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - final String RGID = "id"; - - ReadGroupCovariate rgCov = new ReadGroupCovariate(); - QualityScoreCovariate qsCov = new QualityScoreCovariate(); - ContextCovariate coCov = new ContextCovariate(); - CycleCovariate cyCov = new CycleCovariate(); - - rgCov.initialize(RAC); - qsCov.initialize(RAC); - coCov.initialize(RAC); - cyCov.initialize(RAC); - - Covariate[] requestedCovariates = new Covariate[4]; - requestedCovariates[0] = rgCov; - requestedCovariates[1] = qsCov; - requestedCovariates[2] = coCov; - requestedCovariates[3] = cyCov; - - final int NUM_READS = 100; - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - final String[] readGroups = {"RG1", "RG2", "RGbla"}; - for (int idx = 0; idx < NUM_READS; idx++) { - for (final String rgs : readGroups) { - final int length = 10 + rnd.nextInt(100); // random read length, at least 10 bp long - final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(rgs); - rg.setPlatform("illumina"); - read.setReadGroup(rg); - 
read.setReadNegativeStrandFlag(rnd.nextBoolean()); - final byte[] mQuals = read.getBaseQualities(EventType.BASE_SUBSTITUTION); - final byte[] iQuals = read.getBaseQualities(EventType.BASE_INSERTION); - final byte[] dQuals = read.getBaseQualities(EventType.BASE_DELETION); - ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); - - // check that the length is correct - Assert.assertEquals(rc.getMismatchesKeySet().length, length); - Assert.assertEquals(rc.getInsertionsKeySet().length, length); - Assert.assertEquals(rc.getDeletionsKeySet().length, length); - - for (int i = 0; i < length; i++) { - // check that read group is always the same - Assert.assertEquals(rgCov.formatKey(rc.getMismatchesKeySet(i)[0]), rgs); - Assert.assertEquals(rgCov.formatKey(rc.getInsertionsKeySet(i)[0]), rgs); - Assert.assertEquals(rgCov.formatKey(rc.getDeletionsKeySet(i)[0]), rgs); - - // check quality score - Assert.assertEquals(qsCov.formatKey(rc.getMismatchesKeySet(i)[1]), "" + mQuals[i]); - Assert.assertEquals(qsCov.formatKey(rc.getInsertionsKeySet(i)[1]), "" + iQuals[i]); - Assert.assertEquals(qsCov.formatKey(rc.getDeletionsKeySet(i)[1]), "" + dQuals[i]); - - // check context - Assert.assertEquals(coCov.formatKey(rc.getMismatchesKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.MISMATCHES_CONTEXT_SIZE)); - Assert.assertEquals(coCov.formatKey(rc.getInsertionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); - Assert.assertEquals(coCov.formatKey(rc.getDeletionsKeySet(i)[2]), ContextCovariateUnitTest.expectedContext(read, i, RAC.INDELS_CONTEXT_SIZE)); - - // check cycle - Assert.assertEquals(cyCov.formatKey(rc.getMismatchesKeySet(i)[3]), "" + (i+1)); - Assert.assertEquals(cyCov.formatKey(rc.getInsertionsKeySet(i)[3]), "" + (i+1)); - Assert.assertEquals(cyCov.formatKey(rc.getDeletionsKeySet(i)[3]), "" + (i+1)); - } - - } - - } - - } - -} diff --git 
a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadGroupCovariateUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadGroupCovariateUnitTest.java deleted file mode 100644 index 47bbf38a4..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/ReadGroupCovariateUnitTest.java +++ /dev/null @@ -1,126 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.ReadGroupCovariate; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -/** - * @author Mauricio Carneiro - * @since 3/1/12 - */ -public class ReadGroupCovariateUnitTest { - ReadGroupCovariate covariate; - RecalibrationArgumentCollection RAC; - - @BeforeClass - public void init() { - RAC = new RecalibrationArgumentCollection(); - covariate = new ReadGroupCovariate(); - covariate.initialize(RAC); - } - - @BeforeMethod - public void initCache() { - ReadCovariates.clearKeysCache(); - } - - @Test(enabled = true) - public void testSingleRecord() { - final String expected = "SAMPLE.1"; - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("MY.ID"); - rg.setPlatformUnit(expected); - runTest(rg, expected, covariate); - } - - @Test(enabled = true) - public void testMissingPlatformUnit() { - final String 
expected = "MY.7"; - GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(expected); - runTest(rg, expected, covariate); - } - - @Test(enabled = true) - public void testForceReadgroup() { - final RecalibrationArgumentCollection forcedRAC = new RecalibrationArgumentCollection(); - forcedRAC.FORCE_READGROUP = "FOO"; - final ReadGroupCovariate forcedCovariate = new ReadGroupCovariate(); - forcedCovariate.initialize(forcedRAC); - - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("NOT_FOO"); - runTest(rg, "FOO", forcedCovariate); - } - - private static void runTest(final GATKSAMReadGroupRecord rg, final String expected, final ReadGroupCovariate covariate) { - GATKSAMRecord read = ReadUtils.createRandomRead(10); - read.setReadGroup(rg); - ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); - covariate.recordValues(read, readCovariates); - verifyCovariateArray(readCovariates.getMismatchesKeySet(), expected, covariate); - - } - - private static void verifyCovariateArray(final int[][] values, final String expected, final ReadGroupCovariate covariate) { - for (int[] value : values) { - String actual = covariate.formatKey(value[0]); - Assert.assertEquals(actual, expected); - } - } - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumUnitTest.java deleted file mode 100644 index 3c3842f70..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalDatumUnitTest.java +++ /dev/null @@ -1,313 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - - -// the imports for unit testing. 
- - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.Collections; - - -public class RecalDatumUnitTest extends BaseTest { - - // -------------------------------------------------------------------------------- - // - // merge case Provider - // - // -------------------------------------------------------------------------------- - - private class RecalDatumTestProvider extends TestDataProvider { - int exError, exTotal, reportedQual; - - private RecalDatumTestProvider(int E, int N, int reportedQual) { - super(RecalDatumTestProvider.class); - - this.exError = E; - this.exTotal = N; - this.reportedQual = reportedQual; - } - - public double getErrorRate() { - return (exError + 1) / (1.0 * (exTotal + 2)); - } - - public double getErrorRatePhredScaled() { - return QualityUtils.phredScaleErrorRate(getErrorRate()); - } - - public int getReportedQual() { - return reportedQual; - } - - public RecalDatum makeRecalDatum() { - return new RecalDatum((long)exTotal, (double)exError, (byte)getReportedQual()); - } - - @Override - public String toString() { - return String.format("exError=%d, exTotal=%d, reportedQual=%d", exError, exTotal, reportedQual); - } - } - - private static boolean createdDatumTestProviders = false; - - @DataProvider(name = "RecalDatumTestProvider") - public Object[][] makeRecalDatumTestProvider() { - if ( !createdDatumTestProviders ) { - for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) ) - for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) ) - for ( int reportedQual : Arrays.asList(10, 20) ) - if ( E <= N ) - new RecalDatumTestProvider(E, N, reportedQual); - createdDatumTestProviders = true; - } - - return RecalDatumTestProvider.getTests(RecalDatumTestProvider.class); - } - - 
@Test(dataProvider = "RecalDatumTestProvider") - public void testRecalDatumBasics(RecalDatumTestProvider cfg) { - final RecalDatum datum = cfg.makeRecalDatum(); - assertBasicFeaturesOfRecalDatum(datum, cfg); - } - - private static void assertBasicFeaturesOfRecalDatum(final RecalDatum datum, final RecalDatumTestProvider cfg) { - Assert.assertEquals(datum.getNumMismatches(), cfg.exError, 1E-6); - Assert.assertEquals(datum.getNumObservations(), cfg.exTotal, 1E-6); - if ( cfg.getReportedQual() != -1 ) - Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual()); - BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalErrorRate(), cfg.getErrorRate()); - - final double e = datum.getEmpiricalQuality(); - Assert.assertTrue(datum.getEmpiricalQualityAsByte() >= Math.floor(e)); - Assert.assertTrue(datum.getEmpiricalQualityAsByte() <= Math.ceil(e)); - Assert.assertNotNull(datum.toString()); - } - - @Test(dataProvider = "RecalDatumTestProvider") - public void testRecalDatumCopyAndCombine(RecalDatumTestProvider cfg) { - final RecalDatum datum = cfg.makeRecalDatum(); - final RecalDatum copy = new RecalDatum(datum); - assertBasicFeaturesOfRecalDatum(copy, cfg); - - RecalDatumTestProvider combinedCfg = new RecalDatumTestProvider(cfg.exError * 2, cfg.exTotal * 2, cfg.reportedQual); - copy.combine(datum); - assertBasicFeaturesOfRecalDatum(copy, combinedCfg); - } - - @Test(dataProvider = "RecalDatumTestProvider") - public void testRecalDatumModification(RecalDatumTestProvider cfg) { - RecalDatum datum = cfg.makeRecalDatum(); - datum.setEmpiricalQuality(10.1); - Assert.assertEquals(datum.getEmpiricalQuality(), 10.1); - - datum.setEstimatedQReported(10.1); - Assert.assertEquals(datum.getEstimatedQReported(), 10.1); - Assert.assertEquals(datum.getEstimatedQReportedAsByte(), 10); - - datum = cfg.makeRecalDatum(); - cfg.exTotal = 100000; - datum.setNumObservations(cfg.exTotal); - assertBasicFeaturesOfRecalDatum(datum, cfg); - - datum = cfg.makeRecalDatum(); - cfg.exError 
= 1000; - datum.setNumMismatches(cfg.exError); - assertBasicFeaturesOfRecalDatum(datum, cfg); - - datum = cfg.makeRecalDatum(); - datum.increment(true); - cfg.exError++; - cfg.exTotal++; - assertBasicFeaturesOfRecalDatum(datum, cfg); - - datum = cfg.makeRecalDatum(); - datum.increment(false); - cfg.exTotal++; - assertBasicFeaturesOfRecalDatum(datum, cfg); - - datum = cfg.makeRecalDatum(); - datum.incrementNumObservations(2); - cfg.exTotal += 2; - assertBasicFeaturesOfRecalDatum(datum, cfg); - - datum = cfg.makeRecalDatum(); - datum.incrementNumMismatches(2); - cfg.exError += 2; - assertBasicFeaturesOfRecalDatum(datum, cfg); - - - datum = cfg.makeRecalDatum(); - datum.increment(10, 5); - cfg.exError += 5; - cfg.exTotal += 10; - assertBasicFeaturesOfRecalDatum(datum, cfg); - } - - @Test - public void testNoObs() { - final RecalDatum rd = new RecalDatum(0L, 0.0, (byte)10); - Assert.assertEquals(rd.getEmpiricalErrorRate(), 0.0); - } - - @Test - public void testlog10QempPrior() { - for ( int Qemp = 0; Qemp <= QualityUtils.MAX_SAM_QUAL_SCORE; Qemp++ ) { - for ( int Qrep = 0; Qrep <= QualityUtils.MAX_SAM_QUAL_SCORE; Qrep++ ) { - final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); - Assert.assertTrue(log10prior < 0.0); - Assert.assertFalse(Double.isInfinite(log10prior)); - Assert.assertFalse(Double.isNaN(log10prior)); - } - } - - final int Qrep = 20; - int maxQemp = -1; - double maxQempValue = -Double.MAX_VALUE; - for ( int Qemp = 0; Qemp <= QualityUtils.MAX_SAM_QUAL_SCORE; Qemp++ ) { - final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); - if ( log10prior > maxQempValue ) { - maxQemp = Qemp; - maxQempValue = log10prior; - } - } - Assert.assertEquals(maxQemp, Qrep); - } - - @Test - public void testBayesianEstimateOfEmpiricalQuality() { - - final int Qrep = 20; - - // test no shift - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(0, 0, Qrep), (double)Qrep); - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 0, 
Qrep), (double)Qrep); - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 10, Qrep), (double)Qrep); - - // test small shift - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 10, Qrep), Qrep - 1.0); - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 0, Qrep), Qrep + 1.0); - - // test medium shift - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 0, Qrep), Qrep + 3.0); - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 10, Qrep), Qrep + 3.0); - - // test large shift - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(100000, 10, Qrep), Qrep + 8.0); - Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000000, 10, Qrep), Qrep + 16.0); - } - - @Test - public void testlog10QempLikelihood() { - - final double[] Qemps = new double[] { 0.0, 10.0, 20.0, 30.0 }; - final int[] observations = new int[] {0, 10, 1000, 1000000}; - final int[] errors = new int[] {0, 10, 1000, 1000000}; - - for ( double Qemp : Qemps ) { - for ( int observation : observations ) { - for ( int error : errors ) { - if ( error > observation ) - continue; - - final double log10likelihood = RecalDatum.log10QempLikelihood(Qemp, observation, error); - Assert.assertTrue(observation == 0 ? 
MathUtils.compareDoubles(log10likelihood, 0.0) == 0 : log10likelihood < 0.0); - Assert.assertFalse(Double.isInfinite(log10likelihood)); - Assert.assertFalse(Double.isNaN(log10likelihood)); - } - } - } - - long bigNum = new Long((long)Integer.MAX_VALUE); - bigNum *= 2L; - final double log10likelihood = RecalDatum.log10QempLikelihood(30, bigNum, 100000); - Assert.assertTrue(log10likelihood < 0.0); - Assert.assertFalse(Double.isInfinite(log10likelihood)); - Assert.assertFalse(Double.isNaN(log10likelihood)); - } - - @Test - public void basicHierarchicalBayesianQualityEstimateTest() { - - for( double epsilon = 15.0; epsilon <= 60.0; epsilon += 2.0 ) { - double RG_Q = 45.0; - RecalDatum RG = new RecalDatum( (long)100000000, (long) (100000000 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); - double Q = 30.0; - RecalDatum QS = new RecalDatum( (long)100000000, (long) (100000000 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); - RecalDatum COV = new RecalDatum( (long)15, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality - - // initial epsilon condition shouldn't matter when there are a lot of observations - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( epsilon, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); - } - - for( double epsilon = 15.0; epsilon <= 60.0; epsilon += 2.0 ) { - double RG_Q = 45.0; - RecalDatum RG = new RecalDatum( (long)10, (long) (10 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); - double Q = 30.0; - RecalDatum QS = new RecalDatum( (long)10, (long) (10 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); - RecalDatum COV = new RecalDatum( (long)15, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality - - // initial epsilon condition dominates when there is no data - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( epsilon, RG, QS, Collections.singletonList(COV)), epsilon, 1E-4 ); - } - - } -} \ No newline 
at end of file diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalUtilsUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalUtilsUnitTest.java deleted file mode 100644 index 7fca0be93..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalUtilsUnitTest.java +++ /dev/null @@ -1,178 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - -public final class RecalUtilsUnitTest extends BaseTest { - private class Row { - int rg, qual, ne, no; - - private Row(final Row copy) { - this(copy.rg, copy.qual, copy.ne, copy.no); - } - - private Row(int rg, int qual, int ne, int no) { - this.rg = rg; - this.qual = qual; - this.ne = ne; - this.no = no; - } - - @Override - public String toString() { - return "Row{" + - "" + rg + - ", " + qual + - ", " + ne + - ", " + no + - '}'; - } - } - - @DataProvider(name = "CombineTablesProvider") - public Object[][] createCombineTablesProvider() { - List tests = new ArrayList(); - - final List rows = new ArrayList(); - for ( final int rg : Arrays.asList(0, 1) ) { - for ( final int qual : Arrays.asList(0, 1) ) { - rows.add(new Row(rg, qual, 1, 10)); - } - } - - logger.warn("Number of rows " + rows.size()); - - List> permutations = new LinkedList>(); - 
permutations.addAll(Utils.makePermutations(rows, 1, false)); - permutations.addAll(Utils.makePermutations(rows, 2, false)); - permutations.addAll(Utils.makePermutations(rows, 3, false)); - - // adding 1 row to 2 - for ( final List table1 : permutations ) { - for ( final Row table2 : rows ) { - tests.add(new Object[]{table1, Arrays.asList(table2)}); - } - } - - // adding 2 rows to 1 - for ( final List table1 : permutations ) { - for ( final Row table2 : rows ) { - tests.add(new Object[]{Arrays.asList(table2), table1}); - } - } - - for ( final List table1 : permutations ) { - for ( final List table2 : permutations ) { - tests.add(new Object[]{table1, table2}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "CombineTablesProvider") - public void testCombineTables(final List table1, final List table2) { - final NestedIntegerArray nia1 = makeTable(table1); - final NestedIntegerArray nia2 = makeTable(table2); - final List expectedRows = makeExpected(table1, table2); - final NestedIntegerArray expected = makeTable(expectedRows); - RecalUtils.combineTables(nia1, nia2); - - Assert.assertEquals(nia1.getDimensions(), expected.getDimensions()); - Assert.assertEquals(nia1.getAllValues().size(), expected.getAllValues().size()); - - for ( final NestedIntegerArray.Leaf leaf : expected.getAllLeaves() ) { - final RecalDatum actual = nia1.get(leaf.keys); - Assert.assertEquals(actual.getNumMismatches(), leaf.value.getNumMismatches()); - Assert.assertEquals(actual.getNumObservations(), leaf.value.getNumObservations()); - } - } - - public List makeExpected(final List table1, final List table2) { - final List combined = new LinkedList(); - for ( final Row t1 : table1 ) combined.add(new Row(t1)); - for ( final Row t2 : table2 ) { - combine(combined, t2); - } - return combined; - } - - private void combine(final List combined, final Row row) { - for ( final Row c : combined ) { - if ( c.rg == row.rg && c.qual == row.qual ) { - c.ne += row.ne; - c.no += 
row.no; - return; - } - } - - combined.add(new Row(row)); - } - - public NestedIntegerArray makeTable(final List rows) { - final NestedIntegerArray x = new NestedIntegerArray(3, 3); - for ( final Row r : rows ) - x.put(new RecalDatum((long)r.no, (double)r.ne, (byte)10), r.rg, r.qual); - return x; - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReportUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReportUnitTest.java deleted file mode 100644 index e38ce4687..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationReportUnitTest.java +++ /dev/null @@ -1,176 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. 
-* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * @author carneiro - * @since 4/21/12 - */ -public class RecalibrationReportUnitTest { - @BeforeMethod - public void init() { - ReadCovariates.clearKeysCache(); - } - - private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { - final Random random = new Random(); - final int nObservations = random.nextInt(maxObservations); - final int nErrors = Math.min(random.nextInt(maxErrors), nObservations); - final int qual = random.nextInt(QualityUtils.MAX_SAM_QUAL_SCORE); - return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual); - } - - @Test - public void testOutput() { - final int length = 100; - - List 
quals = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); - List counts = new ArrayList(QualityUtils.MAX_SAM_QUAL_SCORE + 1); - - for (int i = 0; i<= QualityUtils.MAX_SAM_QUAL_SCORE; i++) { - quals.add((byte) i); - counts.add(1L); - } - - final QuantizationInfo quantizationInfo = new QuantizationInfo(quals, counts); - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - - quantizationInfo.noQuantization(); - final List requiredCovariates = new LinkedList(); - final List optionalCovariates = new LinkedList(); - - final ReadGroupCovariate rgCovariate = new ReadGroupCovariate(); - rgCovariate.initialize(RAC); - requiredCovariates.add(rgCovariate); - - final QualityScoreCovariate qsCovariate = new QualityScoreCovariate(); - qsCovariate.initialize(RAC); - requiredCovariates.add(qsCovariate); - - final ContextCovariate cxCovariate = new ContextCovariate(); - cxCovariate.initialize(RAC); - optionalCovariates.add(cxCovariate); - final CycleCovariate cyCovariate = new CycleCovariate(); - cyCovariate.initialize(RAC); - optionalCovariates.add(cyCovariate); - - final Covariate[] requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; - int covariateIndex = 0; - for (final Covariate cov : requiredCovariates) - requestedCovariates[covariateIndex++] = cov; - for (final Covariate cov : optionalCovariates) - requestedCovariates[covariateIndex++] = cov; - - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("id"); - rg.setPlatform("illumina"); - final GATKSAMRecord read = ReadUtils.createRandomRead(length, false); - read.setReadGroup(rg); - final byte [] readQuals = new byte[length]; - for (int i = 0; i < length; i++) - readQuals[i] = 20; - read.setBaseQualities(readQuals); - - final int expectedKeys = expectedNumberOfKeys(length, RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); - int nKeys = 0; // keep track of how many keys were produced - final ReadCovariates rc = 
RecalUtils.computeCovariates(read, requestedCovariates); - - final RecalibrationTables recalibrationTables = new RecalibrationTables(requestedCovariates); - final NestedIntegerArray rgTable = recalibrationTables.getReadGroupTable(); - final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); - - for (int offset = 0; offset < length; offset++) { - - for (EventType errorMode : EventType.values()) { - - final int[] covariates = rc.getKeySet(offset, errorMode); - final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; - - rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.ordinal()); - qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.ordinal()); - nKeys += 2; - for (int j = 0; j < optionalCovariates.size(); j++) { - final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j); - final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j]; - if ( covValue >= 0 ) { - covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); - nKeys++; - } - } - } - } - Assert.assertEquals(nKeys, expectedKeys); - } - - private static int expectedNumberOfKeys (int readLength, int indelContextSize, int mismatchesContextSize) { - final int numCovariates = 4; - final int numTables = 3; - final int mismatchContextPadding = mismatchesContextSize - 1; - final int indelContextPadding = 2 * (indelContextSize - 1); - final int indelCyclePadding = 2 * (2 * CycleCovariate.CUSHION_FOR_INDELS); - - return (numCovariates * numTables * readLength) - mismatchContextPadding - indelContextPadding - indelCyclePadding; - } - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTablesUnitTest.java 
b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTablesUnitTest.java deleted file mode 100644 index bed21cba1..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTablesUnitTest.java +++ /dev/null @@ -1,202 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. 
Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.collections.NestedIntegerArray; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.List; - -public final class RecalibrationTablesUnitTest extends BaseTest { - private RecalibrationTables tables; - private Covariate[] covariates; - private int numReadGroups = 6; - final byte qualByte = 1; - final List combineStates = Arrays.asList(0, 1, 2); - - @BeforeMethod - private void makeTables() { - covariates = RecalibrationTestUtils.makeInitializedStandardCovariates(); - tables = new RecalibrationTables(covariates, numReadGroups); - fillTable(tables); - } - - private void fillTable(final RecalibrationTables tables) { - for ( int iterations = 0; iterations < 10; iterations++ ) { - for ( final EventType et : EventType.values() ) { - for ( final int rg : combineStates) { - final double error = rg % 2 == 0 ? 
1 : 0; - RecalUtils.incrementDatumOrPutIfNecessary(tables.getReadGroupTable(), qualByte, error, rg, et.ordinal()); - for ( final int qual : combineStates) { - RecalUtils.incrementDatumOrPutIfNecessary(tables.getQualityScoreTable(), qualByte, error, rg, qual, et.ordinal()); - for ( final int cycle : combineStates) - RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(2), qualByte, error, rg, qual, cycle, et.ordinal()); - for ( final int context : combineStates) - RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(3), qualByte, error, rg, qual, context, et.ordinal()); - } - } - } - } - } - - @Test - public void basicTest() { - final Covariate qualCov = covariates[1]; - final Covariate cycleCov = covariates[2]; - final Covariate contextCov = covariates[3]; - - Assert.assertEquals(tables.numTables(), covariates.length); - - Assert.assertNotNull(tables.getReadGroupTable()); - Assert.assertEquals(tables.getReadGroupTable(), tables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal())); - testDimensions(tables.getReadGroupTable(), numReadGroups); - - Assert.assertNotNull(tables.getQualityScoreTable()); - Assert.assertEquals(tables.getQualityScoreTable(), tables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal())); - testDimensions(tables.getQualityScoreTable(), numReadGroups, qualCov.maximumKeyValue() + 1); - - Assert.assertNotNull(tables.getTable(2)); - testDimensions(tables.getTable(2), numReadGroups, qualCov.maximumKeyValue() + 1, cycleCov.maximumKeyValue() + 1); - - Assert.assertNotNull(tables.getTable(3)); - testDimensions(tables.getTable(3), numReadGroups, qualCov.maximumKeyValue() + 1, contextCov.maximumKeyValue() + 1); - } - - private void testDimensions(final NestedIntegerArray table, final int ... 
dimensions) { - final int[] dim = new int[dimensions.length+1]; - System.arraycopy(dimensions, 0, dim, 0, dimensions.length); - dim[dimensions.length] = EventType.values().length; - Assert.assertEquals(table.getDimensions().length, dim.length); - - for ( int i = 0; i < dim.length; i++ ) { - Assert.assertEquals(table.getDimensions()[i], dim[i], "Table dimensions not expected at dim " + i); - } - } - - @Test - public void basicMakeQualityScoreTable() { - final Covariate qualCov = covariates[1]; - final NestedIntegerArray copy = tables.makeQualityScoreTable(); - testDimensions(copy, numReadGroups, qualCov.maximumKeyValue()+1); - Assert.assertEquals(copy.getAllValues().size(), 0); - } - - @Test - public void testCombine1() { - final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); - fillTable(merged); - - merged.combine(tables); - - for ( int i = 0; i < tables.numTables(); i++ ) { - NestedIntegerArray table = tables.getTable(i); - NestedIntegerArray mergedTable = merged.getTable(i); - - Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); - for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { - final RecalDatum mergedValue = mergedTable.get(leaf.keys); - Assert.assertNotNull(mergedValue); - Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations() * 2); - Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches() * 2); - } - } - } - - @Test - public void testCombineEmptyOther() { - final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); - - merged.combine(tables); - - for ( int i = 0; i < tables.numTables(); i++ ) { - NestedIntegerArray table = tables.getTable(i); - NestedIntegerArray mergedTable = merged.getTable(i); - - Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); - for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { - final RecalDatum mergedValue = 
mergedTable.get(leaf.keys); - Assert.assertNotNull(mergedValue); - Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations()); - Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches()); - } - } - } - - @Test - public void testCombinePartial() { - final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); - for ( final int rg : combineStates) { - RecalUtils.incrementDatumOrPutIfNecessary(merged.getTable(3), qualByte, 1, rg, 0, 0, 0); - } - - merged.combine(tables); - for ( int i = 0; i < tables.numTables(); i++ ) { - NestedIntegerArray table = tables.getTable(i); - NestedIntegerArray mergedTable = merged.getTable(i); - - Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); - for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { - final RecalDatum mergedValue = mergedTable.get(leaf.keys); - Assert.assertNotNull(mergedValue); - - final int delta = i == 3 && leaf.keys[1] == 0 && leaf.keys[2] == 0 && leaf.keys[3] == 0 ? 1 : 0; - Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations() + delta); - Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches() + delta); - } - } - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTestUtils.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTestUtils.java deleted file mode 100644 index 306648ca3..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RecalibrationTestUtils.java +++ /dev/null @@ -1,75 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; - -/** - * Created with IntelliJ IDEA. - * User: depristo - * Date: 12/23/12 - * Time: 1:06 PM - * To change this template use File | Settings | File Templates. 
- */ -public class RecalibrationTestUtils { - public static Covariate[] makeInitializedStandardCovariates() { - final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - final Covariate[] covariates = new Covariate[4]; - covariates[0] = new ReadGroupCovariate(); - covariates[1] = new QualityScoreCovariate(); - covariates[2] = new ContextCovariate(); - covariates[3] = new CycleCovariate(); - for ( Covariate cov : covariates ) cov.initialize(RAC); - return covariates; - } -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RepeatCovariatesUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RepeatCovariatesUnitTest.java deleted file mode 100644 index 877f4e911..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/recalibration/RepeatCovariatesUnitTest.java +++ /dev/null @@ -1,250 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. 
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. 
ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. 
Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import com.google.java.contract.Requires; -import org.broadinstitute.gatk.tools.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.gatk.utils.recalibration.covariates.*; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Random; - -public class RepeatCovariatesUnitTest { - - RepeatLengthCovariate rlCovariate; - RepeatUnitCovariate ruCovariate; - RepeatUnitAndLengthCovariate rurlCovariate; - RecalibrationArgumentCollection RAC; - - - - @BeforeClass - public void init() { - RAC = new 
RecalibrationArgumentCollection(); - rlCovariate = new RepeatLengthCovariate(); - ruCovariate = new RepeatUnitCovariate(); - rurlCovariate = new RepeatUnitAndLengthCovariate(); - rlCovariate.initialize(RAC); - ruCovariate.initialize(RAC); - rurlCovariate.initialize(RAC); - } - - @BeforeMethod - public void initCache() { - ReadCovariates.clearKeysCache(); - } - - - @Test - public void testFindNumberOfRepetitions() { - // First, test logic to compute number of repetitions of a substring on a given string. - int result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), true); - Assert.assertEquals(2,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), true); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), true); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), true); - Assert.assertEquals(1,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), true); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), true); - Assert.assertEquals(0,result); - // Same tests but looking backward on string - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACAC".getBytes(), false); - Assert.assertEquals(2,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACAC".getBytes(), false); - 
Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "ACACACACGT".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("AC".getBytes(), "GTACACACAC".getBytes(), false); - Assert.assertEquals(4,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGT".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GCAGCAGTAGGGTGTACACACAC".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCAGCA".getBytes(), "GTAGGGTGTACACACACGCAGCAT".getBytes(), false); - Assert.assertEquals(0,result); - result = GATKVariantContextUtils.findNumberOfRepetitions("GCA".getBytes(), "GTAGGGTGTACACACACGCAGCAGCA".getBytes(), false); - Assert.assertEquals(3,result); - - // test logic to get repeat unit and number of repeats from covariate value - final String[] repUnits = new String[]{"AG","CCG","TCCA","T"}; - for (String ru : repUnits) { - for (int k=1; k < 10; k++) { - Pair pair = RepeatLengthCovariate.getRUandNRfromCovariate(String.format("%s%d",ru,k)); - Assert.assertEquals(pair.second.intValue(),k); - Assert.assertEquals(pair.first,ru); - } - } - - } - - /** - * Build synthetic reads with random content made up of tandem repeats, record computed Repeat Unit and # repeats and see if - * they match with read context - */ - @Test - public void testManyObservations() { - final int NUM_UNITS = 10; - final int MAX_REPEAT_UNIT_LENGTH = RAC.MAX_STR_UNIT_LENGTH; - final int MAX_NUM_REPETITIONS = RAC.MAX_REPEAT_LENGTH; - final int NUM_TEST_CASES = 100; - - Random random = new Random(); - - for (int r = 0; r < NUM_TEST_CASES; r++) { - final StringBuilder sb = new StringBuilder(); - // for each unit, generate a repeat unit at random with given random length - final ArrayList repeatUnits = new ArrayList(); - final 
ArrayList numsRepetitions = new ArrayList(); - for (int n=0; n < NUM_UNITS; n++) { - final int repLength = 1+random.nextInt(MAX_REPEAT_UNIT_LENGTH); - final String repeatUnit = getRandomBases(repLength); - final int numRepetitions = 1+random.nextInt(MAX_NUM_REPETITIONS); - - // log for comparison with covariate - numsRepetitions.add(numRepetitions); - repeatUnits.add(repeatUnit); - - for (int k=0; k < numRepetitions; k++) - sb.append(repeatUnit); - - } - - final String readBases = sb.toString(); - System.out.println(readBases); - final int readLength = readBases.length(); - - final byte[] readQuals = new byte[readLength]; - Arrays.fill(readQuals,(byte)30); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(readBases.getBytes(),readQuals,readLength+"M"); - - Covariate[] requestedCovariates = new Covariate[3]; - requestedCovariates[0] = rlCovariate; - requestedCovariates[1] = ruCovariate; - requestedCovariates[2] = rurlCovariate; - ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); - - // check that the length is correct - Assert.assertEquals(rc.getMismatchesKeySet().length, readLength); - Assert.assertEquals(rc.getInsertionsKeySet().length, readLength); - Assert.assertEquals(rc.getDeletionsKeySet().length, readLength); - - for (int offset = 0; offset < readBases.length(); offset++) { // recalibrate all bases in the read - // check RepeatLength - final String rlValM = rlCovariate.formatKey(rc.getMismatchesKeySet(offset)[0]); - final String rlValI = rlCovariate.formatKey(rc.getInsertionsKeySet(offset)[0]); - final String rlValD = rlCovariate.formatKey(rc.getDeletionsKeySet(offset)[0]); - // check RepeatUnit - final String ruValM = ruCovariate.formatKey(rc.getMismatchesKeySet(offset)[1]); - final String ruValI = ruCovariate.formatKey(rc.getInsertionsKeySet(offset)[1]); - final String ruValD = ruCovariate.formatKey(rc.getDeletionsKeySet(offset)[1]); - // check RepeatUnitAndLength - final String rurlValM = 
rurlCovariate.formatKey(rc.getMismatchesKeySet(offset)[2]); - final String rurlValI = rurlCovariate.formatKey(rc.getInsertionsKeySet(offset)[2]); - final String rurlValD = rurlCovariate.formatKey(rc.getDeletionsKeySet(offset)[2]); - // check all 3 values are identical - Assert.assertEquals(rlValD,rlValI); - Assert.assertEquals(rlValM,rlValI); - Assert.assertEquals(ruValD,ruValI); - Assert.assertEquals(ruValM,ruValI); - Assert.assertEquals(rurlValD,rurlValI); - Assert.assertEquals(rurlValM,rurlValI); - - - int fw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(offset + 1, readLength).getBytes(), true); - int bw = GATKVariantContextUtils.findNumberOfRepetitions(ruValM.getBytes(), readBases.substring(0, offset + 1).getBytes(), false); - Assert.assertEquals(Math.min(fw+bw,RAC.MAX_REPEAT_LENGTH),(int)Integer.valueOf(rlValM)); - } - - } - - - - - - - } - - /** - * Returns random bases of given length - * @param length required length - * @return given random string - */ - @Requires("length > 0") - String getRandomBases(final int length) { - byte[] bases = new byte[length]; - Random ran = new Random(); - for (int i=0; i < length; i++ ) { - int idx = ran.nextInt(4); - bases[i] = BaseUtils.baseIndexToSimpleBase(idx); - } - return new String(bases); - } - - -} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/variant/VariantContextMergerUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/variant/VariantContextMergerUnitTest.java deleted file mode 100644 index eaa76becb..000000000 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/utils/variant/VariantContextMergerUnitTest.java +++ /dev/null @@ -1,279 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE -* SOFTWARE LICENSE AGREEMENT -* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad 
Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. 
LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. PHONE-HOME FEATURE -* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* -* 4. 
OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012-2014 Broad Institute, Inc. -* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 5. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 6. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. 
BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 7. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 8. MISCELLANEOUS -* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 8.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.gatk.utils.variant; - -import htsjdk.variant.variantcontext.*; -import org.broadinstitute.gatk.utils.*; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * Tests {@link org.broadinstitute.gatk.utils.variant.ReferenceConfidenceVariantContextMerger}. - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class VariantContextMergerUnitTest extends BaseTest { - Allele Aref, T, C, G, Cref, ATC, ATCATC; - Allele ATCATCT; - Allele ATref; - Allele Anoref; - Allele GT; - - private GenomeLocParser genomeLocParser; - - @BeforeSuite - public void setup() throws IOException { - // alleles - Aref = Allele.create("A", true); - Cref = Allele.create("C", true); - T = Allele.create("T"); - C = Allele.create("C"); - G = Allele.create("G"); - ATC = Allele.create("ATC"); - ATCATC = Allele.create("ATCATC"); - ATCATCT = Allele.create("ATCATCT"); - ATref = Allele.create("AT",true); - Anoref = Allele.create("A",false); - GT = Allele.create("GT",false); - genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference))); - } - - @Test(dataProvider = "referenceConfidenceMergeData") - public void testReferenceConfidenceMerge(final String testID, final List toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) { - final VariantContext result = ReferenceConfidenceVariantContextMerger.merge(toMerge, loc, returnSiteEvenIfMonomorphic ? 
(byte) 'A' : null, true); - if ( result == null ) { - Assert.assertTrue(expectedResult == null); - return; - } - Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles(),testID); - Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples(),testID); - for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) { - Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName()); - // use string comparisons to test equality for now - Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); - } - } - - @Test - public void testGenerateADWithNewAlleles() { - - final int[] originalAD = new int[] {1,2,0}; - final int[] indexesOfRelevantAlleles = new int[] {0,1,2,2}; - - final int[] newAD = ReferenceConfidenceVariantContextMerger.generateAD(originalAD, indexesOfRelevantAlleles); - Assert.assertEquals(newAD, new int[]{1,2,0,0}); - } - - - @Test(expectedExceptions = UserException.class) - public void testGetIndexesOfRelevantAllelesWithNoALT() { - - final List alleles1 = new ArrayList<>(1); - alleles1.add(Allele.create("A", true)); - final List alleles2 = new ArrayList<>(1); - alleles2.add(Allele.create("A", true)); - ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(alleles1, alleles2, -1); - Assert.fail("We should have thrown an exception because the allele was not present"); - } - - @Test(dataProvider = "getIndexesOfRelevantAllelesData") - public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List allAlleles) { - final List myAlleles = new ArrayList<>(3); - - // always add the reference and alleles - myAlleles.add(allAlleles.get(0)); - myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - // optionally add another alternate allele - if ( allelesIndex > 0 ) - myAlleles.add(allAlleles.get(allelesIndex)); - - final int[] indexes = 
ReferenceConfidenceVariantContextMerger.getIndexesOfRelevantAlleles(myAlleles, allAlleles, -1); - - Assert.assertEquals(indexes.length, allAlleles.size()); - - for ( int i = 0; i < allAlleles.size(); i++ ) { - if ( i == 0 ) - Assert.assertEquals(indexes[i], 0); // ref should always match - else if ( i == allelesIndex ) - Assert.assertEquals(indexes[i], 2); // allele - else - Assert.assertEquals(indexes[i], 1); // - } - } - - - @DataProvider(name = "referenceConfidenceMergeData") - public Object[][] makeReferenceConfidenceMergeData() { - final List tests = new ArrayList<>(); - final int start = 10; - final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); - final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); - final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); - - final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; - final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10}; - - final List noCalls = new ArrayList<>(2); - noCalls.add(Allele.NO_CALL); - noCalls.add(Allele.NO_CALL); - - final List A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).alleles(noCalls).make(); - final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); - final Allele AAref = Allele.create("AA", true); - final List AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).alleles(noCalls).make(); - final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); - final List A_C = Arrays.asList(Aref, C); - final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).alleles(noCalls).make(); - final List A_C_ALT = 
Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); - final List A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); - final List A_C_G = Arrays.asList(Aref, C, G); - final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).alleles(noCalls).make(); - final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).alleles(noCalls).make(); - final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); - final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); - final Allele A = Allele.create("A", false); - final List AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); - final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make(); - final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); - - // first test the case of a single record - tests.add(new Object[]{"test00",Arrays.asList(vcA_C_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()}); - - // 
now, test pairs: - // a SNP with another SNP - tests.add(new Object[]{"test01",Arrays.asList(vcA_C_ALT, vcA_G_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); - // a SNP with an indel - tests.add(new Object[]{"test02",Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make()}); - // a SNP with 2 SNPs - tests.add(new Object[]{"test03",Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); - // a SNP with a ref record - tests.add(new Object[]{"test04",Arrays.asList(vcA_C_ALT, vcA_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); - - // spanning records: - // a SNP with a spanning ref record - tests.add(new Object[]{"test05",Arrays.asList(vcA_C_ALT, vcAA_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); - // a SNP with a spanning deletion - tests.add(new Object[]{"test06",Arrays.asList(vcA_C_ALT, vcAA_A_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).alleles(noCalls).make()).make()}); - - // combination of all - tests.add(new Object[]{"test07",Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), - loc, false, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).alleles(noCalls).make(), - new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).alleles(noCalls).make(), - new GenotypeBuilder("A_ATC").PL(new 
int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).alleles(noCalls).make(), - new GenotypeBuilder("A_C_G").PL(new int[]{40,20,30,20,10,30,71,72,73,74}).alleles(noCalls).make(), - new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).alleles(noCalls).make(), - new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).alleles(noCalls).make(), - new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).alleles(noCalls).make()).make()}); - - // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts - tests.add(new Object[]{"test08",Arrays.asList(vcAA_ALT), - - loc, false, - null}); - tests.add(new Object[]{"test09", Arrays.asList(vcAA_ALT), - loc, true, - new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).alleles(noCalls).make()).make()}); - - final Object[][] result = tests.toArray(new Object[][]{}); - return result; - } - @DataProvider(name = "getIndexesOfRelevantAllelesData") - public Object[][] makeGetIndexesOfRelevantAllelesData() { - final int totalAlleles = 5; - final List alleles = new ArrayList<>(totalAlleles); - alleles.add(Allele.create("A", true)); - for ( int i = 1; i < totalAlleles; i++ ) - alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); - - final List tests = new ArrayList<>(); - - for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) { - tests.add(new Object[]{alleleIndex, alleles}); - } - - return tests.toArray(new Object[][]{}); - } -} diff --git a/public/external-example/pom.xml b/public/external-example/pom.xml index 5cde74b85..94406680d 100644 --- a/public/external-example/pom.xml +++ b/public/external-example/pom.xml @@ -49,7 +49,15 @@ org.broadinstitute.gatk - gatk-tools-public + gatk-utils + ${gatk.version} + test-jar + test + + + + org.broadinstitute.gatk + gatk-engine ${gatk.version} test-jar test @@ 
-82,7 +90,7 @@ org.broadinstitute.gatk - gatk-engine + gatk-utils ${gatk.version} example-resources tar.bz2 @@ -112,8 +120,7 @@ ${project.build.outputDirectory} org.broadinstitute.gatk - - gatk-tools-public + gatk-utils ${gatk.version} 2g diff --git a/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java b/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java index 1834c4a4a..8dedbdd59 100644 --- a/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java +++ b/public/external-example/src/main/java/org/mycompany/app/MyExampleWalker.java @@ -26,9 +26,9 @@ package org.mycompany.app; import org.broadinstitute.gatk.utils.commandline.Output; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.LocusWalker; import java.io.PrintStream; diff --git a/public/gatk-engine/pom.xml b/public/gatk-engine/pom.xml index 593c1feb4..6d2696c7a 100644 --- a/public/gatk-engine/pom.xml +++ b/public/gatk-engine/pom.xml @@ -24,6 +24,22 @@ gatk-utils ${project.version} + + net.java.dev.jets3t + jets3t + + + org.simpleframework + simple-xml + + + + ${project.groupId} + gatk-utils + ${project.version} + test-jar + test + com.google.caliper @@ -34,16 +50,6 @@ - - org.apache.maven.plugins - maven-assembly-plugin - - - example-resources - ${gatk.generate-resources.phase} - - - org.apache.maven.plugins maven-resources-plugin @@ -54,8 +60,6 @@ - org.apache.maven.plugins maven-invoker-plugin diff --git a/public/gatk-tools-public/src/main/java/htsjdk/samtools/GATKBAMFileSpan.java b/public/gatk-engine/src/main/java/htsjdk/samtools/GATKBAMFileSpan.java 
similarity index 100% rename from public/gatk-tools-public/src/main/java/htsjdk/samtools/GATKBAMFileSpan.java rename to public/gatk-engine/src/main/java/htsjdk/samtools/GATKBAMFileSpan.java diff --git a/public/gatk-tools-public/src/main/java/htsjdk/samtools/GATKBin.java b/public/gatk-engine/src/main/java/htsjdk/samtools/GATKBin.java similarity index 100% rename from public/gatk-tools-public/src/main/java/htsjdk/samtools/GATKBin.java rename to public/gatk-engine/src/main/java/htsjdk/samtools/GATKBin.java diff --git a/public/gatk-tools-public/src/main/java/htsjdk/samtools/GATKChunk.java b/public/gatk-engine/src/main/java/htsjdk/samtools/GATKChunk.java similarity index 100% rename from public/gatk-tools-public/src/main/java/htsjdk/samtools/GATKChunk.java rename to public/gatk-engine/src/main/java/htsjdk/samtools/GATKChunk.java diff --git a/public/gatk-tools-public/src/main/java/htsjdk/samtools/PicardNamespaceUtils.java b/public/gatk-engine/src/main/java/htsjdk/samtools/PicardNamespaceUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/htsjdk/samtools/PicardNamespaceUtils.java rename to public/gatk-engine/src/main/java/htsjdk/samtools/PicardNamespaceUtils.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java new file mode 100644 index 000000000..a2bb4afd9 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java @@ -0,0 +1,229 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons 
to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor; +import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.io.stubs.OutputStreamArgumentTypeDescriptor; +import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterArgumentTypeDescriptor; +import org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor; +import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.engine.crypt.CryptUtils; +import org.broadinstitute.gatk.engine.crypt.GATKKey; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.text.ListFileUtils; + +import java.security.PublicKey; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; + +/** + * @author aaron + */ +public abstract class CommandLineExecutable extends 
CommandLineProgram { + /** + * The actual engine which performs the analysis. + */ + protected GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + + // get the analysis name + public abstract String getAnalysisName(); + + /** + * Gets the GATK argument bundle. + * @return A structure consisting of whatever arguments should be used to initialize the GATK engine. + */ + protected abstract GATKArgumentCollection getArgumentCollection(); + + /** + * A list of all the arguments initially used as sources. + */ + private final Collection argumentSources = new ArrayList(); + + protected static Logger logger = Logger.getLogger(CommandLineExecutable.class); + + /** + * this is the function that the inheriting class can expect to have called + * when the command line system has initialized. + * + * @return the return code to exit the program with + */ + protected int execute() throws Exception { + engine.setParser(parser); + argumentSources.add(this); + + Walker walker = engine.getWalkerByName(getAnalysisName()); + + try { + // Make sure a valid GATK user key is present, if required. + authorizeGATKRun(); + + engine.setArguments(getArgumentCollection()); + + // File lists can require a bit of additional expansion. Set these explicitly by the engine. + final Collection bamFileList=ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser); + engine.setSAMFileIDs(bamFileList); + if(getArgumentCollection().showFullBamList){ + logger.info(String.format("Adding the following input SAM Files: %s",bamFileList.toString())); + } + + engine.setWalker(walker); + walker.setToolkit(engine); + + Collection filters = engine.createFilters(); + engine.setFilters(filters); + + // load the arguments into the walker / filters. + // TODO: The fact that this extra load call exists here when all the parsing happens at the engine + // TODO: level indicates that we're doing something wrong. Turn this around so that the GATK can drive + // TODO: argument processing. 
+ loadArgumentsIntoObject(walker); + argumentSources.add(walker); + + Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); + engine.setReferenceMetaDataFiles(rodBindings); + + for (ReadFilter filter: filters) { + loadArgumentsIntoObject(filter); + argumentSources.add(filter); + } + + engine.execute(); + generateGATKRunReport(walker); + } catch ( Exception e ) { + generateGATKRunReport(walker, e); + throw e; + } + + // always return 0 + return 0; + } + + /** + * Authorizes this run of the GATK by checking for a valid GATK user key, if required. + * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options. + */ + private void authorizeGATKRun() { + if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET || + getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) { + if ( getArgumentCollection().gatkKeyFile == null ) { + throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " + + "Please see " + UserException.PHONE_HOME_DOCS_URL + + " for more information and instructions on how to obtain a key."); + } + else { + PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile); + + if ( ! gatkUserKey.isValid() ) { + throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile); + } + } + } + } + + /** + * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. + * This report will be written to either STDOUT or to the run repository, depending on the options + * for -et. 
+ * + * @param e the exception, can be null if no exception occurred + */ + private void generateGATKRunReport(Walker walker, Exception e) { + if ( getArgumentCollection().phoneHomeType != GATKRunReport.PhoneHomeOption.NO_ET ) { + GATKRunReport report = new GATKRunReport(walker, e, engine, getArgumentCollection().phoneHomeType ); + report.postReport(getArgumentCollection().phoneHomeType); + } + } + + /** + * Convenience method for fully parameterized generateGATKRunReport when an exception has + * not occurred + * + * @param walker + */ + private void generateGATKRunReport(Walker walker) { + generateGATKRunReport(walker, null); + } + + /** + * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. + * @return A collection of type descriptors generating implementation-dependent placeholders. + */ + protected Collection getArgumentTypeDescriptors() { + return Arrays.asList( new VCFWriterArgumentTypeDescriptor(engine,System.out,argumentSources), + new SAMFileWriterArgumentTypeDescriptor(engine,System.out), + new OutputStreamArgumentTypeDescriptor(engine,System.out) ); + } + + /** + * GATK can add arguments dynamically based on analysis type. + * + * @return true + */ + @Override + protected boolean canAddArgumentsDynamically() { + return true; + } + + /** + * GATK provides the walker as an argument source. + * @return List of walkers to load dynamically. + */ + @Override + protected Class[] getArgumentSources() { + // No walker info? No plugins. 
+ if (getAnalysisName() == null) return new Class[] {}; + + Collection argumentSources = new ArrayList(); + + Walker walker = engine.getWalkerByName(getAnalysisName()); + engine.setArguments(getArgumentCollection()); + engine.setWalker(walker); + walker.setToolkit(engine); + argumentSources.add(walker.getClass()); + + Collection filters = engine.createFilters(); + for(ReadFilter filter: filters) + argumentSources.add(filter.getClass()); + + Class[] argumentSourcesAsArray = new Class[argumentSources.size()]; + return argumentSources.toArray(argumentSourcesAsArray); + } + + @Override + protected String getArgumentSourceName( Class argumentSource ) { + return engine.getWalkerName((Class)argumentSource); + } + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java new file mode 100644 index 000000000..afecc2e12 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java @@ -0,0 +1,371 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.samtools.SAMException; +import htsjdk.tribble.TribbleException; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; +import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.utils.refdata.tracks.FeatureManager; +import org.broadinstitute.gatk.engine.walkers.Attribution; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.help.*; +import org.broadinstitute.gatk.utils.text.TextFormattingUtils; + +import java.util.*; + +/** + * All command line parameters accepted by all tools in the GATK. + * + *

Info for general users

+ * + *

This is a list of options and parameters that are generally available to all tools in the GATK.

+ * + *

There may be a few restrictions, which are indicated in individual argument descriptions. For example the -BQSR + * argument is only meant to be used with a subset of tools, and the -pedigree argument will only be effectively used + * by a subset of tools as well. Some arguments conflict with others, and some conversely are dependent on others. This + * is all indicated in the detailed argument descriptions, so be sure to read those in their entirety rather than just + * skimming the one-line summaey in the table.

+ * + *

Info for developers

+ * + *

This class is the GATK engine itself, which manages map/reduce data access and runs walkers.

+ * + *

We run command line GATK programs using this class. It gets the command line args, parses them, and hands the + * gatk all the parsed out information. Pretty much anything dealing with the underlying system should go here; + * the GATK engine should deal with any data related information.

+ */ +@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_ENGINE) +public class CommandLineGATK extends CommandLineExecutable { + /** + * A complete list of tools (sometimes also called walkers because they "walk" through the data to perform analyses) + * is available in the online documentation. + */ + @Argument(fullName = "analysis_type", shortName = "T", doc = "Name of the tool to run") + private String analysisName = null; + + // our argument collection, the collection of command line args we accept + @ArgumentCollection + private GATKArgumentCollection argCollection = new GATKArgumentCollection(); + + /** + * Get pleasing info about the GATK. + * + * @return A list of Strings that contain pleasant info about the GATK. + */ + @Override + protected ApplicationDetails getApplicationDetails() { + return new ApplicationDetails(createApplicationHeader(), + getAttribution(), + ApplicationDetails.createDefaultRunningInstructions(getClass()), + getAdditionalHelp()); + } + + @Override + public String getAnalysisName() { + return analysisName; + } + + @Override + protected GATKArgumentCollection getArgumentCollection() { + return argCollection; + } + + /** + * Required main method implementation. + */ + public static void main(String[] argv) { + try { + CommandLineGATK instance = new CommandLineGATK(); + start(instance, argv); + System.exit(CommandLineProgram.result); // todo -- this is a painful hack + } catch (UserException e) { + exitSystemWithUserError(e); + } catch (TribbleException e) { + // We can generate Tribble Exceptions in weird places when e.g. 
VCF genotype fields are + // lazy loaded, so they aren't caught elsewhere and made into User Exceptions + exitSystemWithUserError(e); + } catch (SAMException e) { + checkForMaskedUserErrors(e); + exitSystemWithSamError(e); + } catch (OutOfMemoryError e) { + exitSystemWithUserError(new UserException.NotEnoughMemory()); + } catch (Throwable t) { + checkForMaskedUserErrors(t); + exitSystemWithError(t); + } + } + + public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; + public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; + public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device"; + public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; + + private static void checkForMaskedUserErrors(final Throwable t) { + // masked out of memory error + if ( t instanceof OutOfMemoryError ) + exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked user error + if ( t instanceof UserException || t instanceof TribbleException ) + exitSystemWithUserError(new UserException(t.getMessage())); + + // no message means no masked error + final String message = t.getMessage(); + if ( message == null ) + return; + + // too many open files error + if ( message.contains("Too many open files") ) + exitSystemWithUserError(new UserException.TooManyOpenFiles()); + + // malformed BAM looks like a SAM file + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) + exitSystemWithSamError(t); + + // can't close tribble index when writing + if ( message.contains("Unable to close index for") ) + exitSystemWithUserError(new UserException(t.getCause() == null ? 
message : t.getCause().getMessage())); + + // disk is full + if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) + exitSystemWithUserError(new UserException.NoSpaceOnDevice()); + + // masked error wrapped in another one + if ( t.getCause() != null ) + checkForMaskedUserErrors(t.getCause()); + } + + /** + * Creates the a short blurb about the GATK, copyright info, and where to get documentation. + * + * @return The application header. + */ + public static List createApplicationHeader() { + List header = new ArrayList(); + header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime())); + header.add("Copyright (c) 2010 The Broad Institute"); + header.add("For support and documentation go to " + HelpConstants.BASE_GATK_URL); + return header; + } + + /** + * If the user supplied any additional attribution, return it here. + * @return Additional attribution if supplied by the user. Empty (non-null) list otherwise. + */ + private List getAttribution() { + List attributionLines = new ArrayList(); + + // If no analysis name is present, fill in extra help on the walkers. + WalkerManager walkerManager = engine.getWalkerManager(); + String analysisName = getAnalysisName(); + if(analysisName != null && walkerManager.exists(analysisName)) { + Class walkerType = walkerManager.getWalkerClassByName(analysisName); + if(walkerType.isAnnotationPresent(Attribution.class)) + attributionLines.addAll(Arrays.asList(walkerType.getAnnotation(Attribution.class).value())); + } + return attributionLines; + } + + /** + * Retrieves additional information about GATK walkers. + * the code in HelpFormatter and supply it as a helper to this method. + * + * @return A string summarizing the walkers available in this distribution. + */ + private String getAdditionalHelp() { + String additionalHelp; + + // If no analysis name is present, fill in extra help on the walkers. 
+ WalkerManager walkerManager = engine.getWalkerManager(); + String analysisName = getAnalysisName(); + if(analysisName != null && walkerManager.exists(getAnalysisName())) + additionalHelp = getWalkerHelp(walkerManager.getWalkerClassByName(getAnalysisName())); + else + additionalHelp = getAllWalkerHelp(); + + return additionalHelp; + } + + private static final int PACKAGE_INDENT = 1; + private static final int WALKER_INDENT = 3; + private static final String FIELD_SEPARATOR = " "; + + private String getWalkerHelp(Class walkerType) { + // Construct a help string to output details on this walker. + StringBuilder additionalHelp = new StringBuilder(); + Formatter formatter = new Formatter(additionalHelp); + + formatter.format("Available Reference Ordered Data types:%n"); + formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures()); + formatter.format("%n"); + + formatter.format("For a full description of this walker, see its GATKdocs at:%n"); + formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType)); + + return additionalHelp.toString(); + } + + /** + * Load in additional help information about all available walkers. + * @return A string representation of the additional help. + */ + private String getAllWalkerHelp() { + // Construct a help string to output available walkers. + StringBuilder additionalHelp = new StringBuilder(); + Formatter formatter = new Formatter(additionalHelp); + + // Get the list of walker names from the walker manager. + WalkerManager walkerManager = engine.getWalkerManager(); + + // Build a list sorted by walker display name. As this information is collected, keep track of the longest + // package / walker name for later formatting. + SortedSet helpText = new TreeSet(new HelpEntryComparator()); + + int longestPackageName = 0; + int longestWalkerName = 0; + for(Map.Entry>> walkersByPackage: walkerManager.getWalkerNamesByPackage(true).entrySet()) { + // Get the display name. 
+ String packageName = walkersByPackage.getKey(); + String packageDisplayName = walkerManager.getPackageDisplayName(walkersByPackage.getKey()); + String packageHelpText = walkerManager.getPackageSummaryText(packageName); + + // Compute statistics about which names is longest. + longestPackageName = Math.max(longestPackageName,packageDisplayName.length()); + + SortedSet walkersInPackage = new TreeSet(new HelpEntryComparator()); + for(Class walkerType: walkersByPackage.getValue()) { + String walkerName = walkerType.getName(); + String walkerDisplayName = walkerManager.getName(walkerType); + String walkerHelpText = walkerManager.getWalkerSummaryText(walkerType); + + longestWalkerName = Math.max(longestWalkerName,walkerManager.getName(walkerType).length()); + + walkersInPackage.add(new HelpEntry(walkerName,walkerDisplayName,walkerHelpText)); + } + + // Dump the walkers into the sorted set. + helpText.add(new HelpEntry(packageName,packageDisplayName,packageHelpText,Collections.unmodifiableSortedSet(walkersInPackage))); + } + + final int headerWidth = Math.max(longestPackageName+PACKAGE_INDENT,longestWalkerName+WALKER_INDENT); + + + for(HelpEntry packageHelp: helpText) { + printDescriptorLine(formatter,PACKAGE_INDENT,packageHelp.displayName,headerWidth,FIELD_SEPARATOR,packageHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); + + for(HelpEntry walkerHelp: packageHelp.children) + printDescriptorLine(formatter,WALKER_INDENT,walkerHelp.displayName,headerWidth,FIELD_SEPARATOR,walkerHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); + + // Print a blank line between sets of walkers. 
+ printDescriptorLine(formatter,0,"",headerWidth,FIELD_SEPARATOR,"", TextFormattingUtils.DEFAULT_LINE_WIDTH); + } + + return additionalHelp.toString(); + } + + private void printDescriptorLine(Formatter formatter, + int headerIndentWidth, + String header, + int headerWidth, + String fieldSeparator, + String description, + int lineWidth) { + final int headerPaddingWidth = headerWidth - header.length() - headerIndentWidth; + final int descriptionWidth = lineWidth - fieldSeparator.length() - headerWidth; + List wordWrappedText = TextFormattingUtils.wordWrap(description,descriptionWidth); + + String headerIndentFormatString = headerIndentWidth > 0 ? "%" + headerIndentWidth + "s" : "%s"; + String headerPaddingFormatString = headerPaddingWidth > 0 ? "%" + headerPaddingWidth + "s" : "%s"; + String headerWidthFormatString = headerWidth > 0 ? "%" + headerWidth + "s" : "%s"; + + // Output description line. + formatter.format(headerIndentFormatString + "%s" + headerPaddingFormatString + "%s%s%n", + "", header, "", fieldSeparator, wordWrappedText.size()>0?wordWrappedText.get(0):""); + for(int i = 1; i < wordWrappedText.size(); i++) + formatter.format(headerWidthFormatString + "%s%s%n", "", fieldSeparator, wordWrappedText.get(i)); + } + +} + +/** + * Represents a given help entry; contains a display name, a summary and optionally some children. + */ +class HelpEntry { + public final String uid; + public final String displayName; + public final String summary; + public final SortedSet children; + + /** + * Create a new help entry with the given display name, summary and children. + * @param uid a unique identifier. Usually, the java package. + * @param displayName display name for this help entry. + * @param summary summary for this help entry. + * @param children children for this help entry. 
+ */ + public HelpEntry(String uid, String displayName, String summary, SortedSet children) { + this.uid = uid; + this.displayName = displayName; + this.summary = summary; + this.children = children; + } + + /** + * Create a new help entry with the given display name, summary and children. + * @param uid a unique identifier. Usually, the java package. + * @param displayName display name for this help entry. + * @param summary summary for this help entry. + */ + public HelpEntry(String uid, String displayName, String summary) { + this(uid,displayName,summary,null); + } + +} + +/** + * Compare two help entries by display name. + */ +class HelpEntryComparator implements Comparator { + private static TextFormattingUtils.CaseInsensitiveComparator textComparator = new TextFormattingUtils.CaseInsensitiveComparator(); + + /** + * Compares the order of lhs to rhs, not taking case into account. + * @param lhs First object to compare. + * @param rhs Second object to compare. + * @return 0 if objects are identical; -1 if lhs is before rhs, 1 if rhs is before lhs. Nulls are treated as after everything else. + */ + public int compare(HelpEntry lhs, HelpEntry rhs) { + if(lhs == null && rhs == null) return 0; + if(lhs == null || lhs.displayName.equals("")) return 1; + if(rhs == null || rhs.displayName.equals("")) return -1; + return lhs.displayName.equals(rhs.displayName) ? 
textComparator.compare(lhs.uid,rhs.uid) : textComparator.compare(lhs.displayName,rhs.displayName); + } + + +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java new file mode 100644 index 000000000..883ed7fd8 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GATKVCFUtils.java @@ -0,0 +1,263 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.samtools.SAMSequenceDictionary; +import org.apache.log4j.Logger; +import htsjdk.tribble.Feature; +import htsjdk.tribble.FeatureCodec; +import htsjdk.tribble.FeatureCodecHeader; +import htsjdk.tribble.index.DynamicIndexCreator; +import htsjdk.tribble.index.IndexCreator; +import htsjdk.tribble.index.IndexFactory; +import htsjdk.tribble.index.interval.IntervalIndexCreator; +import htsjdk.tribble.index.linear.LinearIndexCreator; +import htsjdk.tribble.index.tabix.TabixFormat; +import htsjdk.tribble.index.tabix.TabixIndexCreator; +import htsjdk.tribble.readers.LineIterator; +import htsjdk.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor; +import org.broadinstitute.gatk.utils.commandline.RodBinding; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.collections.Pair; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.*; +import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.*; + + +/** + * A set of GATK-specific static utility methods for common operations on VCF files/records. + */ +public class GATKVCFUtils { + + /** + * Constructor access disallowed...static utility methods only! + */ + private GATKVCFUtils() { } + + public static final Logger logger = Logger.getLogger(GATKVCFUtils.class); + public final static String GATK_COMMAND_LINE_KEY = "GATKCommandLine"; + + public final static GATKVCFIndexType DEFAULT_INDEX_TYPE = GATKVCFIndexType.DYNAMIC_SEEK; // by default, optimize for seek time. All indices prior to Nov 2013 used this type. 
+ public final static Integer DEFAULT_INDEX_PARAMETER = -1; // the default DYNAMIC_SEEK does not use a parameter + + /** + * Gets the appropriately formatted header for a VCF file describing this GATK run + * + * @param engine the GATK engine that holds the walker name, GATK version, and other information + * @param argumentSources contains information on the argument values provided to the GATK for converting to a + * command line string. Should be provided from the data in the parsing engine. Can be + * empty in which case the command line will be the empty string. + * @return VCF header line describing this run of the GATK. + */ + public static VCFHeaderLine getCommandLineArgumentHeaderLine(final GenomeAnalysisEngine engine, final Collection argumentSources) { + if ( engine == null ) throw new IllegalArgumentException("engine cannot be null"); + if ( argumentSources == null ) throw new IllegalArgumentException("argumentSources cannot be null"); + + final Map attributes = new LinkedHashMap<>(); + attributes.put("ID", engine.getWalkerName()); + attributes.put("Version", CommandLineGATK.getVersionNumber()); + final Date date = new Date(); + attributes.put("Date", date.toString()); + attributes.put("Epoch", Long.toString(date.getTime())); + attributes.put("CommandLineOptions", engine.createApproximateCommandLineArgumentString(argumentSources.toArray())); + return new VCFSimpleHeaderLine(GATK_COMMAND_LINE_KEY, attributes); + } + + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List> rodBindings) { + // Collect the eval rod names + final Set names = new TreeSet(); + for ( final RodBinding evalRod : rodBindings ) + names.add(evalRod.getName()); + return getVCFHeadersFromRods(toolkit, names); + } + + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit) { + return getVCFHeadersFromRods(toolkit, (Collection)null); + } + + public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { + Map data = new 
HashMap(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + // ignore the rod if it's not in our list + if ( rodNames != null && !rodNames.contains(source.getName()) ) + continue; + + if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) + data.put(source.getName(), (VCFHeader)source.getHeader()); + } + + return data; + } + + public static Map getVCFHeadersFromRodPrefix(GenomeAnalysisEngine toolkit,String prefix) { + Map data = new HashMap(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + // ignore the rod if lacks the prefix + if ( ! source.getName().startsWith(prefix) ) + continue; + + if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) + data.put(source.getName(), (VCFHeader)source.getHeader()); + } + + return data; + } + + /** + * Gets the header fields from all VCF rods input by the user + * + * @param toolkit GATK engine + * + * @return a set of all fields + */ + public static Set getHeaderFields(GenomeAnalysisEngine toolkit) { + return getHeaderFields(toolkit, null); + } + + /** + * Gets the header fields from all VCF rods input by the user + * + * @param toolkit GATK engine + * @param rodNames names of rods to use, or null if we should use all possible ones + * + * @return a set of all fields + */ + public static Set getHeaderFields(GenomeAnalysisEngine toolkit, Collection rodNames) { + + // keep a map of sample name to occurrences encountered + TreeSet fields = new TreeSet(); + + // iterate to get all of the sample names + List dataSources = toolkit.getRodDataSources(); + for ( ReferenceOrderedDataSource source : dataSources ) { + // ignore the rod if it's not in our list + if ( rodNames != null && !rodNames.contains(source.getName()) ) + continue; + + if ( 
source.getRecordType().equals(VariantContext.class)) { + VCFHeader header = (VCFHeader)source.getHeader(); + if ( header != null ) + fields.addAll(header.getMetaDataInSortedOrder()); + } + } + + return fields; + } + + /** + * Add / replace the contig header lines in the VCFHeader with the information in the GATK engine + * + * @param header the header to update + * @param engine the GATK engine containing command line arguments and the master sequence dictionary + */ + public static VCFHeader withUpdatedContigs(final VCFHeader header, final GenomeAnalysisEngine engine) { + return VCFUtils.withUpdatedContigs(header, engine.getArguments().referenceFile, engine.getMasterSequenceDictionary()); + } + + /** + * Create and return an IndexCreator + * @param type + * @param parameter + * @param outFile + * @return + */ + public static IndexCreator getIndexCreator(GATKVCFIndexType type, int parameter, File outFile) { + return getIndexCreator(type, parameter, outFile, null); + } + + /** + * Create and return an IndexCreator + * @param type + * @param parameter + * @param outFile + * @param sequenceDictionary + * @return + */ + public static IndexCreator getIndexCreator(GATKVCFIndexType type, int parameter, File outFile, SAMSequenceDictionary sequenceDictionary) { + if (ArgumentTypeDescriptor.isCompressed(outFile.toString())) { + if (type != GATKVCFUtils.DEFAULT_INDEX_TYPE || parameter != GATKVCFUtils.DEFAULT_INDEX_PARAMETER) + logger.warn("Creating Tabix index for " + outFile + ", ignoring user-specified index type and parameter"); + + if (sequenceDictionary == null) + return new TabixIndexCreator(TabixFormat.VCF); + else + return new TabixIndexCreator(sequenceDictionary, TabixFormat.VCF); + } + + IndexCreator idxCreator; + switch (type) { + case DYNAMIC_SEEK: idxCreator = new DynamicIndexCreator(outFile, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); break; + case DYNAMIC_SIZE: idxCreator = new DynamicIndexCreator(outFile, IndexFactory.IndexBalanceApproach.FOR_SIZE); 
break; + case LINEAR: idxCreator = new LinearIndexCreator(outFile, parameter); break; + case INTERVAL: idxCreator = new IntervalIndexCreator(outFile, parameter); break; + default: throw new IllegalArgumentException("Unknown IndexCreator type: " + type); + } + + return idxCreator; + } + + /** + * Read all of the VCF records from source into memory, returning the header and the VariantContexts + * + * SHOULD ONLY BE USED FOR UNIT/INTEGRATION TESTING PURPOSES! + * + * @param source the file to read, must be in VCF4 format + * @return + * @throws java.io.IOException + */ + public static Pair> readVCF(final File source) throws IOException { + // read in the features + final List vcs = new ArrayList(); + final VCFCodec codec = new VCFCodec(); + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); + final LineIterator vcfSource = codec.makeSourceFromStream(pbs); + try { + final VCFHeader vcfHeader = (VCFHeader) codec.readActualHeader(vcfSource); + + while (vcfSource.hasNext()) { + final VariantContext vc = codec.decode(vcfSource); + if ( vc != null ) + vcs.add(vc); + } + + return new Pair>(vcfHeader, vcs); + } finally { + codec.close(vcfSource); + } + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java new file mode 100644 index 000000000..6e2343d5d --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java @@ -0,0 +1,1287 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the 
Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import com.google.java.contract.Ensures; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.variant.vcf.VCFConstants; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.datasources.reads.*; +import org.broadinstitute.gatk.engine.datasources.reference.ReferenceDataSource; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.executive.MicroScheduler; +import org.broadinstitute.gatk.engine.filters.FilterManager; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.filters.ReadGroupBlackListFilter; +import org.broadinstitute.gatk.engine.io.OutputTracker; +import 
org.broadinstitute.gatk.engine.io.stubs.Stub; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.iterators.ReadTransformersMode; +import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; +import org.broadinstitute.gatk.utils.refdata.tracks.IndexDictionaryUtils; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.samples.SampleDB; +import org.broadinstitute.gatk.engine.samples.SampleDBBuilder; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.genotyper.IndexedSampleList; +import org.broadinstitute.gatk.utils.genotyper.SampleList; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.classloader.PluginManager; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.interval.IntervalUtils; +import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; +import org.broadinstitute.gatk.engine.recalibration.BQSRArgumentSet; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.utils.text.XReadLines; +import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; +import java.util.concurrent.TimeUnit; + +import static org.broadinstitute.gatk.utils.DeprecatedToolChecks.getWalkerDeprecationInfo; +import static org.broadinstitute.gatk.utils.DeprecatedToolChecks.isDeprecatedWalker; + +/** + * A GenomeAnalysisEngine that runs a specified walker. 
+ */ +public class GenomeAnalysisEngine { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class); + public static final long NO_RUNTIME_LIMIT = -1; + + /** + * The GATK command-line argument parsing code. + */ + private ParsingEngine parsingEngine; + + /** + * The genomeLocParser can create and parse GenomeLocs. + */ + private GenomeLocParser genomeLocParser; + + /** + * Accessor for sharded read data. + */ + private SAMDataSource readsDataSource = null; + + /** + * Accessor for sharded reference data. + */ + private ReferenceDataSource referenceDataSource = null; + + /** + * Accessor for sample metadata + */ + private SampleDB sampleDB = new SampleDB(); + + /** + * Accessor for sharded reference-ordered data. + */ + private List rodDataSources; + + // our argument collection + private GATKArgumentCollection argCollection; + + /** + * Collection of intervals used by the engine. + */ + private GenomeLocSortedSet intervals = null; + + /** + * Explicitly assign the interval set to use for this traversal (for unit testing purposes) + * @param intervals set of intervals to use for this traversal + */ + public void setIntervals( GenomeLocSortedSet intervals ) { + this.intervals = intervals; + } + + /** + * Collection of inputs used by the engine. + */ + private Map inputs = new HashMap(); + + /** + * Collection of outputs used by the engine. + */ + private Collection> outputs = new ArrayList>(); + + /** + * Collection of the filters applied to the input data. + */ + private Collection filters; + + /** + * Collection of the read transformers applied to the reads + */ + private List readTransformers; + + /** + * Controls the allocation of threads between CPU vs IO. 
+ */ + private ThreadAllocation threadAllocation; + + private ReadMetrics cumulativeMetrics = null; + + /** + * A currently hacky unique name for this GATK instance + */ + private String myName = "GATK_" + Math.abs(Utils.getRandomGenerator().nextInt()); + + /** + * our walker manager + */ + private final WalkerManager walkerManager = new WalkerManager(); + + private Walker walker; + + public void setWalker(Walker walker) { + this.walker = walker; + } + + /** + * The short name of the current GATK walker as a string + * @return a non-null String + */ + public String getWalkerName() { + return getWalkerName(walker.getClass()); + } + + /** + * A processed collection of SAM reader identifiers. + */ + private Collection samReaderIDs = Collections.emptyList(); + + /** + * Set the SAM/BAM files over which to traverse. + * @param samReaderIDs Collection of ids to use during this traversal. + */ + public void setSAMFileIDs(Collection samReaderIDs) { + this.samReaderIDs = samReaderIDs; + } + + /** + * Collection of reference metadata files over which to traverse. + */ + private Collection referenceMetaDataFiles; + + /** + * The threading efficiency monitor we use in the GATK to monitor our efficiency. + * + * May be null if one isn't active, or hasn't be initialized yet + */ + private ThreadEfficiencyMonitor threadEfficiencyMonitor = null; + + /** + * The global progress meter we are using to track our progress through the genome + */ + private ProgressMeter progressMeter = null; + + /** + * Set the reference metadata files to use for this traversal. + * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse. 
+ */ + public void setReferenceMetaDataFiles(Collection referenceMetaDataFiles) { + this.referenceMetaDataFiles = referenceMetaDataFiles; + } + + /** + * The maximum runtime of this engine, in nanoseconds, set during engine initialization + * from the GATKArgumentCollection command line value + */ + private long runtimeLimitInNanoseconds = -1; + + /** + * Base Quality Score Recalibration helper object + */ + private BQSRArgumentSet bqsrArgumentSet = null; + public BQSRArgumentSet getBQSRArgumentSet() { return bqsrArgumentSet; } + public boolean hasBQSRArgumentSet() { return bqsrArgumentSet != null; } + public void setBaseRecalibration(final GATKArgumentCollection args) { + bqsrArgumentSet = new BQSRArgumentSet(args); + } + + /** + * Actually run the GATK with the specified walker. + * + * @return the value of this traversal. + */ + public Object execute() { + // first thing is to make sure the AWS keys can be decrypted + GATKRunReport.checkAWSAreValid(); + + //HeapSizeMonitor monitor = new HeapSizeMonitor(); + //monitor.start(); + setStartTime(new java.util.Date()); + + final GATKArgumentCollection args = this.getArguments(); + + // validate our parameters + if (args == null) { + throw new ReviewedGATKException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null."); + } + + // validate our parameters + if (this.walker == null) + throw new ReviewedGATKException("The walker passed to GenomeAnalysisEngine can not be null."); + + if (args.nonDeterministicRandomSeed) + Utils.resetRandomGenerator(System.currentTimeMillis()); + + // if the use specified an input BQSR recalibration table then enable on the fly recalibration + if (args.BQSR_RECAL_FILE != null) + setBaseRecalibration(args); + + // setup the runtime limits + setupRuntimeLimits(args); + + // Determine how the threads should be divided between CPU vs. IO. + determineThreadAllocation(); + + // Prepare the data for traversal. 
+ initializeDataSources(); + + // initialize and validate the interval list + initializeIntervals(); + validateSuppliedIntervals(); + + // check to make sure that all sequence dictionaries are compatible with the reference's sequence dictionary + validateDataSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), rodDataSources); + + // initialize sampleDB + initializeSampleDB(); + + // our microscheduler, which is in charge of running everything + MicroScheduler microScheduler = createMicroscheduler(); + threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor(); + + // create temp directories as necessary + initializeTempDirectory(); + + // create the output streams + initializeOutputStreams(microScheduler.getOutputTracker()); + + // Initializing the shard iterator / BAM schedule might take some time, so let the user know vaguely what's going on + logger.info("Preparing for traversal" + + (readsDataSource.getReaderIDs().size() > 0 ? String.format(" over %d BAM files", readsDataSource.getReaderIDs().size()) : "")); + Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); + logger.info("Done preparing for traversal"); + + // execute the microscheduler, storing the results + return microScheduler.execute(this.walker, shardStrategy); + + //monitor.stop(); + //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed())); + + //return result; + } + + /** + * Retrieves an instance of the walker based on the walker name. + * + * @param walkerName Name of the walker. Must not be null. If the walker cannot be instantiated, an exception will be thrown. + * @return An instance of the walker. 
+ */ + public Walker getWalkerByName(String walkerName) { + try { + return walkerManager.createByName(walkerName); + } catch ( UserException e ) { + if ( isDeprecatedWalker(walkerName) ) { + e = new UserException.DeprecatedWalker(walkerName, getWalkerDeprecationInfo(walkerName)); + } + throw e; + } + } + + /** + * Gets the name of a given walker type. + * @param walkerType Type of walker. + * @return Name of the walker. + */ + public String getWalkerName(Class walkerType) { + return walkerManager.getName(walkerType); + } + + public String getName() { + return myName; + } + + /** + * Gets a list of the filters to associate with the given walker. Will NOT initialize the engine with this filters; + * the caller must handle that directly. + * @return A collection of available filters. + */ + public Collection createFilters() { + final List filters = new LinkedList<>(); + + // First add the user requested filters + if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0) + filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList)); + for(final String filterName: this.getArguments().readFilters) + filters.add(this.getFilterManager().createByName(filterName)); + + // now add the walker default filters. 
This ordering is critical important if + // users need to apply filters that fix up reads that would be removed by default walker filters + filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager())); + + return Collections.unmodifiableList(filters); + } + + /** + * Returns a list of active, initialized read transformers + * + * @param walker the walker we need to apply read transformers too + */ + public void initializeReadTransformers(final Walker walker) { + // keep a list of the active read transformers sorted based on priority ordering + List activeTransformers = new ArrayList(); + + final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class); + final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null; + + final PluginManager pluginManager = new PluginManager(ReadTransformer.class); + + for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) { + transformer.initialize(overrideTime, this, walker); + if ( transformer.enabled() ) + activeTransformers.add(transformer); + } + + setReadTransformers(activeTransformers); + } + + public List getReadTransformers() { + return readTransformers; + } + + /* + * Sanity checks that incompatible read transformers are not active together (and throws an exception if they are). 
+ * + * @param readTransformers the active read transformers + */ + protected void checkActiveReadTransformers(final List readTransformers) { + if ( readTransformers == null ) + throw new IllegalArgumentException("read transformers cannot be null"); + + ReadTransformer sawMustBeFirst = null; + ReadTransformer sawMustBeLast = null; + + for ( final ReadTransformer r : readTransformers ) { + if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_FIRST ) { + if ( sawMustBeFirst != null ) + throw new UserException.IncompatibleReadFiltersException(sawMustBeFirst.toString(), r.toString()); + sawMustBeFirst = r; + } else if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_LAST ) { + if ( sawMustBeLast != null ) + throw new UserException.IncompatibleReadFiltersException(sawMustBeLast.toString(), r.toString()); + sawMustBeLast = r; + } + } + } + + protected void setReadTransformers(final List readTransformers) { + if ( readTransformers == null ) + throw new ReviewedGATKException("read transformers cannot be null"); + + // sort them in priority order + Collections.sort(readTransformers, new ReadTransformer.ReadTransformerComparator()); + + // make sure we don't have an invalid set of active read transformers + checkActiveReadTransformers(readTransformers); + + this.readTransformers = readTransformers; + } + + /** + * Parse out the thread allocation from the given command-line argument. 
+ */ + private void determineThreadAllocation() { + if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads); + if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread); + if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads); + + this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads, + argCollection.numberOfCPUThreadsPerDataThread, + argCollection.numberOfIOThreads, + argCollection.monitorThreadEfficiency); + } + + public int getTotalNumberOfThreads() { + return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads(); + } + + + + /** + * Allow subclasses and others within this package direct access to the walker manager. + * @return The walker manager used by this package. + */ + protected WalkerManager getWalkerManager() { + return walkerManager; + } + + /** + * setup a microscheduler + * + * @return a new microscheduler + */ + private MicroScheduler createMicroscheduler() { + // Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary. 
+ if ((walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) && + this.getArguments().referenceFile == null) { + throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); + } + + return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation); + } + + protected DownsamplingMethod getDownsamplingMethod() { + GATKArgumentCollection argCollection = this.getArguments(); + + DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); + DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker); + + DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : walkerMethod; + checkCompatibilityWithWalker(method, walker); + return method; + } + + private static void checkCompatibilityWithWalker( DownsamplingMethod method, Walker walker ) { + // Refactored from DownsamplingMethod + final DownsampleType type = method.type; + final Integer toCoverage = method.toCoverage; + final boolean isLocusTraversal = walker instanceof LocusWalker || walker instanceof ActiveRegionWalker; + + if ( isLocusTraversal && type == DownsampleType.ALL_READS && toCoverage != null ) { + throw new UserException("Downsampling to coverage with the ALL_READS method for locus-based traversals (eg., LocusWalkers) is not currently supported (though it is supported for ReadWalkers)."); + } + + // For locus traversals, ensure that the dcov value (if present) is not problematically low + if ( isLocusTraversal && type != DownsampleType.NONE && toCoverage != null && + toCoverage < DownsamplingMethod.MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS ) { + throw new UserException(String.format("Locus-based traversals (ie., Locus and ActiveRegion walkers) require " + + "a minimum -dcov value of %d when downsampling to coverage. 
Values less " + + "than this can produce problematic downsampling artifacts while providing " + + "only insignificant improvements in memory usage in most cases.", + DownsamplingMethod.MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS)); + } + } + + protected void setDownsamplingMethod(DownsamplingMethod method) { + argCollection.setDownsamplingMethod(method); + } + + protected boolean includeReadsWithDeletionAtLoci() { + return walker.includeReadsWithDeletionAtLoci(); + } + + /** + * Verifies that the supplied set of reads files mesh with what the walker says it requires; + * also makes sure that list of SAM files specified on the command line is not empty and contains + * no duplicates. + */ + protected void validateSuppliedReads() { + GATKArgumentCollection arguments = this.getArguments(); + final Boolean samFilesArePresent = (arguments.samFiles != null && !arguments.samFiles.isEmpty()); + + // Check what the walker says is required against what was provided on the command line. + if (WalkerManager.isRequired(walker, DataSource.READS) && !samFilesArePresent) + throw new ArgumentException("Walker requires reads but none were provided."); + + // Check what the walker says is allowed against what was provided on the command line. + if (samFilesArePresent && !WalkerManager.isAllowed(walker, DataSource.READS)) + throw new ArgumentException("Walker does not allow reads but reads were provided."); + + //Make sure SAM list specified by the user (if necessary) is not empty + if(WalkerManager.isRequired(walker, DataSource.READS) && samFilesArePresent && samReaderIDs.isEmpty() ) { + throw new UserException("The list of input files does not contain any BAM files."); + } + + // Make sure no SAM files were specified multiple times by the user. + checkForDuplicateSamFiles(); + } + + /** + * Checks whether there are SAM files that appear multiple times in the fully unpacked list of + * SAM files (samReaderIDs). 
If there are, throws an ArgumentException listing the files in question. + */ + protected void checkForDuplicateSamFiles() { + Set encounteredSamFiles = new HashSet(); + Set duplicateSamFiles = new LinkedHashSet(); + + for ( SAMReaderID samFile : samReaderIDs ) { + if ( encounteredSamFiles.contains(samFile) ) { + duplicateSamFiles.add(samFile.getSamFilePath()); + } + else { + encounteredSamFiles.add(samFile); + } + } + + if ( duplicateSamFiles.size() > 0 ) { + throw new UserException("The following BAM files appear multiple times in the list of input files: " + + duplicateSamFiles + " BAM files may be specified at most once."); + } + + } + + /** + * Verifies that the supplied reference file mesh with what the walker says it requires. + */ + protected void validateSuppliedReference() { + GATKArgumentCollection arguments = this.getArguments(); + // Check what the walker says is required against what was provided on the command line. + // TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required. + if (/*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments.referenceFile == null) + throw new ArgumentException("Walker requires a reference but none was provided."); + + // Check what the walker says is allowed against what was provided on the command line. + if (arguments.referenceFile != null && !WalkerManager.isAllowed(walker, DataSource.REFERENCE)) + throw new ArgumentException("Walker does not allow a reference but one was provided."); + } + + protected void validateSuppliedIntervals() { + // Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped. + if(!(walker instanceof ReadWalker)) { + GenomeLocSortedSet intervals = getIntervals(); + if(intervals != null && getIntervals().contains(GenomeLoc.UNMAPPED)) + throw new ArgumentException("Interval list specifies unmapped region. 
Only read walkers may include the unmapped region."); + } + + // If intervals is non-null and empty at this point, it means that the list of intervals to process + // was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since + // this was very likely unintentional, the user should be informed of this. Note that this is different + // from the case where intervals == null, which indicates that there were no interval arguments. + if ( intervals != null && intervals.isEmpty() ) { + logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process."); + } + + // TODO: add a check for ActiveRegion walkers to prevent users from passing an entire contig/chromosome + } + + /** + * Get the sharding strategy given a driving data source. + * + * @param readsDataSource readsDataSource + * @param drivingDataSource Data on which to shard. + * @param intervals intervals + * @return the sharding strategy + */ + protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { + ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); + DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null; + ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); + + // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. + if(!readsDataSource.isEmpty()) { + if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) + throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. 
The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); + if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) + throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); + + if(walker instanceof LocusWalker) { + if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); + } + else if(walker instanceof ActiveRegionWalker) { + if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(new ActiveRegionShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer()); + } + else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { + // Apply special validation to read pair walkers. 
+ if(walker instanceof ReadPairWalker) { + if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. You will need to resort your input BAM file in query name order to use this walker."); + if(intervals != null && !intervals.isEmpty()) + throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); + } + + if(intervals == null) + return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(intervals, new ReadShardBalancer()); + } + else + throw new ReviewedGATKException("Unable to determine walker type for walker " + walker.getClass().getName()); + } + else { + // TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well + // TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard + // TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB] + final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000; + if(intervals == null) + return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); + else + return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE); + } + } + + protected boolean flashbackData() { + return walker instanceof ReadWalker; + } + + /** + * Create the temp directory if it doesn't exist. 
+ */ + private void initializeTempDirectory() { + File tempDir = new File(System.getProperty("java.io.tmpdir")); + if (!tempDir.exists() && !tempDir.mkdirs()) + throw new UserException.BadTmpDir("Unable to create directory"); + } + + /** + * Initialize the output streams as specified by the user. + * + * @param outputTracker the tracker supplying the initialization data. + */ + private void initializeOutputStreams(final OutputTracker outputTracker) { + for (final Map.Entry input : getInputs().entrySet()) + outputTracker.addInput(input.getKey(), input.getValue()); + for (final Stub stub : getOutputs()) { + stub.processArguments(argCollection); + outputTracker.addOutput(stub); + } + + outputTracker.prepareWalker(walker, getArguments().strictnessLevel); + } + + public ReferenceDataSource getReferenceDataSource() { + return referenceDataSource; + } + + public GenomeLocParser getGenomeLocParser() { + return genomeLocParser; + } + + /** + * Manage lists of filters. + */ + private final FilterManager filterManager = new FilterManager(); + + private Date startTime = null; // the start time for execution + + public void setParser(ParsingEngine parsingEngine) { + this.parsingEngine = parsingEngine; + } + + /** + * Explicitly set the GenomeLocParser, for unit testing. + * @param genomeLocParser GenomeLocParser to use. 
+ */ + public void setGenomeLocParser(GenomeLocParser genomeLocParser) { + this.genomeLocParser = genomeLocParser; + } + + /** + * Sets the start time when the execute() function was last called + * @param startTime the start time when the execute() function was last called + */ + protected void setStartTime(Date startTime) { + this.startTime = startTime; + } + + /** + * @return the start time when the execute() function was last called + */ + public Date getStartTime() { + return startTime; + } + + /** + * Setup the intervals to be processed + */ + protected void initializeIntervals() { + intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource.getReference(), argCollection.intervalArguments); + } + + /** + * Add additional, externally managed IO streams for inputs. + * + * @param argumentSource Field into which to inject the value. + * @param value Instance to inject. + */ + public void addInput(ArgumentSource argumentSource, Object value) { + inputs.put(argumentSource, value); + } + + /** + * Add additional, externally managed IO streams for output. + * + * @param stub Instance to inject. + */ + public void addOutput(Stub stub) { + outputs.add(stub); + } + + /** + * Returns the tag associated with a given command-line argument. + * @param key Object for which to inspect the tag. + * @return Tags object associated with the given key, or an empty Tag structure if none are present. + */ + public Tags getTags(Object key) { + return parsingEngine.getTags(key); + } + + protected void initializeDataSources() { + logger.info("Strictness is " + argCollection.strictnessLevel); + + validateSuppliedReference(); + setReferenceDataSource(argCollection.referenceFile); + + validateSuppliedReads(); + initializeReadTransformers(walker); + + final Map sampleRenameMap = argCollection.sampleRenameMappingFile != null ? 
+ loadSampleRenameMap(argCollection.sampleRenameMappingFile) : + null; + + readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference(), sampleRenameMap); + + for (ReadFilter filter : filters) + filter.initialize(this); + + // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference + rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(), + genomeLocParser,argCollection.unsafe,sampleRenameMap); + } + + /** + * Purely for testing purposes. Do not use unless you absolutely positively know what you are doing (or + * need to absolutely positively kill everyone in the room) + * @param dataSource + */ + public void setReadsDataSource(final SAMDataSource dataSource) { + this.readsDataSource = dataSource; + } + + /** + * Entry-point function to initialize the samples database from input data and pedigree arguments + */ + private void initializeSampleDB() { + SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); + sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); + sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); + sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles); + sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings); + sampleDB = sampleDBBuilder.getFinalSampleDB(); + } + + /** + * Gets a unique identifier for the reader sourcing this read. + * @param read Read to examine. + * @return A unique identifier for the source file of this read. Exception if not found. + */ + public SAMReaderID getReaderIDForRead(final SAMRecord read) { + return getReadsDataSource().getReaderID(read); + } + + /** + * Gets the source file for this read. + * @param id Unique identifier determining which input file to use. + * @return The source filename for this read. 
+ */ + public File getSourceFileForReaderID(final SAMReaderID id) { + return getReadsDataSource().getSAMFile(id); + } + + /** + * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). + * + * @param reads Reads data source. + * @param reference Reference data source. + * @param rods a collection of the reference ordered data tracks + */ + private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection rods) { + if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null ) + return; + + // Compile a set of sequence names that exist in the reference file. + SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); + + if (!reads.isEmpty()) { + // Compile a set of sequence names that exist in the BAM files. + SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); + + if (readsDictionary.size() == 0) { + logger.info("Reads file is unmapped. Skipping validation against reference."); + return; + } + + // compare the reads to the reference + SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, + "reference", referenceDictionary, true, intervals); + } + + for (ReferenceOrderedDataSource rod : rods) + IndexDictionaryUtils.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary, getArguments().unsafe); + } + + /** + * Gets a data source for the given set of reads. + * + * @param argCollection arguments + * @param genomeLocParser parser + * @param refReader reader + * @return A data source for the given set of reads. 
+ */ + private SAMDataSource createReadsDataSource(final GATKArgumentCollection argCollection, final GenomeLocParser genomeLocParser, + final IndexedFastaSequenceFile refReader, final Map sampleRenameMap) { + DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); + + // Synchronize the method back into the collection so that it shows up when + // interrogating for the downsampling method during command line recreation. + setDownsamplingMethod(downsamplingMethod); + + logger.info(downsamplingMethod); + + if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) + throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); + + boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class); + + if (argCollection.keepProgramRecords) + removeProgramRecords = false; + + final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; + + return new SAMDataSource( + samReaderIDs, + threadAllocation, + argCollection.numberOfBAMFileHandles, + genomeLocParser, + argCollection.useOriginalBaseQualities, + argCollection.strictnessLevel, + argCollection.readBufferSize, + downsamplingMethod, + new ValidationExclusion(Arrays.asList(argCollection.unsafe)), + filters, + readTransformers, + includeReadsWithDeletionAtLoci(), + argCollection.defaultBaseQualities, + removeProgramRecords, + keepReadsInLIBS, + sampleRenameMap, + argCollection.intervalArguments.intervalMerging); + } + + /** + * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory + * HashMap. This file must consist of lines with two whitespace-separated fields, the second of which + * may contain whitespace: + * + * absolute_path_to_file new_sample_name + * + * The engine will verify that each file contains data from only one sample when the on-the-fly sample + * renaming feature is being used. 
Note that this feature works only with bam and vcf files. + * + * @param sampleRenameMapFile sample rename map file from which to load data + * @return a HashMap containing the contents of the map file, with the keys being the input file paths and + * the values being the new sample names. + */ + protected Map loadSampleRenameMap( final File sampleRenameMapFile ) { + logger.info("Renaming samples from input files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath()); + + final Map sampleRenameMap = new HashMap<>((int)sampleRenameMapFile.length() / 50); + + try { + for ( final String line : new XReadLines(sampleRenameMapFile) ) { + final String[] tokens = line.split("\\s+", 2); + + if ( tokens.length != 2 ) { + throw new UserException.MalformedFile(sampleRenameMapFile, + String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s", + tokens.length, line)); + } + + final File inputFile = new File(tokens[0]); + final String newSampleName = tokens[1].trim(); + + if (newSampleName.contains(VCFConstants.FIELD_SEPARATOR)) { + throw new UserException.MalformedFile(sampleRenameMapFile, String.format( + "Encountered illegal sample name; sample names may not include the VCF field delimiter (%s). Sample name: %s; line: %s", + VCFConstants.FIELD_SEPARATOR, + newSampleName, + line + )); + } + + if ( ! 
inputFile.isAbsolute() ) { + throw new UserException.MalformedFile(sampleRenameMapFile, "Input file path not absolute at line: " + line); + } + + final String inputFilePath = inputFile.getAbsolutePath(); + + if ( sampleRenameMap.containsKey(inputFilePath) ) { + throw new UserException.MalformedFile(sampleRenameMapFile, + String.format("Input file %s appears more than once", inputFilePath)); + } + + sampleRenameMap.put(inputFilePath, newSampleName); + } + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e); + } + + return sampleRenameMap; + } + + + /** + * Opens a reference sequence file paired with an index. Only public for testing purposes + * + * @param refFile Handle to a reference sequence file. Non-null. + */ + public void setReferenceDataSource(File refFile) { + this.referenceDataSource = new ReferenceDataSource(refFile); + genomeLocParser = new GenomeLocParser(referenceDataSource.getReference()); + } + + /** + * Open the reference-ordered data sources. + * + * @param referenceMetaDataFiles collection of RMD descriptors to load and validate. + * @param sequenceDictionary GATK-wide sequnce dictionary to use for validation. + * @param genomeLocParser to use when creating and validating GenomeLocs. + * @param validationExclusionType potentially indicate which validations to include / exclude. + * @param sampleRenameMap map of file -> new sample name used when doing on-the-fly sample renaming + * + * @return A list of reference-ordered data sources. 
+ */ + private List getReferenceOrderedDataSources(final Collection referenceMetaDataFiles, + final SAMSequenceDictionary sequenceDictionary, + final GenomeLocParser genomeLocParser, + final ValidationExclusion.TYPE validationExclusionType, + final Map sampleRenameMap) { + final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType, + getArguments().disableAutoIndexCreationAndLockingWhenReadingRods, + sampleRenameMap); + + final List dataSources = new ArrayList(); + for (RMDTriplet fileDescriptor : referenceMetaDataFiles) + dataSources.add(new ReferenceOrderedDataSource(fileDescriptor, + builder, + sequenceDictionary, + genomeLocParser, + flashbackData())); + + return dataSources; + } + + /** + * Returns the SAM File Header from the input reads' data source file + * @return the SAM File Header from the input reads' data source file + */ + public SAMFileHeader getSAMFileHeader() { + return readsDataSource.getHeader(); + } + + public boolean lenientVCFProcessing() { + return ValidationExclusion.lenientVCFProcessing(argCollection.unsafe); + } + + /** + * Returns the unmerged SAM file header for an individual reader. + * @param reader The reader. + * @return Header for that reader or null if not available. + */ + public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { + return readsDataSource == null ? null : readsDataSource.getHeader(reader); + } + + /** + * Returns an ordered list of the unmerged SAM file headers known to this engine. 
+ * @return list of header for each input SAM file, in command line order + */ + public List getSAMFileHeaders() { + final List headers = new ArrayList(); + for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) { + headers.add(getReadsDataSource().getHeader(id)); + } + return headers; + } + + /** + * Gets the master sequence dictionary for this GATK engine instance + * @return a never-null dictionary listing all of the contigs known to this engine instance + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return getReferenceDataSource().getReference().getSequenceDictionary(); + } + + /** + * Returns data source object encapsulating all essential info and handlers used to traverse + * reads; header merger, individual file readers etc can be accessed through the returned data source object. + * + * @return the reads data source + */ + public SAMDataSource getReadsDataSource() { + return this.readsDataSource; + } + + /** + * Sets the collection of GATK main application arguments. + * + * @param argCollection the GATK argument collection + */ + public void setArguments(GATKArgumentCollection argCollection) { + this.argCollection = argCollection; + } + + /** + * Gets the collection of GATK main application arguments. + * + * @return the GATK argument collection + */ + public GATKArgumentCollection getArguments() { + return this.argCollection; + } + + /** + * Get the list of intervals passed to the engine. + * @return List of intervals, or null if no intervals are in use + */ + public GenomeLocSortedSet getIntervals() { + return this.intervals; + } + + /** + * Get the list of regions of the genome being processed. If the user + * requested specific intervals, return those, otherwise return regions + * corresponding to the entire genome. Never returns null. 
+ * + * @return a non-null set of intervals being processed + */ + @Ensures("result != null") + public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() { + if ( getIntervals() == null ) + // if we don't have any intervals defined, create intervals from the reference itself + return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary()); + else + return getIntervals(); + } + + /** + * Gets the list of filters employed by this engine. + * @return Collection of filters (actual instances) used by this engine. + */ + public Collection getFilters() { + return this.filters; + } + + /** + * Sets the list of filters employed by this engine. + * @param filters Collection of filters (actual instances) used by this engine. + */ + public void setFilters(Collection filters) { + this.filters = filters; + } + + /** + * Gets the filter manager for this engine. + * @return filter manager for this engine. + */ + protected FilterManager getFilterManager() { + return filterManager; + } + + /** + * Gets the input sources for this engine. + * @return input sources for this engine. + */ + protected Map getInputs() { + return inputs; + } + + /** + * Gets the output stubs for this engine. + * @return output stubs for this engine. + */ + protected Collection> getOutputs() { + return outputs; + } + + /** + * Returns data source objects encapsulating all rod data; + * individual rods can be accessed through the returned data source objects. + * + * @return the rods data sources, never {@code null}. + */ + public List getRodDataSources() { + return this.rodDataSources; + } + + /** + * Gets cumulative metrics about the entire run to this point. + * Returns a clone of this snapshot in time. + * @return cumulative metrics about the entire run at this point. ReadMetrics object is a unique instance and is + * owned by the caller; the caller can do with the object what they wish. 
+ */ + public ReadMetrics getCumulativeMetrics() { + // todo -- probably shouldn't be lazy + if ( cumulativeMetrics == null ) + cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics(); + return cumulativeMetrics; + } + + /** + * Return the global ThreadEfficiencyMonitor, if there is one + * + * @return the monitor, or null if none is active + */ + public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { + return threadEfficiencyMonitor; + } + + // ------------------------------------------------------------------------------------- + // + // code for working with Samples database + // + // ------------------------------------------------------------------------------------- + + public SampleDB getSampleDB() { + return this.sampleDB; + } + + public Map getApproximateCommandLineArguments(Object... argumentProviders) { + return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); + } + + public String createApproximateCommandLineArgumentString(Object... argumentProviders) { + return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders); + } + + // ------------------------------------------------------------------------------------- + // + // code for working with progress meter + // + // ------------------------------------------------------------------------------------- + + /** + * Register the global progress meter with this engine + * + * Calling this function more than once will result in an IllegalStateException + * + * @param meter a non-null progress meter + */ + public void registerProgressMeter(final ProgressMeter meter) { + if ( meter == null ) throw new IllegalArgumentException("Meter cannot be null"); + if ( progressMeter != null ) throw new IllegalStateException("Progress meter already set"); + + progressMeter = meter; + } + + /** + * Get the progress meter being used by this engine. 
May be null if no meter has been registered yet + * @return a potentially null pointer to the progress meter + */ + public ProgressMeter getProgressMeter() { + return progressMeter; + } + + /** + * Does the current runtime in unit exceed the runtime limit, if one has been provided? + * + * @return false if not limit was requested or if runtime <= the limit, true otherwise + */ + public boolean exceedsRuntimeLimit() { + if ( progressMeter == null ) + // not yet initialized or not set because of testing + return false; + + if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT ) + return false; + else { + final long runtime = progressMeter.getRuntimeInNanosecondsUpdatedPeriodically(); + if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime); + final long maxRuntimeNano = getRuntimeLimitInNanoseconds(); + return runtime > maxRuntimeNano; + } + } + + /** + * @return the runtime limit in nanoseconds, or -1 if no limit was specified + */ + public long getRuntimeLimitInNanoseconds() { + return runtimeLimitInNanoseconds; + } + + /** + * Setup the runtime limits for this engine, updating the runtimeLimitInNanoseconds + * as appropriate + * + * @param args the GATKArgumentCollection to retrieve our runtime limits from + */ + private void setupRuntimeLimits(final GATKArgumentCollection args) { + if ( args.maxRuntime == NO_RUNTIME_LIMIT ) + runtimeLimitInNanoseconds = -1; + else if (args.maxRuntime < 0 ) + throw new UserException.BadArgumentValue("maxRuntime", "must be >= 0 or == -1 (meaning no limit) but received negative value " + args.maxRuntime); + else { + runtimeLimitInNanoseconds = TimeUnit.NANOSECONDS.convert(args.maxRuntime, args.maxRuntimeUnits); + } + } + + /** + * Returns the sample list including all samples. + * @return never {@code null}. + */ + public SampleList getSampleList() { + return new IndexedSampleList(getSampleDB().getSampleNames()); + } + + /** + * Returns the sample list including samples in read inputs. 
+ * @return never {@code null}. + */ + public SampleList getReadSampleList() { + return new IndexedSampleList(ReadUtils.getSAMFileSamples(getSAMFileHeader())); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java new file mode 100644 index 000000000..0f6aee60c --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java @@ -0,0 +1,197 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.ValidationStringency; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; + +import java.util.Collection; +import java.util.List; +/** + * User: hanna + * Date: May 14, 2009 + * Time: 4:06:26 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A data structure containing information about the reads data sources as well as + * information about how they should be downsampled, sorted, and filtered. 
+ */ +public class ReadProperties { + private final Collection readers; + private final SAMFileHeader header; + private final SAMFileHeader.SortOrder sortOrder; + private final ValidationStringency validationStringency; + private final DownsamplingMethod downsamplingMethod; + private final ValidationExclusion exclusionList; + private final Collection supplementalFilters; + private final List readTransformers; + private final boolean keepUniqueReadListInLIBS; + private final boolean includeReadsWithDeletionAtLoci; + private final boolean useOriginalBaseQualities; + private final byte defaultBaseQualities; + + /** + * Return true if the walker wants to see reads that contain deletions when looking at locus pileups + * + * @return + */ + public boolean includeReadsWithDeletionAtLoci() { + return includeReadsWithDeletionAtLoci; + } + + public boolean keepUniqueReadListInLIBS() { + return keepUniqueReadListInLIBS; + } + + /** + * Gets a list of the files acting as sources of reads. + * @return A list of files storing reads data. + */ + public Collection getSAMReaderIDs() { + return readers; + } + + /** + * Gets the sam file header + * @return the sam file header + */ + public SAMFileHeader getHeader() { + return header; + } + + /** + * Gets the sort order of the reads + * @return the sort order of the reads + */ + public SAMFileHeader.SortOrder getSortOrder() { + return sortOrder; + } + + /** + * How strict should validation be? + * @return Stringency of validation. + */ + public ValidationStringency getValidationStringency() { + return validationStringency; + } + + /** + * Gets the method and parameters used when downsampling reads. + * @return Downsample fraction. + */ + public DownsamplingMethod getDownsamplingMethod() { + return downsamplingMethod; + } + + /** + * Return whether to 'verify' the reads as we pass through them. + * @return Whether to verify the reads. 
+ */ + public ValidationExclusion getValidationExclusionList() { + return exclusionList; + } + + public Collection getSupplementalFilters() { + return supplementalFilters; + } + + + public List getReadTransformers() { + return readTransformers; + } + + /** + * Return whether to use original base qualities. + * @return Whether to use original base qualities. + */ + public boolean useOriginalBaseQualities() { + return useOriginalBaseQualities; + } + + /** + * @return Default base quality value to fill reads missing base quality information. + */ + public byte defaultBaseQualities() { + return defaultBaseQualities; + } + + /** + * Extract the command-line arguments having to do with reads input + * files and store them in an easy-to-work-with package. Constructor + * is package protected. + * @param samFiles list of reads files. + * @param header sam file header. + * @param useOriginalBaseQualities True if original base qualities should be used. + * @param strictness Stringency of reads file parsing. + * @param downsamplingMethod Method for downsampling reads at a given locus. + * @param exclusionList what safety checks we're willing to let slide + * @param supplementalFilters additional filters to dynamically apply. + * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method + * will explicitly list reads with deletion over the current reference base; otherwise, only observed + * bases will be seen in the pileups, and the deletions will be skipped silently. + * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. 
+ * @param keepUniqueReadListInLIBS If true, we will tell LocusIteratorByState to track the unique reads it sees + * This is really useful for ActiveRegionTraversals + */ + public ReadProperties( Collection samFiles, + SAMFileHeader header, + SAMFileHeader.SortOrder sortOrder, + boolean useOriginalBaseQualities, + ValidationStringency strictness, + DownsamplingMethod downsamplingMethod, + ValidationExclusion exclusionList, + Collection supplementalFilters, + List readTransformers, + boolean includeReadsWithDeletionAtLoci, + byte defaultBaseQualities, + final boolean keepUniqueReadListInLIBS) { + this.readers = samFiles; + this.header = header; + this.sortOrder = sortOrder; + this.validationStringency = strictness; + this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod; + this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList; + this.supplementalFilters = supplementalFilters; + this.readTransformers = readTransformers; + this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci; + this.useOriginalBaseQualities = useOriginalBaseQualities; + this.defaultBaseQualities = defaultBaseQualities; + this.keepUniqueReadListInLIBS = keepUniqueReadListInLIBS; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/SampleUtils.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/SampleUtils.java new file mode 100644 index 000000000..eb98e0bb4 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/SampleUtils.java @@ -0,0 +1,258 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the 
Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.gatk.utils.collections.Pair; +import htsjdk.variant.vcf.VCFHeader; +import org.broadinstitute.gatk.utils.text.ListFileUtils; +import org.broadinstitute.gatk.utils.text.XReadLines; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + + +/** + * SampleUtils is a static class (no instantiation allowed!) with some utility methods for getting samples + * quality scores. + * + * @author ebanks + */ +public class SampleUtils { + /** + * Private constructor. No instantiating this class! 
+ */ + private SampleUtils() {} + + /** + * Gets all of the unique sample names from all VCF rods input by the user + * + * @param toolkit GATK engine + * + * @return the set of unique samples + */ + public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit) { + return getUniqueSamplesFromRods(toolkit, null); + } + + /** + * Gets all of the unique sample names from the set of provided VCF rod names input by the user + * + * @param toolkit GATK engine + * @param rodNames list of rods to use; if null, uses all VCF rods + * + * @return the set of unique samples + */ + public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { + Set samples = new LinkedHashSet<>(); + + for ( VCFHeader header : GATKVCFUtils.getVCFHeadersFromRods(toolkit, rodNames).values() ) + samples.addAll(header.getGenotypeSamples()); + + return samples; + } + + public static Set getRodNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Collection rodNames) { + return GATKVCFUtils.getVCFHeadersFromRods(toolkit, rodNames).keySet(); + } + + public static Set getSampleListWithVCFHeader(GenomeAnalysisEngine toolkit, Collection rodNames) { + return getSampleList(GATKVCFUtils.getVCFHeadersFromRods(toolkit, rodNames)); + } + + public static Set getSampleList(Map headers) { + return getSampleList(headers, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE); + } + + public static Set getSampleList(Map headers, GATKVariantContextUtils.GenotypeMergeType mergeOption) { + Set samples = new TreeSet(); + for ( Map.Entry val : headers.entrySet() ) { + VCFHeader header = val.getValue(); + for ( String sample : header.getGenotypeSamples() ) { + samples.add(GATKVariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY)); + } + } + + return samples; + } + + + /** + * + * @param VCF_Headers + * @return false if there are names duplication between the samples names in the VCF headers + */ + public static 
boolean verifyUniqueSamplesNames(Map VCF_Headers) { + Set samples = new HashSet(); + for ( Map.Entry val : VCF_Headers.entrySet() ) { + VCFHeader header = val.getValue(); + for ( String sample : header.getGenotypeSamples() ) { + if (samples.contains(sample)){ + + return false; + } + samples.add(sample); + } + } + + return true; + } + + /** + * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap + * (e.g. sampleX.1, sampleX.2, ...) + * When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping + * from rod/sample pairs to the new uniquified names + * + * @param toolkit GATK engine + * @param samples set to store the sample names + * @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names + */ + public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set samples, Map, String> rodNamesToSampleNames) { + + // keep a map of sample name to occurrences encountered + HashMap sampleOverlapMap = new HashMap(); + + // iterate to get all of the sample names + + for ( Map.Entry pair : GATKVCFUtils.getVCFHeadersFromRods(toolkit).entrySet() ) { + for ( String sample : pair.getValue().getGenotypeSamples() ) + addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey()); + } + } + + private static void addUniqueSample(Set samples, Map sampleOverlapMap, Map, String> rodNamesToSampleNames, String newSample, String rodName) { + + // how many occurrences have we seen so far? 
+ Integer occurrences = sampleOverlapMap.get(newSample); + + // if this is the first one, just add it to the list of samples + if ( occurrences == null ) { + samples.add(newSample); + rodNamesToSampleNames.put(new Pair(rodName, newSample), newSample); + sampleOverlapMap.put(newSample, 1); + } + + // if it's already been seen multiple times, give it a unique suffix and increment the value + else if ( occurrences >= 2 ) { + String uniqueName = newSample + "." + rodName; + samples.add(uniqueName); + rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName); + sampleOverlapMap.put(newSample, occurrences + 1); + } + + // if this is the second occurrence of the sample name, uniquify both of them + else { // occurrences == 2 + + // remove the 1st occurrence, uniquify it, and add it back + samples.remove(newSample); + String uniqueName1 = null; + for ( Map.Entry, String> entry : rodNamesToSampleNames.entrySet() ) { + if ( entry.getValue().equals(newSample) ) { + uniqueName1 = newSample + "." + entry.getKey().first; + entry.setValue(uniqueName1); + break; + } + } + samples.add(uniqueName1); + + // add the second one + String uniqueName2 = newSample + "." + rodName; + samples.add(uniqueName2); + rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName2); + + sampleOverlapMap.put(newSample, 2); + } + + } + + /** + * Returns a new set of samples, containing a final list of samples expanded from sampleArgs + * + * Each element E of sampleArgs can either be a literal sample name or a file. For each E, + * we try to read a file named E from disk, and if possible all lines from that file are expanded + * into unique sample names. 
+ * + * @param sampleArgs args + * @return samples + */ + public static Set getSamplesFromCommandLineInput(Collection sampleArgs) { + if (sampleArgs != null) { + return ListFileUtils.unpackSet(sampleArgs); + } + + return new HashSet(); + } + + public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) { + Set samples = ListFileUtils.unpackSet(vcfSamples); + if (sampleExpressions == null) { + return samples; + } else { + return ListFileUtils.includeMatching(samples, sampleExpressions, false); + } + } + + /** + * Given a collection of samples and a collection of regular expressions, generates the set of samples that match each expression + * @param originalSamples list of samples to select samples from + * @param sampleExpressions list of expressions to use for matching samples + * @return the set of samples from originalSamples that satisfy at least one of the expressions in sampleExpressions + */ + public static Collection matchSamplesExpressions (Collection originalSamples, Collection sampleExpressions) { + // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions + Set samples = new HashSet(); + if (sampleExpressions != null) { + samples.addAll(ListFileUtils.includeMatching(originalSamples, sampleExpressions, false)); + } + return samples; + } + + /** + * Given a list of files with sample names it reads all files and creates a list of unique samples from all these files. 
+ * @param files list of files with sample names in + * @return a collection of unique samples from all files + */ + public static Collection getSamplesFromFiles (Collection files) { + Set samplesFromFiles = new HashSet(); + if (files != null) { + for (File file : files) { + try { + XReadLines reader = new XReadLines(file); + List lines = reader.readLines(); + for (String line : lines) { + samplesFromFiles.add(line); + } + } catch (FileNotFoundException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + } + } + return samplesFromFiles; + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java new file mode 100644 index 000000000..9ea5a3c31 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java @@ -0,0 +1,442 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.commandline.Hidden; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.filters.FilterManager; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.utils.classloader.PluginManager; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.help.ResourceBundleExtractorDoclet; +import org.broadinstitute.gatk.utils.text.TextFormattingUtils; + +import java.lang.annotation.Annotation; +import java.util.*; + +/** + * Plugin manager that also provides various utilities for inspecting Walkers. + */ +public class WalkerManager extends PluginManager { + + /** + * A collection of help text for walkers and their enclosing packages. + */ + private ResourceBundle helpText; + + public WalkerManager() { + super(Walker.class,"walker",""); + helpText = TextFormattingUtils.loadResourceBundle("GATKText"); + } + + /** + * Get the list of walkers currently available to the GATK, organized + * by package. + * @param visibleWalkersOnly If true, return only the walker names that aren't hidden. + * @return Names of currently available walkers. 
+ */ + public Map>> getWalkerNamesByPackage(boolean visibleWalkersOnly) { + Map>> walkersByPackage = new HashMap>>(); + for(Class walker: getPlugins()) { + if(visibleWalkersOnly && isHidden(walker)) + continue; + + // Extract the name for the package; if the walker is in the unnamed package, use the empty string + String walkerPackage = walker.getPackage() != null ? walker.getPackage().getName() : ""; + if(!walkersByPackage.containsKey(walkerPackage)) + walkersByPackage.put(walkerPackage,new ArrayList>()); + walkersByPackage.get(walkerPackage).add(walker); + } + return Collections.unmodifiableMap(walkersByPackage); + } + + /** + * Gets the display name for a given package. + * @param packageName Fully qualified package name. + * @return A suitable display name for the package. + */ + public String getPackageDisplayName(String packageName) { + // ...try to compute the override from the text of the package name, while accounting for + // unpackaged walkers. + String displayName = packageName.substring(packageName.lastIndexOf('.')+1); + if (displayName.trim().equals("")) displayName = ""; + return displayName; + } + + /** + * Gets the help text associated with a given package name. + * @param packageName Package for which to search for help text. + * @return Package help text, or "" if none exists. + */ + public String getPackageSummaryText(String packageName) { + String key = String.format("%s.%s",packageName, ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); + if(!helpText.containsKey(key)) + return ""; + return helpText.getString(key); + } + + /** + * Gets the summary help text associated with a given walker type. + * @param walkerType Type of walker for which to search for help text. + * @return Walker summary description, or "" if none exists. 
+ */ + public String getWalkerSummaryText(Class walkerType) { + String walkerSummary = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); + if(!helpText.containsKey(walkerSummary)) + return ""; + return helpText.getString(walkerSummary); + } + + /** + * Gets the summary help text associated with a given walker type. + * @param walker Walker for which to search for help text. + * @return Walker summary description, or "" if none exists. + */ + public String getWalkerSummaryText(Walker walker) { + return getWalkerSummaryText(walker.getClass()); + } + + /** + * Gets the descriptive help text associated with a given walker type. + * @param walkerType Type of walker for which to search for help text. + * @return Walker full description, or "" if none exists. + */ + public String getWalkerDescriptionText(Class walkerType) { + String walkerDescription = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.DESCRIPTION_TAGLET_NAME); + if(!helpText.containsKey(walkerDescription)) + return ""; + return helpText.getString(walkerDescription); + } + + /** + * Gets the descriptive help text associated with a given walker type. + * @param walker Walker for which to search for help text. + * @return Walker full description, or "" if none exists. + */ + public String getWalkerDescriptionText(Walker walker) { + return getWalkerDescriptionText(walker.getClass()); + } + + /** + * Retrieves the walker class given a walker name. + * @param walkerName Name of the walker. + * @return Class representing the walker. + */ + public Class getWalkerClassByName(String walkerName) { + return getPluginsByName().get(walkerName); + } + + /** + * Rather than use the default exception, return a MalformedWalkerArgumentsException. 
+ * @param errorMessage error message from formatErrorMessage() + * @return - A MalformedWalkerArgumentsException with errorMessage + */ + @Override + protected UserException createMalformedArgumentException(final String errorMessage) { + return new UserException.MalformedWalkerArgumentsException(errorMessage); + } + + /** + * Gets the data source for the provided walker. + * @param walkerClass The class of the walker. + * @return Which type of data source to traverse over...reads or reference? + */ + public static DataSource getWalkerDataSource(Class walkerClass) { + By byDataSource = walkerClass.getAnnotation(By.class); + if( byDataSource == null ) + throw new ReviewedGATKException("Unable to find By annotation for walker class " + walkerClass.getName()); + return byDataSource.value(); + } + + /** + * Gets the data source for the provided walker. + * @param walker The walker. + * @return Which type of data source to traverse over...reads or reference? + */ + public static DataSource getWalkerDataSource(Walker walker) { + return getWalkerDataSource(walker.getClass()); + } + + /** + * Get a list of RODs allowed by the walker. + * @param walkerClass Class of the walker to query. + * @return The list of allowed reference meta data. + */ + public static List getAllowsMetaData(Class walkerClass) { + return Collections.emptyList(); + } + + /** + * Determine whether the given walker supports the given data source. + * @param walkerClass Class of the walker to query. + * @param dataSource Source to check for . + * @return True if the walker forbids this data type. False otherwise. + */ + public static boolean isAllowed(Class walkerClass, DataSource dataSource) { + Allows allowsDataSource = getWalkerAllowed(walkerClass); + + // Allows is less restrictive than requires. If an allows + // clause is not specified, any kind of data is allowed. 
+ if( allowsDataSource == null ) + return true; + + return Arrays.asList(allowsDataSource.value()).contains(dataSource); + } + + /** + * Determine whether the given walker supports the given data source. + * @param walker Walker to query. + * @param dataSource Source to check for . + * @return True if the walker forbids this data type. False otherwise. + */ + public static boolean isAllowed(Walker walker, DataSource dataSource) { + return isAllowed(walker.getClass(), dataSource); + } + + /** + * Determine whether the given walker supports the given reference ordered data. + * @param walkerClass Class of the walker to query. + * @param rod Source to check. + * @return True if the walker forbids this data type. False otherwise. + */ + public static boolean isAllowed(Class walkerClass, ReferenceOrderedDataSource rod) { + return true; + } + + /** + * Determine whether the given walker supports the given reference ordered data. + * @param walker Walker to query. + * @param rod Source to check. + * @return True if the walker forbids this data type. False otherwise. + */ + public static boolean isAllowed(Walker walker, ReferenceOrderedDataSource rod) { + return isAllowed(walker.getClass(), rod); + } + + /** + * Determine whether the given walker requires the given data source. + * @param walkerClass Class of the walker to query. + * @param dataSource Source to check for. + * @return True if the walker allows this data type. False otherwise. + */ + public static boolean isRequired(Class walkerClass, DataSource dataSource) { + Requires requiresDataSource = getWalkerRequirements(walkerClass); + return Arrays.asList(requiresDataSource.value()).contains(dataSource); + } + + /** + * Determine whether the given walker requires the given data source. + * @param walker Walker to query. + * @param dataSource Source to check for. + * @return True if the walker allows this data type. False otherwise. 
+ */ + public static boolean isRequired(Walker walker, DataSource dataSource) { + return isRequired(walker.getClass(), dataSource); + } + + /** + * Get a list of RODs required by the walker. + * @param walkerClass Class of the walker to query. + * @return The list of required reference meta data. + */ + public static List getRequiredMetaData(Class walkerClass) { + return Collections.emptyList(); + } + + /** + * Get a list of RODs required by the walker. + * @param walker Walker to query. + * @return The list of required reference meta data. + */ + public static List getRequiredMetaData(Walker walker) { + return getRequiredMetaData(walker.getClass()); + } + + /** + * Reports whether this walker type is hidden -- in other words, whether it'll appear in the help output. + * @param walkerType Class to test for visibility. + * @return True if the walker should be hidden. False otherwise. + */ + public static boolean isHidden(Class walkerType) { + return walkerType.isAnnotationPresent(Hidden.class); + } + + /** + * Extracts filters that the walker has requested be run on the dataset. + * @param walkerClass Class of the walker to inspect for filtering requests. + * @param filterManager Manages the creation of filters. + * @return A non-empty list of filters to apply to the reads. + */ + public static List getReadFilters(Class walkerClass, FilterManager filterManager) { + List filters = new ArrayList(); + for(Class filterType: getReadFilterTypes(walkerClass)) + filters.add(filterManager.createFilterByType(filterType)); + return filters; + } + + /** + * Extracts filters that the walker has requested be run on the dataset. + * @param walker Walker to inspect for filtering requests. + * @param filterManager Manages the creation of filters. + * @return A non-empty list of filters to apply to the reads. 
+ */ + public static List getReadFilters(Walker walker, FilterManager filterManager) { + return getReadFilters(walker.getClass(), filterManager); + } + + /** + * Gets the type of downsampling method requested by the walker. If an alternative + * downsampling method is specified on the command-line, the command-line version will + * be used instead. + * @param walker The walker to interrogate. + * @return The downsampling method, as specified by the walker. Null if none exists. + */ + public static DownsamplingMethod getDownsamplingMethod( Walker walker ) { + return getDownsamplingMethod(walker.getClass()); + } + + /** + * Gets the type of downsampling method requested by the walker. If an alternative + * downsampling method is specified on the command-line, the command-line version will + * be used instead. + * @param walkerClass The class of the walker to interrogate. + * @return The downsampling method, as specified by the walker. Null if none exists. + */ + public static DownsamplingMethod getDownsamplingMethod( Class walkerClass ) { + DownsamplingMethod downsamplingMethod = null; + + if( walkerClass.isAnnotationPresent(Downsample.class) ) { + Downsample downsampleParameters = walkerClass.getAnnotation(Downsample.class); + DownsampleType type = downsampleParameters.by(); + Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; + Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; + downsamplingMethod = new DownsamplingMethod(type, toCoverage, toFraction); + } + + return downsamplingMethod; + } + + public static T getWalkerAnnotation(final Walker walker, final Class clazz) { + return walker.getClass().getAnnotation(clazz); + } + + public static ReadTransformer.ApplicationTime getBAQApplicationTime(Walker walker) { + return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime(); + } + + /** + * Create a name for this type of walker. 
+ * + * @param walkerType The type of walker. + * @return A name for this type of walker. + */ + @Override + public String getName(Class walkerType) { + String walkerName = ""; + + if (walkerType.getAnnotation(WalkerName.class) != null) + walkerName = ((WalkerName)walkerType.getAnnotation(WalkerName.class)).value().trim(); + else + walkerName = super.getName(walkerType); + + return walkerName; + } + + /** + * Utility to get the requires attribute from the walker. + * Throws an exception if requirements are missing. + * @param walkerClass Class of the walker to query for required data. + * @return Required data attribute. + */ + private static Requires getWalkerRequirements(Class walkerClass) { + Requires requiresDataSource = walkerClass.getAnnotation(Requires.class); + if( requiresDataSource == null ) + throw new ReviewedGATKException( "Unable to find data types required by walker class " + walkerClass.getName()); + return requiresDataSource; + } + + /** + * Utility to get the requires attribute from the walker. + * Throws an exception if requirements are missing. + * @param walker Walker to query for required data. + * @return Required data attribute. + */ + private static Requires getWalkerRequirements(Walker walker) { + return getWalkerRequirements(walker.getClass()); + } + + /** + * Utility to get the forbidden attribute from the walker. + * @param walkerClass Class of the walker to query for required data. + * @return Required data attribute. Null if forbidden info isn't present. + */ + private static Allows getWalkerAllowed(Class walkerClass) { + Allows allowsDataSource = walkerClass.getAnnotation(Allows.class); + return allowsDataSource; + } + + /** + * Utility to get the forbidden attribute from the walker. + * @param walker Walker to query for required data. + * @return Required data attribute. Null if forbidden info isn't present. 
+ */ + private static Allows getWalkerAllowed(Walker walker) { + return getWalkerAllowed(walker.getClass()); + } + + /** + * Gets the list of filtering classes specified as walker annotations. + * @param walkerClass Class of the walker to inspect. + * @return An array of types extending from SamRecordFilter. Will never be null. + */ + public static Collection> getReadFilterTypes(Class walkerClass) { + List> filterTypes = new ArrayList>(); + while(walkerClass != null) { + if(walkerClass.isAnnotationPresent(ReadFilters.class)) { + for ( Class c : walkerClass.getAnnotation(ReadFilters.class).value() ) { + if( !filterTypes.contains(c) ) + filterTypes.add(c); + } + } + walkerClass = walkerClass.getSuperclass(); + } + return filterTypes; + } + + /** + * Gets the list of filtering classes specified as walker annotations. + * @param walker The walker to inspect. + * @return An array of types extending from SamRecordFilter. Will never be null. + */ + public static Collection> getReadFilterTypes(Walker walker) { + return getReadFilterTypes(walker.getClass()); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/Aligner.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/Aligner.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/Aligner.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/Aligner.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/Alignment.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/Alignment.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/Alignment.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/Alignment.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAAligner.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAAligner.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAAligner.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAAligner.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAConfiguration.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAConfiguration.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAConfiguration.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWAConfiguration.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWTFiles.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWTFiles.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWTFiles.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/BWTFiles.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignerTestHarness.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentMatchSequence.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentMatchSequence.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentMatchSequence.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentMatchSequence.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentState.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentState.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentState.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/AlignmentState.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAAlignment.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAAlignment.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAAlignment.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAAlignment.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAJavaAligner.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAJavaAligner.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAJavaAligner.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/BWAJavaAligner.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/LowerBound.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/LowerBound.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/LowerBound.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/bwa/java/LowerBound.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/AMBWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/AMBWriter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/AMBWriter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/AMBWriter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/ANNWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/ANNWriter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/ANNWriter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/ANNWriter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWT.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWT.java similarity index 100% rename 
from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWT.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWT.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTReader.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTReader.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTReader.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTReader.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTSupplementaryFileGenerator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTSupplementaryFileGenerator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTSupplementaryFileGenerator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTSupplementaryFileGenerator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTWriter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTWriter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/BWTWriter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Bases.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Bases.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Bases.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Bases.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Counts.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Counts.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Counts.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/Counts.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/CreateBWTFromReference.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/CreateBWTFromReference.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/CreateBWTFromReference.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/CreateBWTFromReference.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SequenceBlock.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SequenceBlock.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SequenceBlock.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SequenceBlock.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArray.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArray.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArray.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArray.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayReader.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayReader.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayReader.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayReader.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayWriter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayWriter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/bwt/SuffixArrayWriter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedInputStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedInputStream.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedInputStream.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedInputStream.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedOutputStream.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedOutputStream.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedOutputStream.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/BasePackedOutputStream.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/CreatePACFromReference.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/CreatePACFromReference.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/CreatePACFromReference.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/CreatePACFromReference.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/PackUtils.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/PackUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/PackUtils.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/PackUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedInputStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedInputStream.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedInputStream.java rename to 
public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedInputStream.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedOutputStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedOutputStream.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedOutputStream.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/alignment/reference/packing/UnsignedIntPackedOutputStream.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/DbsnpArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/DbsnpArgumentCollection.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/DbsnpArgumentCollection.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/DbsnpArgumentCollection.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java new file mode 100644 index 000000000..575c195b4 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java @@ -0,0 +1,629 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to 
permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.arguments; + +import htsjdk.samtools.ValidationStringency; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; +import org.broadinstitute.gatk.engine.samples.PedigreeValidationType; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.baq.BAQ; +import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; +import org.broadinstitute.gatk.engine.GATKVCFUtils; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * @author aaron + * @version 1.0 + */ +public class GATKArgumentCollection { + + /** the constructor */ + public GATKArgumentCollection() { + } + + // parameters and their defaults + /** + * An input file containing sequence data mapped to a reference, in SAM or BAM format, or a text file containing a + * list of input files (with extension .list). 
Note that the GATK requires an accompanying index for each SAM or + * BAM file. Please see our online documentation for more details on input formatting requirements. + */ + @Input(fullName = "input_file", shortName = "I", doc = "Input file containing sequence data (SAM or BAM)", required = false) + public List samFiles = new ArrayList<>(); + + @Hidden + @Argument(fullName = "showFullBamList",doc="Emit a log entry (level INFO) containing the full list of sequence data files to be included in the analysis (including files inside .bam.list files).") + public Boolean showFullBamList = false; + + @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false, minValue = 0) + public Integer readBufferSize = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // GATKRunReport options + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * By default, GATK generates a run report that is uploaded to a cloud-based service. This report contains basic + * statistics about the run (which tool was used, whether the run was successful etc.) that help us for debugging + * and development. Up to version 3.2-2 the run report contains a record of the username and hostname associated + * with the run, but it does **NOT** contain any information that could be used to identify patient data. + * Nevertheless, if your data is subject to stringent confidentiality clauses (no outside communication) or if your + * run environment is not connected to the internet, you can disable the reporting system by seeting this option to + * "NO_ET". You will also need to request a key using the online request form on our website (se FAQs). 
+ */ + @Argument(fullName = "phone_home", shortName = "et", doc="Run reporting mode", required = false) + public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.AWS; + /** + * Please see the "phone_home" argument below and the online documentation FAQs for more details on the key system + * and how to request a key. + */ + @Argument(fullName = "gatk_key", shortName = "K", doc="GATK key file required to run with -et NO_ET", required = false) + public File gatkKeyFile = null; + + /** + * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary tag that can be + * used to group together runs during later analysis. One use of this capability is to tag runs as GATK + * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. + * + * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find + * meaningful. + */ + @Argument(fullName = "tag", shortName = "tag", doc="Tag to identify this GATK run as part of a group of runs", required = false) + public String tag = "NA"; + + // -------------------------------------------------------------------------------------------------------------- + // + // General features + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Reads that fail the specified filters will not be used in the analysis. Multiple filters can be specified separately, + * e.g. you can do -rf MalformedRead -rf BadCigar and so on. Available read filters are listed in the online tool + * documentation. Note that the read name format is e.g. MalformedReadFilter, but at the command line the filter + * name should be given without the Filter suffix; e.g. -rf MalformedRead (NOT -rf MalformedReadFilter, which is not + * recognized by the program). 
Note also that some read filters are applied by default for some analysis tools; this + * is specified in each tool's documentation. The default filters cannot be disabled. + */ + @Argument(fullName = "read_filter", shortName = "rf", doc = "Filters to apply to reads before analysis", required = false) + public final List readFilters = new ArrayList<>(); + + @ArgumentCollection + public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); + /** + * The reference genome against which the sequence data was mapped. The GATK requires an index file and a dictionary + * file accompanying the reference (please see the online documentation FAQs for more details on these files). Although + * this argument is indicated as being optional, almost all GATK tools require a reference in order to run. + * Note also that while GATK can in theory process genomes from any organism with any number of chromosomes or contigs, + * it is not designed to process draft genome assemblies and performance will decrease as the number of contigs in + * the reference increases. We strongly discourage the use of unfinished genome assemblies containing more than a few + * hundred contigs. Contig numbers in the thousands will most probably cause memory-related crashes. + */ + @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) + public File referenceFile = null; + /** + * If this flag is enabled, the random numbers generated will be different in every run, causing GATK to behave non-deterministically. + */ + @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Use a non-deterministic random seed", required = false) + public boolean nonDeterministicRandomSeed = false; + /** + * To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator. 
+ */ + @Hidden + @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests.") + public boolean disableDithering = false; + /** + * This will truncate the run but without exiting with a failure. By default the value is interpreted in minutes, but this can be changed with the maxRuntimeUnits argument. + */ + @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="Stop execution cleanly as soon as maxRuntime has been reached", required = false) + public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT; + + @Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="Unit of time used by maxRuntime", required = false) + public TimeUnit maxRuntimeUnits = TimeUnit.MINUTES; + + // -------------------------------------------------------------------------------------------------------------- + // + // Downsampling Arguments + // + // -------------------------------------------------------------------------------------------------------------- + /** + * There are several ways to downsample reads, i.e. to removed reads from the pile of reads that will be used for analysis. + * See the documentation of the individual downsampling options for details on how they work. Note that Many GATK tools + * specify a default downsampling type and target, but this behavior can be overridden from command line using the + * downsampling arguments. + */ + @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of read downsampling to employ at a given locus", required = false) + public DownsampleType downsamplingType = null; + /** + * Reads will be downsampled so the specified fraction remains; e.g. if you specify -dfrac 0.25, three-quarters of + * the reads will be removed, and the remaining one quarter will be used in the analysis. This method of downsampling + * is truly unbiased and random. 
It is typically used to simulate the effect of generating different amounts of + * sequence data for a given sample. For example, you can use this in a pilot experiment to evaluate how much target + * coverage you need to aim for in order to obtain enough coverage in all loci of interest. + */ + @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction of reads to downsample to", required = false, minValue = 0.0, maxValue = 1.0) + public Double downsampleFraction = null; + + /** + * The principle of this downsampling type is to downsample reads to a given capping threshold coverage. Its purpose is to + * get rid of excessive coverage, because above a certain depth, having additional data is not informative and imposes + * unreasonable computational costs. The downsampling process takes two different forms depending on the type of + * analysis it is used with. + * + * For locus-based traversals (LocusWalkers like UnifiedGenotyper and ActiveRegionWalkers like HaplotypeCaller), + * downsample_to_coverage controls the maximum depth of coverage at each locus. For read-based traversals + * (ReadWalkers like BaseRecalibrator), it controls the maximum number of reads sharing the same alignment start + * position. For ReadWalkers you will typically need to use much lower dcov values than you would with LocusWalkers + * to see an effect. Note that this downsampling option does not produce an unbiased random sampling from all available + * reads at each locus: instead, the primary goal of the to-coverage downsampler is to maintain an even representation + * of reads from all alignment start positions when removing excess coverage. For a truly unbiased random sampling of + * reads, use -dfrac instead. Also note that the coverage target is an approximate goal that is not guaranteed to be + * met exactly: the downsampling algorithm will under some circumstances retain slightly more or less coverage than + * requested. 
+ */ + @Argument(fullName = "downsample_to_coverage", shortName = "dcov", + doc = "Target coverage threshold for downsampling to coverage", + required = false, minValue = 0) + public Integer downsampleCoverage = null; + + /** + * Gets the downsampling method explicitly specified by the user. If the user didn't specify + * a default downsampling mechanism, return the default. + * @return The explicitly specified downsampling mechanism, or the default if none exists. + */ + public DownsamplingMethod getDownsamplingMethod() { + if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) + return null; + + return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction); + } + + /** + * Set the downsampling method stored in the argument collection so that it is read back out when interrogating the command line arguments. + * @param method The downsampling mechanism. + */ + public void setDownsamplingMethod(DownsamplingMethod method) { + if (method == null) + throw new IllegalArgumentException("method is null"); + + downsamplingType = method.type; + downsampleCoverage = method.toCoverage; + downsampleFraction = method.toFraction; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // BAQ arguments + // + // -------------------------------------------------------------------------------------------------------------- + @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) + public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF; + /** + * Phred-scaled gap open penalty for BAQ calculation. Although the default value is 40, a value of 30 may be better for whole genome call sets. 
+ */ + @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0) + public double BAQGOP = BAQ.DEFAULT_GOP; + + // -------------------------------------------------------------------------------------------------------------- + // + // refactor NDN cigar string arguments + // + // -------------------------------------------------------------------------------------------------------------- + /** + * This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in + * a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptoms. + * You should only use this if you know that your reads have that problem. + */ + @Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", required = false) + public boolean REFACTOR_NDN_CIGAR_READS = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // quality encoding checking arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * By default the GATK assumes that base quality scores start at Q0 == ASCII 33 according to the SAM specification. + * However, encoding in some datasets (especially older Illumina ones) starts at Q64. This argument will fix the + * encodings on the fly (as the data is read in) by subtracting 31 from every quality score. Note that this argument should + * NEVER be used by default; you should only use it when you have confirmed that the quality scores in your data are + * not in the correct encoding. 
+ */ + @Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false) + public boolean FIX_MISENCODED_QUALS = false; + /** + * This flag tells GATK to ignore warnings when encountering base qualities that are too high and that seemingly + * indicate a problem with the base quality encoding of the BAM file. You should only use this if you really know + * what you are doing; otherwise you could seriously mess up your data and ruin your analysis. + */ + @Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Ignore warnings about base quality score encoding", required = false) + public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false; + /** + * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which + * are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ + * tag is present for a read, the standard qual score will be used. + */ + @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "Use the base quality scores from the OQ tag", required=false) + public Boolean useOriginalBaseQualities = false; + /** + * If reads are missing some or all base quality scores, this value will be used for all base quality scores. + * By default this is set to -1 to disable default base quality assignment. 
+ */ + @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "Assign a default base quality", required=false, minValue = 0, maxValue = Byte.MAX_VALUE) + public byte defaultBaseQualities = -1; + + // -------------------------------------------------------------------------------------------------------------- + // + // performance log arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * The file name for the GATK performance log output, or null if you don't want to generate the + * detailed performance logging table. This table is suitable for importing into R or any + * other analysis software that can read tsv files. + */ + @Argument(fullName = "performanceLog", shortName="PF", doc="Write GATK runtime performance log to this file", required = false) + public File performanceLog = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // BQSR arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Enables on-the-fly recalibrate of base qualities, intended primarily for use with BaseRecalibrator and PrintReads + * (see Best Practices workflow documentation). The covariates tables are produced by the BaseRecalibrator tool. + * Please be aware that you should only run recalibration with the covariates file created on the same input bam(s). + */ + @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Input covariates table file for on-the-fly base quality score recalibration") + public File BQSR_RECAL_FILE = null; + + /** + * Turns on the base quantization module. It requires a recalibration report (-BQSR). + * + * A value of 0 here means "do not quantize". + * Any value greater than zero will be used to recalculate the quantization using that many levels. 
+ * Negative values mean that we should quantize using the recalibration report's quantization level. + */ + @Hidden + @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) + public int quantizationLevels = 0; + + /** + * Turns off printing of the base insertion and base deletion tags when using the -BQSR argument. Only the base substitution qualities will be produced. + */ + @Argument(fullName="disable_indel_quals", shortName = "DIQ", doc = "Disable printing of base insertion and deletion tags (with -BQSR)", required=false) + public boolean disableIndelQuals = false; + + /** + * By default, the OQ tag in not emitted when using the -BQSR argument. Use this flag to include OQ tags in the output BAM file. + * Note that this may results in significant file size increase. + */ + @Argument(fullName="emit_original_quals", shortName = "EOQ", doc = "Emit the OQ tag with the original base qualities (with -BQSR)", required=false) + public boolean emitOriginalQuals = false; + + /** + * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in the recalibrated BAM file. + * In general it's unsafe to change qualities scores below < 6, since base callers use these values to indicate random or bad bases. + * For example, Illumina writes Q2 bases when the machine has really gone wrong. This would be fine in and of itself, + * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, + * your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream. 
+ */ + @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Don't recalibrate bases with quality scores less than this threshold (with -BQSR)", required = false, minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE) + public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE; + /** + * If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score. + */ + @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "Global Qscore Bayesian prior to use for BQSR", required = false) + public double globalQScorePrior = -1.0; + + + // -------------------------------------------------------------------------------------------------------------- + // + // Other utility arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Keep in mind that if you set this to LENIENT, we may refuse to provide you with support if anything goes wrong. + */ + @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false) + public ValidationStringency strictnessLevel = ValidationStringency.SILENT; + /** + * Some tools keep program records in the SAM header by default. Use this argument to override that behavior and discard program records for the SAM header. + */ + @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Remove program records from the SAM header", required = false) + public boolean removeProgramRecords = false; + /** + * Some tools discard program records from the SAM header by default. Use this argument to override that behavior and keep program records in the SAM header. 
+ */ + @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Keep program records in the SAM header", required = false) + public boolean keepProgramRecords = false; + + /** + * On-the-fly sample renaming works only with single-sample BAM and VCF files. Each line of the mapping file must + * contain the absolute path to a BAM or VCF file, followed by whitespace, followed by the new sample name for that + * BAM or VCF file. The sample name may contain non-tab whitespace, but leading or trailing whitespace will be + * ignored. The engine will verify at runtime that each BAM/VCF targeted for sample renaming has only a single + * sample specified in its header (though, in the case of BAM files, there may be multiple read groups for that + * sample). + */ + @Advanced + @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file", required = false) + public File sampleRenameMappingFile = null; + + /** + * For expert users only who know what they are doing. We do not support usage of this argument, so we may refuse to help you if you use it and something goes wrong. The one exception to this rule is ALLOW_N_CIGAR_READS, which is necessary for RNAseq analysis. + */ + @Argument(fullName = "unsafe", shortName = "U", doc = "Enable unsafe operations: nothing will be checked at runtime", required = false) + public ValidationExclusion.TYPE unsafe; + /** + * UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking + * when reading VCFs and other rods and an index isn't present or is out-of-date. 
The file locking necessary for auto index + * generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it + * for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general + * because it allows reading from index files without first acquiring a lock. + */ + @Hidden + @Advanced + @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", + doc = "Disable both auto-generation of index files and index file locking", + required = false) + public boolean disableAutoIndexCreationAndLockingWhenReadingRods = false; + + @Hidden + @Argument(fullName = "no_cmdline_in_header", shortName = "no_cmdline_in_header", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", + required = false) + public boolean disableCommandLineInVCF = false; + + @Argument(fullName = "sites_only", shortName = "sites_only", doc = "Just output sites without genotypes (i.e. only the first 8 columns of the VCF)", + required = false) + public boolean sitesOnlyVCF = false; + + /** + *

The VCF specification permits missing records to be dropped from the end of FORMAT fields, so long as GT is always output. + * This option prevents GATK from performing that trimming.

+ * + *

For example, given a FORMAT of

GT:AD:DP:PL
, GATK will by default emit
./.
for a variant with + * no reads present (ie, the AD, DP, and PL fields are trimmed). If you specify -writeFullFormat, this record + * would be emitted as
./.:.:.:.

+ */ + @Argument(fullName = "never_trim_vcf_format_field", shortName = "writeFullFormat", doc = "Always output all the records in VCF FORMAT fields, even if some are missing", + required = false) + public boolean neverTrimVCFFormatField = false; + + @Hidden + @Argument(fullName = "bcf", shortName = "bcf", doc = "Force BCF output, regardless of the file's extension", + required = false) + public boolean forceBCFOutput = false; + + @Advanced + @Argument(fullName = "bam_compression", shortName = "compress", doc = "Compression level to use for writing BAM files (0 - 9, higher is more compressed)", + minValue = 0, maxValue = 9, required = false) + public Integer bamCompression = null; + + @Advanced + @Argument(fullName = "simplifyBAM", shortName = "simplifyBAM", + doc = "If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well stripping all extended tags from the kept reads except the read group identifier", + required = false) + public boolean simplifyBAM = false; + + @Argument(fullName = "disable_bam_indexing", doc = "Turn off on-the-fly creation of indices for output BAM files.", + required = false) + public boolean disableBAMIndexing = false; + + @Argument(fullName = "generate_md5", doc = "Enable on-the-fly creation of md5s for output BAM files.", + required = false) + public boolean enableBAMmd5 = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // Multi-threading arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Data threads contains N cpu threads per data thread, and act as completely data parallel processing, increasing + * the memory usage of GATK by M data threads. Data threads generally scale extremely effectively, up to 24 cores. 
+ * See online documentation FAQs for more information. + */ + @Argument(fullName = "num_threads", shortName = "nt", doc = "Number of data threads to allocate to this analysis", required = false, minValue = 1) + public Integer numberOfDataThreads = 1; + + /** + * Each CPU thread operates the map cycle independently, but may run into earlier scaling problems with IO than + * data threads. Has the benefit of not requiring X times as much memory per thread as data threads do, but rather + * only a constant overhead. See online documentation FAQs for more information. + */ + @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="Number of CPU threads to allocate per data thread", required = false, minValue = 1) + public int numberOfCPUThreadsPerDataThread = 1; + + @Argument(fullName="num_io_threads", shortName = "nit", doc="Number of given threads to allocate to IO", required = false, minValue = 0) + @Hidden + public int numberOfIOThreads = 0; + + /** + * Enable GATK to monitor its own threading efficiency, at an itsy-bitsy tiny + * cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for + * debugging purposes. Note that this argument is not compatible with -nt, it only works with -nct. + */ + @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable threading efficiency monitoring", required = false) + public Boolean monitorThreadEfficiency = false; + + @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="Total number of BAM file handles to keep open simultaneously", required=false, minValue = 1) + public Integer numberOfBAMFileHandles = null; + /** + * This will filter out read groups matching : (e.g. SM:sample1) or a .txt file containing the filter strings one per line. 
+ */ + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Exclude read groups based on tags", required = false) + public List readGroupBlackList = null; + + // -------------------------------------------------------------------------------------------------------------- + // + // PED (pedigree) support + // + // -------------------------------------------------------------------------------------------------------------- + + /** + *

Reads PED file-formatted tabular text files describing meta-data about the samples being + * processed in the GATK.

+ * + * + * + *

The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

+ * + *
    + *
  • Family ID
  • + *
  • Individual ID
  • + *
  • Paternal ID
  • + *
  • Maternal ID
  • + *
  • Sex (1=male; 2=female; other=unknown)
  • + *
  • Phenotype
  • + *
+ * + *

The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. + * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a + * quantitative trait or an affection status column: GATK will automatically detect which type + * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

+ * + *

If an individual's sex is unknown, then any character other than 1 or 2 can be used.

+ * + *

You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that + * line will be ignored. Do not start any family IDs with this character therefore.

+ * + *

Affection status should be coded:

+ * + *
    + *
  • -9 missing
  • + *
  • 0 missing
  • + *
  • 1 unaffected
  • + *
  • 2 affected
  • + *
+ * + *

If any value outside of -9,0,1,2 is detected than the samples are assumed + * to phenotype values are interpreted as string phenotype values. In this case -9 uniquely + * represents the missing value.

+ * + *

Genotypes (column 7 onwards) cannot be specified to the GATK.

+ * + *

For example, here are two individuals (one row = one person):

+ * + *
+     *   FAM001  1  0 0  1  2
+     *   FAM001  2  0 0  1  2
+     * 
+ * + *

Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to + * tell the GATK PED parser that the corresponding fields are missing from the ped file.

+ * + *

Note that most GATK walkers do not use pedigree information. Walkers that require pedigree + * data should clearly indicate so in their arguments and will throw errors if required pedigree + * information is missing.

+ */ + @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) + public List pedigreeFiles = Collections.emptyList(); + + /** + * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more + * valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString + * as -ped supports + */ + @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) + public List pedigreeStrings = Collections.emptyList(); + + /** + * How strict should we be in parsing the PED files? + */ + @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="Validation strictness for pedigree information",required=false) + public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; + + // -------------------------------------------------------------------------------------------------------------- + // + // BAM indexing and sharding arguments + // + // -------------------------------------------------------------------------------------------------------------- + /** + * NO INTEGRATION TESTS are available. Use at your own risk. + */ + @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM",required=false) + @Hidden + public boolean allowIntervalsWithUnindexedBAM = false; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing BCF2 + // + // -------------------------------------------------------------------------------------------------------------- + /** + * If provided, whenever we create a VCFWriter we will also write out a BCF file alongside it, for testing purposes. 
+ */ + @Argument(fullName="generateShadowBCF",shortName = "generateShadowBCF",doc="Write a BCF copy of the output VCF",required=false) + @Hidden + public boolean generateShadowBCF = false; + // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed + + // -------------------------------------------------------------------------------------------------------------- + // + // VCF/BCF index parameters + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Specify the Tribble indexing strategy to use for VCFs. + * + * LINEAR creates a LinearIndex with bins of equal width, specified by the Bin Width parameter + * INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter + * DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) + * DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) + */ + @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="Type of IndexCreator to use for VCF/BCF indices",required=false) + @Advanced + public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; + /** + * This is either the bin width or the number of features per bin, depending on the indexing strategy + */ + @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="Parameter to pass to the VCF/BCF IndexCreator",required=false) + @Advanced + public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; +} + diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardVariantContextInputArgumentCollection.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardVariantContextInputArgumentCollection.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardVariantContextInputArgumentCollection.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/StandardVariantContextInputArgumentCollection.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/CryptUtils.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/CryptUtils.java new file mode 100644 index 000000000..cbbbe47e1 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/CryptUtils.java @@ -0,0 +1,391 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.crypt; + +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.io.IOUtils; + +import javax.crypto.Cipher; +import java.io.File; +import java.io.InputStream; +import java.security.*; +import java.security.spec.InvalidKeySpecException; +import java.security.spec.KeySpec; +import java.security.spec.PKCS8EncodedKeySpec; +import java.security.spec.X509EncodedKeySpec; +import java.util.Arrays; + +/** + * A set of cryptographic utility methods and constants. + * + * Contains methods to: + * + * -Create a public/private key pair + * -Read and write public/private keys to/from files/streams + * -Load the GATK master private/public keys + * -Encrypt/decrypt data + * + * Also contains constants that control the cryptographic defaults + * throughout the GATK. + * + * @author David Roazen + */ +public class CryptUtils { + + // --------------------------------------------------------------------------------- + // Constants (these control the default cryptographic settings throughout the GATK): + // --------------------------------------------------------------------------------- + + /** + * Default key length in bits of newly-created keys. 2048 bits provides a good balance between + * security and speed. + */ + public static final int DEFAULT_KEY_LENGTH = 2048; + + /** + * Default encryption algorithm to use, when none is specified. + */ + public static final String DEFAULT_ENCRYPTION_ALGORITHM = "RSA"; + + /** + * Default random-number generation algorithm to use, when none is specified. + */ + public static final String DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM = "SHA1PRNG"; + + /** + * Name of the public key file distributed with the GATK. This file is packaged + * into the GATK jar, and we use the system ClassLoader to find it. 
+ */ + public static final String GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME = "GATK_public.key"; + + /** + * Location of the master copy of the GATK private key. + */ + public static final String GATK_MASTER_PRIVATE_KEY_FILE = "/humgen/gsa-hpprojects/GATK/data/gatk_master_keys/GATK_private.key"; + + /** + * Location of the master copy of the GATK public key. This file should always be the same as + * the public key file distributed with the GATK (and there are automated tests to ensure that it is). + */ + public static final String GATK_MASTER_PUBLIC_KEY_FILE = "/humgen/gsa-hpprojects/GATK/data/gatk_master_keys/GATK_public.key"; + + /** + * Directory where generated GATK user keys are stored. See the GATKKey class for more information. + */ + public static final String GATK_USER_KEY_DIRECTORY = "/humgen/gsa-hpprojects/GATK/data/gatk_user_keys/"; + + + // ----------------------- + // Utility Methods: + // ----------------------- + + /** + * Generate a new public/private key pair using the default encryption settings defined above. + * + * @return A new public/private key pair created using the default settings + */ + public static KeyPair generateKeyPair() { + return generateKeyPair(DEFAULT_KEY_LENGTH, DEFAULT_ENCRYPTION_ALGORITHM, DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + } + + /** + * Generate a new public/private key pair using custom encryption settings. 
+ * + * @param keyLength Length of the key in bits + * @param encryptionAlgorithm Encryption algorithm to use + * @param randNumberAlgorithm Random-number generation algorithm to use + * @return A new public/private key pair, created according to the specified parameters + */ + public static KeyPair generateKeyPair( int keyLength, String encryptionAlgorithm, String randNumberAlgorithm ) { + try { + KeyPairGenerator keyGen = KeyPairGenerator.getInstance(encryptionAlgorithm); + SecureRandom randomnessSource = createRandomnessSource(randNumberAlgorithm); + + keyGen.initialize(keyLength, randomnessSource); + return keyGen.generateKeyPair(); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( Exception e ) { + throw new ReviewedGATKException("Error while generating key pair", e); + } + } + + /** + * Create a source of randomness using the default random-number generation algorithm. + * + * @return A randomness source that uses the default algorithm + */ + public static SecureRandom createRandomnessSource() { + return createRandomnessSource(DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + } + + /** + * Create a source of randomness using a custom random-number generation algorithm. 
+ * + * @param randAlgorithm The random-number generation algorithm to use + * @return A randomness sources that uses the specified algorithm + */ + public static SecureRandom createRandomnessSource ( String randAlgorithm ) { + try { + return SecureRandom.getInstance(randAlgorithm); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Could not find an implementation of the requested random-number generation algorithm %s", randAlgorithm), e); + } + } + + /** + * Writes a public/private key pair to disk + * + * @param keyPair The key pair we're writing to disk + * @param privateKeyFile Location to write the private key + * @param publicKeyFile Location to write the public key + */ + public static void writeKeyPair ( KeyPair keyPair, File privateKeyFile, File publicKeyFile ) { + writeKey(keyPair.getPrivate(), privateKeyFile); + writeKey(keyPair.getPublic(), publicKeyFile); + } + + /** + * Writes an arbitrary key to disk + * + * @param key The key to write + * @param destination Location to write the key to + */ + public static void writeKey ( Key key, File destination ) { + IOUtils.writeByteArrayToFile(key.getEncoded(), destination); + } + + /** + * Reads in a public key created using the default encryption algorithm from a file. + * + * @param source File containing the public key + * @return The public key read + */ + public static PublicKey readPublicKey ( File source ) { + return decodePublicKey(IOUtils.readFileIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Reads in a public key created using the default encryption algorithm from a stream. + * + * @param source Stream attached to the public key + * @return The public key read + */ + public static PublicKey readPublicKey ( InputStream source ) { + return decodePublicKey(IOUtils.readStreamIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Decodes the raw bytes of a public key into a usable object. 
+ * + * @param rawKey The encoded bytes of a public key as read from, eg., a file. The + * key must be in the standard X.509 format for a public key. + * @param encryptionAlgorithm The encryption algorithm used to create the public key + * @return The public key as a usable object + */ + public static PublicKey decodePublicKey ( byte[] rawKey, String encryptionAlgorithm ) { + try { + KeySpec keySpec = new X509EncodedKeySpec(rawKey); + KeyFactory keyFactory = KeyFactory.getInstance(encryptionAlgorithm); + return keyFactory.generatePublic(keySpec); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( InvalidKeySpecException e ) { + throw new ReviewedGATKException("Unable to use X.509 key specification to decode the given key", e); + } + } + + /** + * Reads in a private key created using the default encryption algorithm from a file. + * + * @param source File containing the private key + * @return The private key read + */ + public static PrivateKey readPrivateKey ( File source ) { + return decodePrivateKey(IOUtils.readFileIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Reads in a private key created using the default encryption algorithm from a stream. + * + * @param source Stream attached to the private key + * @return The private key read + */ + public static PrivateKey readPrivateKey ( InputStream source ) { + return decodePrivateKey(IOUtils.readStreamIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Decodes the raw bytes of a private key into a usable object. + * + * @param rawKey The encoded bytes of a private key as read from, eg., a file. The + * key must be in the standard PKCS #8 format for a private key. 
+ * @param encryptionAlgorithm The encryption algorithm used to create the private key + * @return The private key as a usable object + */ + public static PrivateKey decodePrivateKey ( byte[] rawKey, String encryptionAlgorithm ) { + try { + KeySpec keySpec = new PKCS8EncodedKeySpec(rawKey); + KeyFactory keyFactory = KeyFactory.getInstance(encryptionAlgorithm); + return keyFactory.generatePrivate(keySpec); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( InvalidKeySpecException e ) { + throw new ReviewedGATKException("Unable to use the PKCS #8 key specification to decode the given key", e); + } + } + + /** + * Loads the copy of the GATK public key that is distributed with the GATK. Uses the system + * ClassLoader to locate the public key file, which should be stored at the root of the GATK + * jar file. + * + * @return The GATK public key as a usable object + */ + public static PublicKey loadGATKDistributedPublicKey() { + InputStream publicKeyInputStream = ClassLoader.getSystemResourceAsStream(GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME); + + if ( publicKeyInputStream == null ) { + throw new ReviewedGATKException(String.format("Could not locate the GATK public key %s in the classpath", + GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME)); + } + + return readPublicKey(publicKeyInputStream); + } + + /** + * Loads the master copy of the GATK private key. You must have the appropriate UNIX permissions + * to do this! + * + * @return The GATK master private key as a usable object + */ + public static PrivateKey loadGATKMasterPrivateKey() { + return readPrivateKey(new File(GATK_MASTER_PRIVATE_KEY_FILE)); + } + + /** + * Loads the master copy of the GATK public key. This should always be the same as the + * public key distributed with the GATK returned by loadGATKDistributedPublicKey(). 
+ * + * @return The GATK master public key as a usable object + */ + public static PublicKey loadGATKMasterPublicKey() { + return readPublicKey(new File(GATK_MASTER_PUBLIC_KEY_FILE)); + } + + /** + * Encrypts the given data using the key provided. + * + * @param data The data to encrypt, as a byte array + * @param encryptKey The key with which to encrypt the data + * @return The encrypted version of the provided data + */ + public static byte[] encryptData ( byte[] data, Key encryptKey ) { + return transformDataUsingCipher(data, encryptKey, Cipher.ENCRYPT_MODE); + } + + /** + * Decrypts the given data using the key provided. + * + * @param encryptedData Data to decrypt, as a byte array + * @param decryptKey The key with which to decrypt the data + * @return The decrypted version of the provided data + */ + public static byte[] decryptData ( byte[] encryptedData, Key decryptKey ) { + return transformDataUsingCipher(encryptedData, decryptKey, Cipher.DECRYPT_MODE); + } + + /** + * Helper method for encryption/decryption that takes data and processes it using + * the given key + * + * @param data Data to encrypt/decrypt + * @param key Key to use to encrypt/decrypt the data + * @param cipherMode Specifies whether we are encrypting or decrypting + * @return The encrypted/decrypted data + */ + private static byte[] transformDataUsingCipher ( byte[] data, Key key, int cipherMode ) { + try { + Cipher cipher = Cipher.getInstance(key.getAlgorithm()); + cipher.init(cipherMode, key); + return cipher.doFinal(data); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Could not find an implementation of the requested algorithm %s", + key.getAlgorithm()), e); + } + catch ( InvalidKeyException e ) { + throw new ReviewedGATKException("Key is invalid", e); + } + catch ( GeneralSecurityException e ) { + throw new ReviewedGATKException("Error during encryption", e); + } + } + + /** + * Tests whether the public/private keys provided can each 
decrypt data encrypted by + * the other key -- ie., tests whether these two keys are part of the same public/private + * key pair. + * + * @param privateKey The private key to test + * @param publicKey The public key to test + * @return True if the keys are part of the same key pair and can decrypt each other's + * encrypted data, otherwise false. + */ + public static boolean keysDecryptEachOther ( PrivateKey privateKey, PublicKey publicKey ) { + byte[] plainText = "Test PlainText".getBytes(); + + byte[] dataEncryptedUsingPrivateKey = CryptUtils.encryptData(plainText, privateKey); + byte[] dataEncryptedUsingPublicKey = CryptUtils.encryptData(plainText, publicKey); + + byte[] privateKeyDataDecryptedWithPublicKey = CryptUtils.decryptData(dataEncryptedUsingPrivateKey, publicKey); + byte[] publicKeyDataDecryptedWithPrivateKey = CryptUtils.decryptData(dataEncryptedUsingPublicKey, privateKey); + + // Make sure we actually transformed the data during encryption: + if ( Arrays.equals(plainText, dataEncryptedUsingPrivateKey) || + Arrays.equals(plainText, dataEncryptedUsingPublicKey) || + Arrays.equals(dataEncryptedUsingPrivateKey, dataEncryptedUsingPublicKey) ) { + return false; + } + + // Make sure that we were able to recreate the original plaintext using + // both the public key on the private-key-encrypted data and the private + // key on the public-key-encrypted data: + if ( ! Arrays.equals(plainText, privateKeyDataDecryptedWithPublicKey) || + ! 
Arrays.equals(plainText, publicKeyDataDecryptedWithPrivateKey) ) { + return false; + } + + return true; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/GATKKey.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/GATKKey.java new file mode 100644 index 000000000..42a88b9d0 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/crypt/GATKKey.java @@ -0,0 +1,350 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.crypt; + +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.io.IOUtils; + +import java.io.*; +import java.security.*; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +/** + * Class to represent a GATK user key. + * + * A GATK user key contains an email address and a cryptographic signature. + * The signature is the SHA-1 hash of the email address encrypted using + * the GATK master private key. The GATK master public key (distributed + * with the GATK) is used to decrypt the signature and validate the key + * at the start of each GATK run that requires a key. + * + * Keys are cryptographically secure in that valid keys definitely come + * from us and cannot be fabricated, however nothing prevents keys from + * being shared between users. + * + * GATK user keys have the following on-disk format: + * + * GZIP Container: + * Email address + * NUL byte (delimiter) + * Cryptographic Signature (encrypted SHA-1 hash of email address) + * + * The key data is wrapped within a GZIP container to placate over-zealous + * email filters (since keys must often be emailed) and also to provide an + * additional integrity check via the built-in GZIP CRC. + * + * @author David Roazen + */ +public class GATKKey { + + /** + * Private key used to sign the GATK key. Required only when creating a new + * key from scratch, not when loading an existing key from disk. + */ + private PrivateKey privateKey; + + /** + * Public key used to validate the GATK key. + */ + private PublicKey publicKey; + + /** + * The user's email address, stored within the key and signed. + */ + private String emailAddress; + + /** + * The cryptographic signature of the email address. By default, this is + * the SHA-1 hash of the email address encrypted using the RSA algorithm. 
+ */ + private byte[] signature; + + /** + * The combination of hash/encryption algorithms to use to generate the signature. + * By default this is "SHA1withRSA" + */ + private String signingAlgorithm; + + /** + * Default hash/encryption algorithms to use to sign the key. + */ + public static final String DEFAULT_SIGNING_ALGORITHM = "SHA1withRSA"; + + /** + * Byte value used to separate the email address from its signature in the key file. + */ + public static final byte GATK_KEY_SECTIONAL_DELIMITER = 0; + + + // ----------------------- + // Constructors: + // ----------------------- + + /** + * Constructor to create a new GATK key from scratch using an email address + * and public/private key pair. The private key is used for signing, and the + * public key is used to validate the newly-created key. + * + * @param privateKey Private key used to sign the new GATK key + * @param publicKey Public key used to validate the new GATK key + * @param emailAddress The user's email address, which we will store in the key and sign + */ + public GATKKey ( PrivateKey privateKey, PublicKey publicKey, String emailAddress ) { + this(privateKey, publicKey, emailAddress, DEFAULT_SIGNING_ALGORITHM); + } + + /** + * Constructor to create a new GATK key from scratch using an email address + * and public/private key pair, and additionally specify the signing algorithm + * to use. The private key is used for signing, and the public key is used to + * validate the newly-created key. 
+ * + * @param privateKey Private key used to sign the new GATK key + * @param publicKey Public key used to validate the new GATK key + * @param emailAddress The user's email address, which we will store in the key and sign + * @param signingAlgorithm The combination of hash and encryption algorithms to use to sign the key + */ + public GATKKey ( PrivateKey privateKey, PublicKey publicKey, String emailAddress, String signingAlgorithm ) { + if ( privateKey == null || publicKey == null || emailAddress == null || emailAddress.length() == 0 || signingAlgorithm == null ) { + throw new ReviewedGATKException("Cannot construct GATKKey using null/empty arguments"); + } + + this.privateKey = privateKey; + this.publicKey = publicKey; + this.emailAddress = emailAddress; + this.signingAlgorithm = signingAlgorithm; + + validateEmailAddress(); + generateSignature(); + + if ( ! isValid() ) { + throw new ReviewedGATKException("Newly-generated GATK key fails validation -- this should never happen!"); + } + } + + /** + * Constructor to load an existing GATK key from a file. + * + * During loading, the key file is checked for integrity, but not cryptographic + * validity (which must be done through a subsequent call to isValid()). + * + * @param publicKey Public key that will be used to validate the loaded GATK key + * in subsequent calls to isValid() + * @param keyFile File containing the GATK key to load + */ + public GATKKey ( PublicKey publicKey, File keyFile ) { + this(publicKey, keyFile, DEFAULT_SIGNING_ALGORITHM); + } + + /** + * Constructor to load an existing GATK key from a file, and additionally specify + * the signing algorithm used to sign the key being loaded. + * + * During loading, the key file is checked for integrity, but not cryptographic + * validity (which must be done through a subsequent call to isValid()). 
+ * + * @param publicKey Public key that will be used to validate the loaded GATK key + * in subsequent calls to isValid() + * @param keyFile File containing the GATK key to load + * @param signingAlgorithm The combination of hash and encryption algorithms used to sign the key + */ + public GATKKey ( PublicKey publicKey, File keyFile, String signingAlgorithm ) { + if ( publicKey == null || keyFile == null || signingAlgorithm == null ) { + throw new ReviewedGATKException("Cannot construct GATKKey using null arguments"); + } + + this.publicKey = publicKey; + this.signingAlgorithm = signingAlgorithm; + + readKey(keyFile); + } + + // ----------------------- + // Public API Methods: + // ----------------------- + + /** + * Writes out this key to a file in the format described at the top of this class, + * encapsulating the key within a GZIP container. + * + * @param destination File to write the key to + */ + public void writeKey ( File destination ) { + try { + byte[] keyBytes = marshalKeyData(); + IOUtils.writeByteArrayToStream(keyBytes, new GZIPOutputStream(new FileOutputStream(destination))); + } + catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(destination, e); + } + } + + /** + * Checks whether the signature of this key is cryptographically valid (ie., can be + * decrypted by the public key to produce a valid SHA-1 hash of the email address + * in the key). 
+ * + * @return True if the key's signature passes validation, otherwise false + */ + public boolean isValid() { + try { + Signature sig = Signature.getInstance(signingAlgorithm); + sig.initVerify(publicKey); + sig.update(emailAddress.getBytes()); + return sig.verify(signature); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Signing algorithm %s not found", signingAlgorithm), e); + } + catch ( InvalidKeyException e ) { + // If the GATK public key is invalid, it's likely our problem, not the user's: + throw new ReviewedGATKException(String.format("Public key %s is invalid", publicKey), e); + } + catch ( SignatureException e ) { + throw new UserException.UnreadableKeyException("Signature is invalid or signing algorithm was unable to process the input data", e); + } + } + + // ----------------------- + // Private Helper Methods: + // ----------------------- + + /** + * Helper method that creates a signature for this key using the combination of + * hash/encryption algorithms specified at construction time. + */ + private void generateSignature() { + try { + Signature sig = Signature.getInstance(signingAlgorithm); + sig.initSign(privateKey, CryptUtils.createRandomnessSource()); + sig.update(emailAddress.getBytes()); + signature = sig.sign(); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedGATKException(String.format("Signing algorithm %s not found", signingAlgorithm), e); + } + catch ( InvalidKeyException e ) { + throw new ReviewedGATKException(String.format("Private key %s is invalid", privateKey), e); + } + catch ( SignatureException e ) { + throw new ReviewedGATKException(String.format("Error creating signature for email address %s", emailAddress), e); + } + } + + /** + * Helper method that reads in a GATK key from a file. Should not be called directly -- + * use the appropriate constructor above. 
+ * + * @param source File to read the key from + */ + private void readKey ( File source ) { + try { + byte[] keyBytes = IOUtils.readStreamIntoByteArray(new GZIPInputStream(new FileInputStream(source))); + + // As a sanity check, compare the number of bytes read to the uncompressed file size + // stored in the GZIP ISIZE field. If they don't match, the key must be corrupt: + if ( keyBytes.length != IOUtils.getGZIPFileUncompressedSize(source) ) { + throw new UserException.UnreadableKeyException("Number of bytes read does not match the uncompressed size specified in the GZIP ISIZE field"); + } + + unmarshalKeyData(keyBytes); + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(source, e); + } + catch ( IOException e ) { + throw new UserException.UnreadableKeyException(source, e); + } + catch ( UserException.CouldNotReadInputFile e ) { + throw new UserException.UnreadableKeyException(source, e); + } + } + + /** + * Helper method that assembles the email address and signature into a format + * suitable for writing to disk. + * + * @return The aggregated key data, ready to be written to disk + */ + private byte[] marshalKeyData() { + byte[] emailAddressBytes = emailAddress.getBytes(); + byte[] assembledKey = new byte[emailAddressBytes.length + 1 + signature.length]; + + System.arraycopy(emailAddressBytes, 0, assembledKey, 0, emailAddressBytes.length); + assembledKey[emailAddressBytes.length] = GATK_KEY_SECTIONAL_DELIMITER; + System.arraycopy(signature, 0, assembledKey, emailAddressBytes.length + 1, signature.length); + + return assembledKey; + } + + /** + * Helper method that parses the raw key data from disk into its component + * email address and signature. Performs some basic validation in the process. 
+ * + * @param keyBytes The raw, uncompressed key data read from disk + */ + private void unmarshalKeyData ( byte[] keyBytes ) { + int delimiterPosition = -1; + + for ( int i = 0; i < keyBytes.length; i++ ) { + if ( keyBytes[i] == GATK_KEY_SECTIONAL_DELIMITER ) { + delimiterPosition = i; + break; + } + } + + if ( delimiterPosition == -1 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no sectional delimiter"); + } + else if ( delimiterPosition == 0 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no email address"); + } + else if ( delimiterPosition == keyBytes.length - 1 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no signature"); + } + + byte[] emailAddressBytes = new byte[delimiterPosition]; + System.arraycopy(keyBytes, 0, emailAddressBytes, 0, delimiterPosition); + emailAddress = new String(emailAddressBytes); + + signature = new byte[keyBytes.length - delimiterPosition - 1]; + System.arraycopy(keyBytes, delimiterPosition + 1, signature, 0, keyBytes.length - delimiterPosition - 1); + } + + /** + * Helper method that ensures that the user's email address does not contain the NUL byte, which we + * reserve as a delimiter within each key file. 
+ */ + private void validateEmailAddress() { + for ( byte b : emailAddress.getBytes() ) { + if ( b == GATK_KEY_SECTIONAL_DELIMITER ) { + throw new UserException(String.format("Email address must not contain a byte with value %d", GATK_KEY_SECTIONAL_DELIMITER)); + } + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/package-info.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java new file mode 100644 index 000000000..4bcecbcad --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java @@ -0,0 +1,169 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.engine.iterators.GenomeLocusIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.Collections; +import java.util.List; +import java.util.NoSuchElementException; +/** + * User: hanna + * Date: May 13, 2009 + * Time: 3:32:30 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A LocusView over which the user can iterate. + */ + +public class AllLocusView extends LocusView { + private GenomeLocusIterator locusIterator; + + /** + * Gets the next position in the view: next call to next() will jump there. + * Note that both nextPosition and nextLocus are PRE-read and cached. + */ + private GenomeLoc nextPosition = null; + + /** + * What's the next available context? + */ + private AlignmentContext nextLocus = null; + + /** + * Signal not to advance the iterator because we're currently sitting at the next element. + */ + private boolean atNextElement = false; + + /** + * Create a new queue of locus contexts. 
+ * + * @param provider + */ + public AllLocusView(LocusShardDataProvider provider) { + super(provider); + // Seed the state tracking members with the first possible seek position and the first possible locus context. + locusIterator = new GenomeLocusIterator(genomeLocParser, provider.getLocus()); + } + + public boolean hasNext() { + advance(); + return nextPosition != null; + } + + public AlignmentContext next() { + advance(); + + if (nextPosition == null) + throw new NoSuchElementException("No next is available in the all locus view"); + + // Flag to the iterator that no data is waiting in the queue to be processed. + atNextElement = false; + + AlignmentContext currentLocus; + + // If actual data is present, return it. Otherwise, return empty data. + if (nextLocus != null && nextLocus.getLocation().equals(nextPosition)) + currentLocus = nextLocus; + else + currentLocus = createEmptyLocus(nextPosition); + + return currentLocus; + } + + private void advance() { + // Already at the next element? Don't move forward. + if (atNextElement) + return; + + // Out of elements? + if (nextPosition == null && !locusIterator.hasNext()) + return; + + // If nextLocus has been consumed, clear it out to make room for the next incoming locus. + if (nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) { + nextLocus = null; + + // Determine the next locus. The trick is that we may have more than one alignment context at the same + // reference position (regular base pileup, then extended pileup). If next alignment context (that we just pre-read) + // is still at the current position, we do not increment current position and wait for next call to next() to return + // that context. 
If we know that next context is past the current position, we are done with current + // position + if (hasNextLocus()) { + nextLocus = nextLocus(); + if (nextPosition.equals(nextLocus.getLocation())) { + atNextElement = true; + return; + } + } + } + + // No elements left in queue? Clear out the position state tracker and return. + if (!locusIterator.hasNext()) { + nextPosition = null; + return; + } + + // Actually fill the next position. + nextPosition = locusIterator.next(); + atNextElement = true; + + // Crank the iterator to (if possible) or past the next context. Be careful not to hold a reference to nextLocus + // while using the hasNextLocus() / nextLocus() machinery; this will cause us to use more memory than is optimal. + while (nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) { + nextLocus = null; + if (!hasNextLocus()) + break; + nextLocus = nextLocus(); + } + } + + /** + * Creates a blank locus context at the specified location. + * + * @param site Site at which to create the blank locus context. + * @return empty context. 
+ */ + private final static List EMPTY_PILEUP_READS = Collections.emptyList(); + private final static List EMPTY_PILEUP_OFFSETS = Collections.emptyList(); + private final static List EMPTY_DELETION_STATUS = Collections.emptyList(); + + private AlignmentContext createEmptyLocus(GenomeLoc site) { + return new AlignmentContext(site, new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS)); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java new file mode 100644 index 000000000..777e23cb8 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java @@ -0,0 +1,63 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +/** + * User: hanna + * Date: May 12, 2009 + * Time: 11:24:42 AM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A queue of locus contexts. Provides unidirectional seek. Stripped down + * implementation of java.util.Queue interface. + */ + +public class CoveredLocusView extends LocusView { + /** + * Create a new queue of locus contexts. + * @param provider + */ + public CoveredLocusView(LocusShardDataProvider provider) { + super(provider); + } + + public boolean hasNext() { + return hasNextLocus(); + } + + public AlignmentContext next() { + return nextLocus(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java new file mode 100644 index 000000000..1525c381a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java @@ -0,0 +1,168 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: 
+* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.samtools.util.PeekableIterator; +import org.broadinstitute.gatk.utils.refdata.RODRecordListImpl; +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; + +import java.util.Collection; +import java.util.LinkedList; +import java.util.ListIterator; + +/** + * Key algorithmic helper for ReadBasedReferenceOrderedData + * + * Takes a single iterator of features, and provides a single capability that returns + * the list of RODs that overlap an interval. Allows sequential getOverlapping calls + * from intervals provided that these intervals always have increasing getStart() values. 
+ * + */ +class IntervalOverlappingRODsFromStream { + /** + * Only held for QC purposes + */ + GenomeLoc lastQuery = null; + + private final String name; + private final LinkedList currentFeatures = new LinkedList(); + private final PeekableIterator futureFeatures; + + /** + * Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and + * returns RODRecordLists having name + * + * @param name + * @param futureFeatures + */ + IntervalOverlappingRODsFromStream(final String name, final PeekableIterator futureFeatures) { + if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null"); + + this.name = name; + this.futureFeatures = futureFeatures; + } + + /** + * Get the list of RODs overlapping loc from this stream of RODs. + * + * @param loc the interval to query + * @return a non-null RODRecordList containing the overlapping RODs, which may be empty + */ + @Ensures({"overlaps(loc, result)", + "! futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)", + "result != null"}) + public RODRecordList getOverlapping(final GenomeLoc loc) { + if ( lastQuery != null && loc.getStart() < lastQuery.getStart() ) + throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery)); + + readOverlappingFutureFeatures(loc); + return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc); + } + + + /** + * For contract assurance. Checks that all bindings in loc overlap + * + * @param loc + * @param bindings + * @return + */ + @Requires({"loc != null", "bindings != null"}) + private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) { + for ( final GATKFeature feature : bindings ) + if ( ! 
feature.getLocation().overlapsP(loc) ) + return false; + return true; + } + + /** + * Subset the features in all to those that overlap with loc + * + * The current features list contains everything read that cannot be thrown away yet, but not + * everything in there necessarily overlaps with loc. Subset to just those that do overlap + * + * @param loc the location that features must overlap + * @param all the list of all features + * @return a subset of all that overlaps with loc + */ + @Requires({"loc != null", "all != null"}) + @Ensures("result.size() <= all.size()") + private Collection subsetToOverlapping(final GenomeLoc loc, final Collection all) { + final LinkedList overlapping = new LinkedList(); + for ( final GATKFeature feature : all ) + if ( feature.getLocation().overlapsP(loc) ) + overlapping.add(feature); + return overlapping; + } + + /** + * Update function. Remove all elements of currentFeatures that end before loc + * + * Must be called by clients periodically when they know they they will never ask for data before + * loc, so that the running cache of RODs doesn't grow out of control. 
+ * + * @param loc the location to use + */ + @Requires("loc != null") + @Ensures("currentFeatures.size() <= old(currentFeatures.size())") + public void trimCurrentFeaturesToLoc(final GenomeLoc loc) { + final ListIterator it = currentFeatures.listIterator(); + while ( it.hasNext() ) { + final GATKFeature feature = it.next(); + if ( feature.getLocation().isBefore(loc) ) + it.remove(); + } + } + + /** + * Update function: Read all elements from futureFeatures that overlap with loc + * + * Stops at the first element that starts before the end of loc, or the stream empties + * + * @param loc + */ + @Requires("loc != null") + @Ensures("currentFeatures.size() >= old(currentFeatures.size())") + private void readOverlappingFutureFeatures(final GenomeLoc loc) { + while ( futureFeatures.hasNext() ) { + final GenomeLoc nextLoc = futureFeatures.peek().getLocation(); + if ( nextLoc.isBefore(loc) ) { + futureFeatures.next(); // next rod element is before loc, throw it away and keep looking + } else if ( nextLoc.isPast(loc) ) { + break; // next element is past loc, stop looking but don't pop it + } else if ( nextLoc.overlapsP(loc) ) { + // add overlapping elements to our current features, removing from stream + for ( final GATKFeature feature : futureFeatures.next() ) { + currentFeatures.add(feature); + } + } + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java new file mode 100644 index 000000000..4dfc31d86 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java @@ -0,0 +1,182 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to 
deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.util.PeekableIterator; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** + * a ROD view that allows for requests for RODs that overlap intervals on the genome to produce a RefMetaDataTracker + */ +public class IntervalReferenceOrderedView implements ReferenceOrderedView { + /** a list of the RMDDataState (location->iterators) */ + private final List states = new ArrayList<>(1); + + /** + * Used to get genome locs for reads + */ + protected final GenomeLocParser genomeLocParser; + + /** + * The total extent of all reads in this 
span. We create iterators from our RODs + * from the start of this span, to the end. + */ + private final GenomeLoc shardSpan; + + /** + * Create a new IntervalReferenceOrderedView taking data from provider and capable of + * servicing ROD overlap requests within the genomic interval span + * + * @param provider a ShardDataProvider to give us data + * @param span a GenomeLoc span, or null indicating take the entire genome + */ + public IntervalReferenceOrderedView(final ShardDataProvider provider, final GenomeLoc span) { + if ( provider == null ) throw new IllegalArgumentException("provider cannot be null"); + if ( provider.hasReferenceOrderedData() && span == null ) throw new IllegalArgumentException("span cannot be null when provider has reference ordered data"); + + this.genomeLocParser = provider.getGenomeLocParser(); + this.shardSpan = span; + provider.register(this); + + // conditional to optimize the case where we don't have any ROD data + if ( provider.hasReferenceOrderedData() && ! 
shardSpan.isUnmapped() ) { + for (final ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData()) + states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan))); + } + } + + /** + * Testing constructor + */ + protected IntervalReferenceOrderedView(final GenomeLocParser genomeLocParser, + final GenomeLoc shardSpan, + final List names, + final List> featureSources) { + this.genomeLocParser = genomeLocParser; + this.shardSpan = shardSpan; + for ( int i = 0; i < names.size(); i++ ) + states.add(new RMDDataState(names.get(i), featureSources.get(i))); + } + + public Collection> getConflictingViews() { + List> classes = new ArrayList<>(); + classes.add(ManagingReferenceOrderedView.class); + return classes; + } + + /** + * Get a RefMetaDataTracker containing bindings for all RODs overlapping the start position of loc + * @param loc a GenomeLoc of size == 1 + * @return a non-null RefMetaDataTracker + */ + @Override + public RefMetaDataTracker getReferenceOrderedDataAtLocus(GenomeLoc loc) { + if ( loc == null ) throw new IllegalArgumentException("loc cannot be null"); + if ( loc.size() != 1 ) throw new IllegalArgumentException("GenomeLoc must have size == 1 but got " + loc); + return getReferenceOrderedDataForInterval(loc); + } + + /** + * Get a RefMetaDataTracker containing bindings for all RODs overlapping interval + * + * @param interval a non=null interval + * @return a non-null RefMetaDataTracker + */ + public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { + if ( interval == null ) throw new IllegalArgumentException("Interval cannot be null"); + + if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers) + return RefMetaDataTracker.EMPTY_TRACKER; + else { + final List bindings = new ArrayList<>(states.size()); + for ( final RMDDataState state : states ) + bindings.add(state.stream.getOverlapping(interval)); + return new RefMetaDataTracker(bindings); + } + } + + 
/** + * Trim down all of the ROD managers so that they only hold ROD bindings wit start >= startOfDataToKeep.getStart() + * + * @param startOfDataToKeep a non-null genome loc + */ + public void trimCurrentFeaturesToLoc(final GenomeLoc startOfDataToKeep) { + if ( startOfDataToKeep == null ) throw new IllegalArgumentException("startOfDataToKeep cannot be null"); + + for ( final RMDDataState state : states ) + state.stream.trimCurrentFeaturesToLoc(startOfDataToKeep); + } + + /** + * Closes the current view. + */ + public void close() { + for (final RMDDataState state : states) + state.close(); + + // Clear out the existing data so that post-close() accesses to this data will fail-fast. + states.clear(); + } + + /** + * Models the traversal state of a given ROD lane. + */ + private static class RMDDataState { + public final ReferenceOrderedDataSource dataSource; + public final IntervalOverlappingRODsFromStream stream; + private final LocationAwareSeekableRODIterator iterator; + + public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) { + this.dataSource = dataSource; + this.iterator = iterator; + this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator<>(iterator)); + } + + /** + * For testing + */ + public RMDDataState(final String name, final PeekableIterator iterator) { + this.dataSource = null; + this.iterator = null; + this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator<>(iterator)); + } + + public void close() { + if ( dataSource != null ) + dataSource.close( iterator ); + } + } +} + diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/InvalidPositionException.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/InvalidPositionException.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/InvalidPositionException.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/InvalidPositionException.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java new file mode 100644 index 000000000..d4278c9b2 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java @@ -0,0 +1,236 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.reference.ReferenceSequence; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.walkers.Reference; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.engine.walkers.Window; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Provides access to the portion of the reference covering a single locus. + */ +public class LocusReferenceView extends ReferenceView { + /** + * Bound the reference view to make sure all accesses are within the shard. 
+ */ + private GenomeLoc bounds; + + /** + * Start of the expanded window for which the reference context should be provided, + * relative to the locus in question. + */ + private final int windowStart; + + + /** + * Start of the expanded window for which the reference context should be provided, + * relative to the locus in question. + */ + private final int windowStop; + + /** + * Track the reference sequence and the last point accessed. Used to + * track state when traversing over the reference. + */ + private ReferenceSequence referenceSequence; + + /** + * Create a LocusReferenceView given no other contextual information about + * the walkers, etc. + * @param provider source for locus data. + */ + public LocusReferenceView( LocusShardDataProvider provider ) { + super(provider); + initializeBounds(provider); + windowStart = windowStop = 0; + initializeReferenceSequence(bounds); + } + + /** + * Create a new locus reference view. + * @param provider source for locus data. + */ + public LocusReferenceView( Walker walker, LocusShardDataProvider provider ) { + super( provider ); + initializeBounds(provider); + + // Retrieve information about the window being accessed. 
+ if( walker.getClass().isAnnotationPresent(Reference.class) ) { + Window window = walker.getClass().getAnnotation(Reference.class).window(); + + if( window.start() > 0 ) throw new ReviewedGATKException( "Reference window starts after current locus" ); + if( window.stop() < 0 ) throw new ReviewedGATKException( "Reference window ends before current locus" ); + + windowStart = window.start(); + windowStop = window.stop(); + } + else { + windowStart = 0; + windowStop = 0; + } + + if(bounds != null) { + int expandedStart = getWindowStart( bounds ); + int expandedStop = getWindowStop( bounds ); + initializeReferenceSequence(genomeLocParser.createGenomeLoc(bounds.getContig(), bounds.getContigIndex(), expandedStart, expandedStop)); + } + } + + /** + * Initialize the bounds of this shard, trimming the bounds so that they match the reference. + * @param provider Provider covering the appropriate locus. + */ + private void initializeBounds(LocusShardDataProvider provider) { + if(provider.getLocus() != null) { + int sequenceLength = reference.getSequenceDictionary().getSequence(provider.getLocus().getContig()).getSequenceLength(); + bounds = genomeLocParser.createGenomeLoc(provider.getLocus().getContig(), + Math.max(provider.getLocus().getStart(),1), + Math.min(provider.getLocus().getStop(),sequenceLength)); + } + else + bounds = null; + } + + /** + * Initialize reference sequence data using the given locus. 
+ * @param locus + */ + private void initializeReferenceSequence( GenomeLoc locus ) { + this.referenceSequence = reference.getSubsequenceAt( locus.getContig(), locus.getStart(), locus.getStop() ); + } + + protected GenomeLoc trimToBounds(GenomeLoc l) { + int expandedStart = getWindowStart( bounds ); + int expandedStop = getWindowStop( bounds ); + if ( l.getStart() < expandedStart ) l = genomeLocParser.setStart(l, expandedStart); + if ( l.getStop() > expandedStop ) l = genomeLocParser.setStop(l, expandedStop); + return l; + } + + public class Provider implements ReferenceContext.ReferenceContextRefProvider { + int refStart, len; + + public Provider( int refStart, int len ) { + this.refStart = refStart; + this.len = len; + } + + public byte[] getBases() { + //System.out.printf("Getting bases for location%n"); + byte[] bases = new byte[len]; + System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len); + return bases; + } + } + + /** + * Gets the reference context associated with this particular point or extended interval on the genome. + * @param genomeLoc Region for which to retrieve the base(s). If region spans beyond contig end or beyond current bounds, it will be trimmed down. + * @return The base at the position represented by this genomeLoc. 
+ */ + public ReferenceContext getReferenceContext( GenomeLoc genomeLoc ) { + //validateLocation( genomeLoc ); + + GenomeLoc window = genomeLocParser.createGenomeLoc( genomeLoc.getContig(), genomeLoc.getContigIndex(), + getWindowStart(genomeLoc), getWindowStop(genomeLoc) ); + + int refStart = -1; + if (bounds != null) { + window = trimToBounds(window); + refStart = (int)(window.getStart() - getWindowStart(bounds)); + } + else { + if(referenceSequence == null || referenceSequence.getContigIndex() != genomeLoc.getContigIndex()) + referenceSequence = reference.getSequence(genomeLoc.getContig()); + refStart = (int)window.getStart()-1; + } + + int len = (int)window.size(); + return new ReferenceContext( genomeLocParser, genomeLoc, window, new Provider(refStart, len)); + } + + /** + * Allow the user to pull reference info from any arbitrary region of the reference. + * @param genomeLoc The locus. + * @return A list of the bases starting at the start of the locus (inclusive) and ending + * at the end of the locus (inclusive). + */ + public byte[] getReferenceBases( GenomeLoc genomeLoc ) { + return super.getReferenceBases(genomeLoc); + } + + /** + * Gets the start of the expanded window, bounded if necessary by the contig. + * @param locus The locus to expand. + * @return The expanded window. + */ + private int getWindowStart( GenomeLoc locus ) { + // If the locus is not within the bounds of the contig it allegedly maps to, expand only as much as we can. + if(locus.getStart() < 1) return 1; +// if(locus.getStart() < 1) return locus.getStart(); + return Math.max( locus.getStart() + windowStart, 1 ); + } + + /** + * Gets the stop of the expanded window, bounded if necessary by the contig. + * @param locus The locus to expand. + * @return The expanded window. + */ + private int getWindowStop( GenomeLoc locus ) { + // If the locus is not within the bounds of the contig it allegedly maps to, expand only as much as we can. 
+ int sequenceLength = reference.getSequenceDictionary().getSequence(locus.getContig()).getSequenceLength(); + if(locus.getStop() > sequenceLength) return sequenceLength; + return Math.min( locus.getStop() + windowStop, sequenceLength ); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusShardDataProvider.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java new file mode 100644 index 000000000..11437cf2c --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java @@ -0,0 +1,219 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.engine.ReadProperties; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.locusiterator.LocusIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; + +import java.util.Arrays; +import java.util.Collection; +import java.util.NoSuchElementException; + +/** + * User: hanna + * Date: May 13, 2009 + * Time: 3:30:16 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * The two goals of the LocusView are as follows: + * 1) To provide a 'trigger track' iteration interface so that TraverseLoci can easily switch + * between iterating over all bases in a region, only covered bases in a region covered by + * reads, only bases in a region covered by RODs, or any other sort of trigger track + * implementation one can think of. + * 2) To manage the copious number of iterators that have to be jointly pulled through the + * genome to make a locus traversal function. 
+ */ +public abstract class LocusView extends LocusIterator implements View { + /** + * The locus bounding this view. + */ + protected GenomeLoc locus; + + /** + * The GenomeLocParser, used to create new genome locs. + */ + protected GenomeLocParser genomeLocParser; + + /** + * Source info for this view. Informs the class about downsampling requirements. + */ + private ReadProperties sourceInfo; + + /** + * The actual locus context iterator. + */ + private LocusIterator loci; + + /** + * The next locus context from the iterator. Lazy loaded: if nextLocus is null and advance() doesn't + * populate it, the iterator is exhausted. If populated, this is the value that should be returned by + * next(). + */ + private AlignmentContext nextLocus = null; + + public LocusView(LocusShardDataProvider provider) { + this.locus = provider.getLocus(); + + this.sourceInfo = provider.getSourceInfo(); + this.genomeLocParser = provider.getGenomeLocParser(); + this.loci = provider.getLocusIterator(); + + advance(); + + provider.register(this); + } + + /** + * Only one view of the locus is supported at any given time. + * @return A list consisting of all other locus views. + */ + public Collection> getConflictingViews() { + return Arrays.>asList(LocusView.class,ReadView.class); + } + + /** + * Close this view. + */ + public void close() { + // Set everything to null with the hope of failing fast. + locus = null; + sourceInfo = null; + loci = null; + + super.close(); + } + + /** + * Is there another covered locus context bounded by this view. + * @return True if another covered locus context exists. False otherwise. + */ + public abstract boolean hasNext(); + + /** + * Returns the next covered locus context in the shard. + * @return Next covered locus context in the shard. + * @throw NoSuchElementException if no such element exists. + */ + public abstract AlignmentContext next(); + + /** + * Unsupported. + * @throw UnsupportedOperationException always. 
+ */ + public void remove() { + throw new UnsupportedOperationException("Unable to remove elements from this queue."); + } + + /** + * Is there another locus context bounded by this shard. + * @return True if another locus context is bounded by this shard. + */ + protected boolean hasNextLocus() { + advance(); + return nextLocus != null; + } + + /** + * Get the next locus context bounded by this shard. + * @return Next locus context bounded by this shard. + * @throw NoSuchElementException if the next element is missing. + */ + protected AlignmentContext nextLocus() { + advance(); + if(nextLocus == null) + throw new NoSuchElementException("No more elements remain in locus context queue."); + + // Cache the current and apply filtering. + AlignmentContext current = nextLocus; + + // Indicate that the next operation will need to advance. + nextLocus = null; + + return current; + } + + /** + * Seed the nextLocus variable with the contents of the next locus (if one exists). + */ + private void advance() { + // Already an unclaimed locus present + if(nextLocus != null) + return; + + //System.out.printf("loci is %s%n", loci); + if( !loci.hasNext() ) { + nextLocus = null; + return; + } + + nextLocus = loci.next(); + + // If the location of this shard is available, trim the data stream to match the shard. + // TODO: Much of this functionality is being replaced by the WindowMaker. + if(locus != null) { + // Iterate through any elements not contained within this shard. + while( nextLocus != null && !isContainedInShard(nextLocus.getLocation()) && loci.hasNext() ) + nextLocus = loci.next(); + + // If nothing in the shard was found, indicate that by setting nextLocus to null. + if( nextLocus != null && !isContainedInShard(nextLocus.getLocation()) ) + nextLocus = null; + } + } + + /** + * Is this location contained in the given shard. + * @param location Location to check. + * @return True if the given location is contained within the shard. False otherwise. 
+ */ + private boolean isContainedInShard(GenomeLoc location) { + return locus.containsP(location); + } + + /** + * {@inheritDoc} + * + * Since this class has an actual LIBS, so this function will never throw an exception + * + * @return the LocusIteratorByState used by this view to get pileups + */ + @Override + public LocusIteratorByState getLIBS() { + return loci.getLIBS(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java new file mode 100644 index 000000000..17e8c4290 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java @@ -0,0 +1,116 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +/** + * User: hanna + * Date: May 21, 2009 + * Time: 2:49:17 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A view into the reference-ordered data in the provider. + */ +public class ManagingReferenceOrderedView implements ReferenceOrderedView { + /** + * The data sources along with their current states. + */ + private List states = new ArrayList(); + + /** + * Create a new view of reference-ordered data. + * @param provider + */ + public ManagingReferenceOrderedView( LocusShardDataProvider provider ) { + for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) + states.add(new ReferenceOrderedDataState(dataSource, dataSource.seek(provider.getLocus()))); + + provider.register(this); + } + + public Collection> getConflictingViews() { return Collections.emptyList(); } + + /** + * Gets an object which can track the reference-ordered data at every locus. + * @param loc Locus at which to track. + * @return A tracker containing information about this locus. 
+ */ + @Override + public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { + if ( states.isEmpty() ) + return RefMetaDataTracker.EMPTY_TRACKER; + else { + List bindings = new ArrayList(states.size()); + + for ( ReferenceOrderedDataState state: states ) + // todo -- warning, I removed the reference to the name from states + bindings.add( state.iterator.seekForward(loc) ); + + return new RefMetaDataTracker(bindings); + } + } + + /** + * Closes the current view. + */ + public void close() { + for( ReferenceOrderedDataState state: states ) + state.dataSource.close( state.iterator ); + + // Clear out the existing data so that post-close() accesses to this data will fail-fast. + states = null; + } +} + +/** + * Models the traversal state of a given ROD lane. + */ +class ReferenceOrderedDataState { + public final ReferenceOrderedDataSource dataSource; + public final LocationAwareSeekableRODIterator iterator; + + public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator ) { + this.dataSource = dataSource; + this.iterator = iterator; + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java new file mode 100644 index 000000000..197abd49a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java @@ -0,0 +1,83 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit 
persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.collections.Pair; + +import java.util.*; + + +/** + * + * @author aaron + * + * Class RODMetaDataContainer + * + * stores both the name and the class for each ROD. This class assumes that: + * + * -Names must be unique + * -Classes are allowed to have duplicates + * + * This class encapsulates the ref data associations, and provides lookup by name and by + * class type. 
+ * + */ +public class RODMetaDataContainer { + // we only allow non-duplicate ROD names, a HashMap is fine + private final HashMap nameMap = new HashMap(); + + // we do allow duplicate class entries, so we need to store pairs of data + private final List> classMap = new ArrayList>(); + + public void addEntry(GATKFeature data) { + nameMap.put(data.getName(),data); + classMap.add(new Pair(data.getClass(),data)); + } + + public Collection getSet(String name) { + if (name == null) return getSet(); + Set set = new HashSet(); + if (nameMap.containsKey(name)) set.add(nameMap.get(name)); + return set; + } + + /** + * get the feature contents of this container; the unfiltered set without their name association + * @return + */ + public Collection getSet() { + return new ArrayList(nameMap.values()); + } + + // the brute force (n) search ended up being faster than sorting and binary search in all but the most extreme cases (thousands of RODs at a location). + public Collection getSet(Class cls) { + Collection ret = new ArrayList(); + for (Pair pair: classMap) + if (pair.first.equals(cls)) ret.add(pair.second); + return ret; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java new file mode 100644 index 000000000..dea8acf5f --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java @@ -0,0 +1,60 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, 
and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.engine.datasources.reads.ReadShard; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.GenomeLoc; + +/** a ROD view for reads. This provides the Read traversals a way of getting a RefMetaDataTracker */ +public class ReadBasedReferenceOrderedView extends IntervalReferenceOrderedView { + public ReadBasedReferenceOrderedView(final ShardDataProvider provider) { + super(provider, provider.hasReferenceOrderedData() ? 
((ReadShard)provider.getShard()).getReadsSpan() : null); + } + + /** + * create a RefMetaDataTracker given the current read + * + * @param rec the read + * + * @return a RefMetaDataTracker for the read, from which you can get ROD -> read alignments + */ + @Requires("rec != null") + @Ensures("result != null") + public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) { + if ( rec.getReadUnmappedFlag() ) + return RefMetaDataTracker.EMPTY_TRACKER; + else { + final GenomeLoc readSpan = genomeLocParser.createGenomeLoc(rec); + trimCurrentFeaturesToLoc(readSpan); + return getReferenceOrderedDataForInterval(readSpan); + } + } +} + diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java new file mode 100644 index 000000000..c7b2575be --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java @@ -0,0 +1,102 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.GenomeLoc; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * User: hanna + * Date: May 22, 2009 + * Time: 12:36:14 PM + * + */ + +/** Provides access to the reference over a single read. */ + +public class ReadReferenceView extends ReferenceView { + /** + * Create a view of the reference with respect to a single read. 
+ * + * @param provider + */ + public ReadReferenceView( ShardDataProvider provider ) { + super(provider); + } + + protected ReferenceContext.ReferenceContextRefProvider getReferenceBasesProvider( GenomeLoc genomeLoc ) { + return new Provider(genomeLoc); + } + + public class Provider implements ReferenceContext.ReferenceContextRefProvider { + GenomeLoc loc; + + public Provider( GenomeLoc loc ) { + this.loc = loc; + } + + public byte[] getBases() { + return getReferenceBases(loc); + } + } + + /** + * Return a reference context appropriate for the span of read + * + * @param read the mapped read to test + * @return + */ + public ReferenceContext getReferenceContext( final SAMRecord read ) { + GenomeLoc loc = genomeLocParser.createGenomeLoc(read); + return new ReferenceContext( genomeLocParser, loc, loc, getReferenceBasesProvider(loc) ); + } + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java new file mode 100644 index 000000000..f9629f5c8 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java @@ -0,0 +1,82 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.GenomeLocParser; + +import java.util.Collection; + +/** + * Present data sharded by read to a traversal engine. + * + * @author mhanna + * @version 0.1 + */ +public class ReadShardDataProvider extends ShardDataProvider { + /** + * The raw collection of reads. + */ + private final GATKSAMIterator reads; + + /** + * Create a data provider for the shard given the reads and reference. + * @param shard The chunk of data over which traversals happen. + * @param reference A getter for a section of the reference. + */ + public ReadShardDataProvider(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator reads, IndexedFastaSequenceFile reference, Collection rods) { + super(shard,genomeLocParser,reference,rods); + this.reads = reads; + } + + /** + * Can this data source provide reads? + * @return True if reads are available, false otherwise. + */ + public boolean hasReads() { + return reads != null; + } + + /** + * Gets an iterator over all the reads bound by this shard. + * @return An iterator over all reads in this shard. 
+ */ + public GATKSAMIterator getReadIterator() { + return reads; + } + + @Override + public void close() { + super.close(); + + if(reads != null) + reads.close(); + } + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java new file mode 100644 index 000000000..ec879fdfd --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java @@ -0,0 +1,88 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +import java.util.Arrays; +import java.util.Collection; +/** + * User: hanna + * Date: May 22, 2009 + * Time: 12:06:54 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A view into the reads that a provider can provide. + */ +public class ReadView implements View, Iterable { + /** + * The iterator into the reads supplied by this provider. + */ + private GATKSAMIterator reads; + + /** + * Create a new view of the reads given the current data set. + * @param provider Source for the data. + */ + public ReadView( ReadShardDataProvider provider ) { + reads = provider.getReadIterator(); + } + + /** + * Other reads and loci conflict with this view. + * @return Array of reads and loci. + */ + public Collection> getConflictingViews() { + return Arrays.>asList(ReadView.class, LocusView.class); + } + + /** + * Close the view over these reads. Note that this method closes just + * the view into the reads, not the reads themselves. + */ + public void close() { + // Don't close the reads. The provider is responsible for this. + // Just dispose of the pointer. + reads = null; + } + + /** + * Gets an iterator into the reads supplied by this provider. + * @return Iterator into the reads that this provider covers. 
+ */ + public GATKSAMIterator iterator() { + return reads; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java new file mode 100644 index 000000000..3be983d4a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java @@ -0,0 +1,33 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.GenomeLoc; + +public interface ReferenceOrderedView extends View { + RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ); +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceView.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java new file mode 100644 index 000000000..297ccbedd --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java @@ -0,0 +1,196 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.collections.RODMergingIterator; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; + +import java.util.*; + +/** + * A view into the reference-ordered data in the provider. + */ +public class RodLocusView extends LocusView implements ReferenceOrderedView { + /** + * The data sources along with their current states. + */ + private RODMergingIterator rodQueue = null; + + Collection allTracksHere; + + GenomeLoc lastLoc = null; + RODRecordList interval = null; + + /** + * The data sources along with their current states. + */ + private List states = new ArrayList(); + + /** + * Enable debugging output -- todo remove me + */ + final static boolean DEBUG = false; + + final static String INTERVAL_ROD_NAME = "interval"; + + /** + * Create a new view of reference-ordered data. 
+ * + * @param provider + */ + public RodLocusView( LocusShardDataProvider provider ) { + super(provider); + + GenomeLoc loc = provider.getLocus(); + + List< Iterator > iterators = new LinkedList< Iterator >(); + for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) { + if ( DEBUG ) System.out.printf("Shard is %s%n", provider.getLocus()); + + // grab the ROD iterator from the data source, and compute the first location in this shard, forwarding + // the iterator to immediately before it, so that it can be added to the merging iterator primed for + // next() to return the first real ROD in this shard + LocationAwareSeekableRODIterator it = dataSource.seek(provider.getLocus()); + it.seekForward(genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart()-1)); + + states.add(new ReferenceOrderedDataState(dataSource,it)); + + // we need to special case the interval so we don't always think there's a rod at the first location + if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) { + if ( interval != null ) + throw new RuntimeException("BUG: interval local variable already assigned " + interval); + interval = it.next(); + } else { + iterators.add( it ); + } + } + + rodQueue = new RODMergingIterator(iterators); + } + + @Override + public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { + // special case the interval again -- add it into the ROD + if ( interval != null ) { allTracksHere.add(interval); } + return new RefMetaDataTracker(allTracksHere); + } + + public boolean hasNext() { + if ( ! rodQueue.hasNext() ) + return false; + else { + return ! rodQueue.peekLocation().isPast(locus); + } + } + + /** + * Returns the next covered locus context in the shard. + * @return Next covered locus context in the shard. + * @throw NoSuchElementException if no such element exists. 
+ */ + public AlignmentContext next() { + if ( DEBUG ) System.out.printf("In RodLocusView.next()...%n"); + RODRecordList datum = rodQueue.next(); + if ( DEBUG ) System.out.printf("In RodLocusView.next(); datum = %s...%n", datum.getLocation()); + + if ( DEBUG ) System.out.printf("In RodLocusView.next(): creating tracker...%n"); + + allTracksHere = getSpanningTracks(datum); + GenomeLoc rodSite = datum.getLocation(); + GenomeLoc site = genomeLocParser.createGenomeLoc( rodSite.getContig(), rodSite.getStart(), rodSite.getStart()); + + if ( DEBUG ) System.out.printf("rodLocusView.next() is at %s%n", site); + + // calculate the number of skipped bases, and update lastLoc so we can do that again in the next() + long skippedBases = getSkippedBases( rodSite ); + lastLoc = site; + return new AlignmentContext(site, new ReadBackedPileupImpl(site), skippedBases); + } + + private Collection getSpanningTracks(RODRecordList marker) { + return rodQueue.allElementsLTE(marker); + } + + /** + * Returns the number of reference bases that have been skipped: + * + * 1 -- since the last processed location if we have one + * 2 -- from the beginning of the shard if this is the first loc + * 3 -- from the last location to the current position + * + * @param currentPos + * @return + */ + private long getSkippedBases( GenomeLoc currentPos ) { + // the minus - is because if lastLoc == null, you haven't yet seen anything in this interval, so it should also be counted as skipped + Integer compStop = lastLoc == null ? locus.getStart() - 1 : lastLoc.getStop(); + long skippedBases = currentPos.getStart() - compStop - 1; + + if ( skippedBases < -1 ) { // minus 1 value is ok + throw new RuntimeException(String.format("BUG: skipped bases=%d is < 0: cur=%s vs. 
last=%s, shard=%s", + skippedBases, currentPos, lastLoc, locus)); + } + return Math.max(skippedBases, 0); + } + + /** + * Get the location one after the last position we will traverse through + * @return + */ + public GenomeLoc getLocOneBeyondShard() { + return genomeLocParser.createGenomeLoc(locus.getContig(),locus.getStop()+1); + } + + /** + * How many bases are we skipping from the current location to the end of the interval / shard + * if we have no more elements + * + * @return + */ + public long getLastSkippedBases() { + if ( hasNext() ) + throw new RuntimeException("BUG: getLastSkippedBases called when there are elements remaining."); + + return getSkippedBases(getLocOneBeyondShard()); + } + + /** + * Closes the current view. + */ + public void close() { + for( ReferenceOrderedDataState state: states ) + state.dataSource.close( state.iterator ); + + rodQueue = null; + allTracksHere = null; + } +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProvider.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/View.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/View.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/View.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/View.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java new file mode 100644 index 000000000..178d440bf --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java @@ -0,0 +1,170 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all 
copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.util.BlockCompressedFilePointerUtil; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.LinkedList; +import java.util.List; + +/** +* Created by IntelliJ IDEA. +* User: mhanna +* Date: 10/14/11 +* Time: 10:47 PM +* To change this template use File | Settings | File Templates. +*/ +class BAMAccessPlan { + private final SAMReaderID reader; + private final BlockInputStream inputStream; + + private final List positions; + private PeekableIterator positionIterator; + + /** + * Stores the next block address to read, or -1 if no such block is available. + */ + private long nextBlockAddress; + + + BAMAccessPlan(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { + this.reader = reader; + this.inputStream = inputStream; + + this.positions = fileSpan.getGATKChunks(); + initialize(); + } + + public SAMReaderID getReader() { + return reader; + } + + public BlockInputStream getInputStream() { + return inputStream; + } + + /** + * Retrieves the next block address to be read. + * @return Next block address to be read. + */ + public long getBlockAddress() { + return nextBlockAddress; + } + + /** + * Retrieves the first offset of interest in the block returned by getBlockAddress(). 
+ * @return First block of interest in this segment. + */ + public int getFirstOffsetInBlock() { + return (nextBlockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0; + } + + /** + * Gets the spans overlapping the given block; used to copy the contents of the block into the circular buffer. + * @param blockAddress Block address for which to search. + * @param filePosition Block address at which to terminate the last chunk if the last chunk goes beyond this span. + * @return list of chunks containing that block. + */ + public List getSpansOverlappingBlock(long blockAddress, long filePosition) { + List spansOverlapping = new LinkedList(); + // While the position iterator overlaps the given block, pull out spans to report. + while(positionIterator.hasNext() && positionIterator.peek().getBlockStart() <= blockAddress) { + // Create a span over as much of the block as is covered by this chunk. + int blockOffsetStart = (blockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0; + + // Calculate the end of this span. If the span extends past this block, cap it using the current file position. + long blockEnd; + int blockOffsetEnd; + if(blockAddress < positionIterator.peek().getBlockEnd()) { + blockEnd = filePosition; + blockOffsetEnd = 0; + } + else { + blockEnd = positionIterator.peek().getBlockEnd(); + blockOffsetEnd = positionIterator.peek().getBlockOffsetEnd(); + } + + GATKChunk newChunk = new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd); + + if(newChunk.getChunkStart() <= newChunk.getChunkEnd()) + spansOverlapping.add(new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd)); + + // If the value currently stored in the position iterator ends past the current block, we must be done. Abort. 
+ if(!positionIterator.hasNext() || positionIterator.peek().getBlockEnd() > blockAddress) + break; + + // If the position iterator ends before the block ends, pull the position iterator forward. + if(positionIterator.peek().getBlockEnd() <= blockAddress) + positionIterator.next(); + } + + return spansOverlapping; + } + + public void reset() { + initialize(); + } + + /** + * Resets the SAM reader position to its original state. + */ + private void initialize() { + this.positionIterator = new PeekableIterator(positions.iterator()); + if(positionIterator.hasNext()) + nextBlockAddress = positionIterator.peek().getBlockStart(); + else + nextBlockAddress = -1; + } + + /** + * Advances the current position to the next block to read, given the current position in the file. + * @param filePosition The current position within the file. + */ + void advancePosition(final long filePosition) { + nextBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(filePosition); + + // Check the current file position against the iterator; if the iterator is before the current file position, + // draw the iterator forward. Remember when performing the check that coordinates are half-open! + while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) + positionIterator.next(); + + // If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block. + if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) + nextBlockAddress = positionIterator.peek().getBlockStart(); + + // If we've shot off the end of the block pointer, notify consumers that iteration is complete. 
+ if(!positionIterator.hasNext()) + nextBlockAddress = -1; + } + + private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) { + return filePosition >= chunk.getChunkEnd(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java new file mode 100644 index 000000000..aca33e411 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java @@ -0,0 +1,531 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.Bin; +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.util.CloseableIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.GATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; +import java.util.*; + +/** + * Writes schedules for a single BAM file to a target output file. + */ +public class BAMSchedule implements CloseableIterator { + /** + * File in which to store schedule data. + */ + private File scheduleFile; + + /** + * File channel for the schedule file. + */ + private FileChannel scheduleFileChannel; + + /** + * The definitive, sorted list of reader IDs. Order is important here: the order + * in which the reader IDs are presented here maps to the order in which they appear in the file. + */ + private final List readerIDs = new ArrayList(); + + /** + * Iterators over the schedule. Stored in the same order as readerIDs, above. + */ + private final List> scheduleIterators = new ArrayList>(); + + /** + * Next schedule entry to be returned. Null if no additional entries are present. + */ + private BAMScheduleEntry nextScheduleEntry; + + /** + * Reference sequence for which to write the schedule. + */ + private final int referenceSequence; + + /** + * Sizes of ints and longs in bytes. + */ + private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; + private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; + + /** + * Create a new BAM schedule based on the given index. 
+ * @param dataSource The SAM data source to use. + * @param intervals List of + */ + public BAMSchedule(final SAMDataSource dataSource, final List intervals) { + if(intervals.isEmpty()) + throw new ReviewedGATKException("Tried to write schedule for empty interval list."); + + referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex(); + + createScheduleFile(); + + readerIDs.addAll(dataSource.getReaderIDs()); + + for(final SAMReaderID reader: readerIDs) { + final GATKBAMIndex index = dataSource.getIndex(reader); + final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence); + + int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1); + Iterator locusIterator = intervals.iterator(); + GenomeLoc currentLocus = locusIterator.next(); + + final long readerStartOffset = position(); + + int maxChunkCount = 0; + + while(currentBinInLowestLevel < GATKBAMIndex.MAX_BINS && currentLocus != null) { + final Bin bin = new Bin(referenceSequence,currentBinInLowestLevel); + final int binStart = index.getFirstLocusInBin(bin); + final int binStop = index.getLastLocusInBin(bin); + + // In required, pull bin iterator ahead to the point of the next GenomeLoc. + if(binStop < currentLocus.getStart()) { + currentBinInLowestLevel++; + continue; + } + + // At this point, the bin stop is guaranteed to be >= the start of the locus. + // If the bins have gone past the current locus, update the current locus if at all possible. + if(binStart > currentLocus.getStop()) { + currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; + continue; + } + + // Code at this point knows that the current bin is neither before nor after the current locus, + // so it must overlap. Add this region to the filesystem. 
+ final GATKBAMFileSpan fileSpan = indexData.getSpanOverlapping(bin); + + if(!fileSpan.isEmpty()) { + // File format is binary in little endian; start of region, end of region, num chunks, then the chunks themselves. + ByteBuffer buffer = allocateByteBuffer(2*INT_SIZE_IN_BYTES + INT_SIZE_IN_BYTES + fileSpan.getGATKChunks().size()*LONG_SIZE_IN_BYTES*2); + buffer.putInt(binStart); + buffer.putInt(binStop); + buffer.putInt(fileSpan.getGATKChunks().size()); + for(GATKChunk chunk: fileSpan.getGATKChunks()) { + buffer.putLong(chunk.getChunkStart()); + buffer.putLong(chunk.getChunkEnd()); + } + maxChunkCount = Math.max(maxChunkCount,fileSpan.getGATKChunks().size()); + + // Prepare buffer for writing + buffer.flip(); + + // And write. + write(buffer); + } + + currentBinInLowestLevel++; + } + + final long readerStopOffset = position(); + + scheduleIterators.add(new PeekableIterator(new BAMScheduleIterator(reader,readerStartOffset,readerStopOffset,maxChunkCount))); + + // Iterator initialization might move the file pointer. Make sure it gets reset back to where it was before iterator initialization. + position(readerStopOffset); + } + + advance(); + } + + /** + * Determine whether more ScheduleEntries are present in the iterator. + * @return Next schedule entry to parse. + */ + @Override + public boolean hasNext() { + return nextScheduleEntry != null; + } + + /** + * Retrieve the next schedule entry in the list. + * @return next schedule entry in the queue. + */ + @Override + public BAMScheduleEntry next() { + BAMScheduleEntry currentScheduleEntry = nextScheduleEntry; + advance(); + return currentScheduleEntry; + } + + /** + * Close down and delete the file. 
+ */ + @Override + public void close() { + try { + scheduleFileChannel.close(); + } + catch(IOException ex) { + throw makeIOFailureException(true, "Unable to close schedule file.", ex); + } + } + + /** + * Convenience routine for creating UserExceptions + * @param wasWriting + * @param message + * @param e + * @return + */ + private final GATKException makeIOFailureException(final boolean wasWriting, final String message, final Exception e) { + if ( wasWriting ) { + if ( e == null ) + return new UserException.CouldNotCreateOutputFile(scheduleFile, message); + else + return new UserException.CouldNotCreateOutputFile(scheduleFile, message, e); + } else { + if ( e == null ) + return new UserException.CouldNotReadInputFile(scheduleFile, message); + else + return new UserException.CouldNotReadInputFile(scheduleFile, message, e); + } + } + + /** + * Advance to the next schedule entry. + */ + private void advance() { + nextScheduleEntry = null; + + BitSet selectedIterators = new BitSet(readerIDs.size()); + int currentStart = Integer.MAX_VALUE; + int currentStop = Integer.MAX_VALUE; + + // Select every iterator whose next element is the lowest element in the list. + for(int reader = 0; reader < scheduleIterators.size(); reader++) { + PeekableIterator scheduleIterator = scheduleIterators.get(reader); + if(!scheduleIterator.hasNext()) + continue; + + // If the iterator starts after this one, skip over it. + if(scheduleIterator.peek().start > currentStart) + continue; + + // If the iterator starts at the same point as this one, add it to the list. + if(scheduleIterator.peek().start == currentStart) { + selectedIterators.set(reader); + currentStop = Math.min(scheduleIterator.peek().stop,currentStop); + continue; + } + + // If the iterator is less than anything seen before it, purge the selections and make this one current. 
+ if(scheduleIterator.peek().start < currentStart) { + selectedIterators.clear(); + selectedIterators.set(reader); + currentStart = scheduleIterator.peek().start; + currentStop = scheduleIterator.peek().stop; + } + } + + // Out of iterators? Abort early. + if(selectedIterators.isEmpty()) + return; + + // Create the target schedule entry + BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop); + + // For each schedule entry with data, load the data into the merged schedule. + for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) { + PeekableIterator scheduleIterator = scheduleIterators.get(reader); + BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek(); + mergedScheduleEntry.mergeInto(individualScheduleEntry); + + // If the schedule iterator ends after this entry, consume it. + if(individualScheduleEntry.stop <= currentStop) + scheduleIterator.next(); + } + + // For each schedule entry without data, add a blank entry. + for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) { + mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan()); + } + + nextScheduleEntry = mergedScheduleEntry; + } + + @Override + public void remove() { throw new UnsupportedOperationException("Unable to remove from a schedule iterator."); } + + /** + * Create a new schedule file, containing schedule information for all BAM files being dynamically merged. + */ + private void createScheduleFile() { + try { + scheduleFile = File.createTempFile("bamschedule."+referenceSequence,null); + scheduleFileChannel = new RandomAccessFile(scheduleFile,"rw").getChannel(); + } + catch(IOException ex) { + throw new UserException("Unable to create a temporary BAM schedule file. 
Please make sure Java can write to the default temp directory or use -Djava.io.tmpdir= to instruct it to use a different temp directory instead.",ex); + } + scheduleFile.deleteOnExit(); + + } + + /** + * Creates a new byte buffer of the given size. + * @param size the size of buffer to allocate. + * @return Newly allocated byte buffer. + */ + private ByteBuffer allocateByteBuffer(final int size) { + ByteBuffer buffer = ByteBuffer.allocate(size); + buffer.order(ByteOrder.LITTLE_ENDIAN); + return buffer; + } + + /** + * Reads the contents at the current position on disk into the given buffer. + * @param buffer buffer to fill. + */ + private int read(final ByteBuffer buffer) { + try { + return scheduleFileChannel.read(buffer); + } + catch(IOException ex) { + throw makeIOFailureException(false, "Unable to read data from BAM schedule file.", ex); + } + } + + private void write(final ByteBuffer buffer) { + try { + scheduleFileChannel.write(buffer); + if(buffer.remaining() > 0) + throw makeIOFailureException(true, "Unable to write entire buffer to file.", null); + } + catch(IOException ex) { + throw makeIOFailureException(true, "Unable to write data to BAM schedule file.", ex); + } + } + + /** + * Reads the current position from the file channel. + * @return Current position within file channel. + */ + private long position() { + try { + return scheduleFileChannel.position(); + } + catch(IOException ex) { + throw makeIOFailureException(false, "Unable to retrieve position of BAM schedule file.", ex); + } + } + + /** + * Reposition the file channel to the specified offset wrt the start of the file. + * @param position The position. + */ + private void position(final long position) { + try { + scheduleFileChannel.position(position); + } + catch(IOException ex) { + throw makeIOFailureException(false, "Unable to position BAM schedule file.",ex); + } + } + + /** + * An iterator over the schedule for a single BAM file. 
+ */ + private class BAMScheduleIterator implements Iterator { + /** + * ID of the reader associated with the given schedule. + */ + private final SAMReaderID reader; + + /** + * Current position in the file. + */ + private long currentPosition; + + /** + * Stopping file position of last bin in file for this reader, exclusive. + */ + private final long stopPosition; + + /** + * Byte buffer used to store BAM header info. + */ + private final ByteBuffer binHeader; + + /** + * Byte buffer used to store chunk data. + */ + private final ByteBuffer chunkData; + + public BAMScheduleIterator(final SAMReaderID reader, final long startPosition, final long stopPosition, final int maxChunkCount) { + this.reader = reader; + this.currentPosition = startPosition; + this.stopPosition = stopPosition; + binHeader = allocateByteBuffer(INT_SIZE_IN_BYTES*3); + chunkData = allocateByteBuffer(maxChunkCount*LONG_SIZE_IN_BYTES*2); + } + + @Override + public boolean hasNext() { + return currentPosition < stopPosition; + } + + @Override + public BAMScheduleEntry next() { + position(currentPosition); + + // Read data. + int binHeaderBytesRead = read(binHeader); + + // Make sure we read in a complete bin header: + if ( binHeaderBytesRead < INT_SIZE_IN_BYTES * 3 ) { + throw new ReviewedGATKException(String.format("Unable to read a complete bin header from BAM schedule file %s for BAM file %s. " + + "The BAM schedule file is likely incomplete/corrupt.", + scheduleFile.getAbsolutePath(), reader.getSamFilePath())); + } + + // Decode contents. + binHeader.flip(); + final int start = binHeader.getInt(); + final int stop = binHeader.getInt(); + final int numChunks = binHeader.getInt(); + + // Prepare bin buffer for next read. + binHeader.flip(); + + // Prepare a target buffer for chunks. + GATKChunk[] chunks = new GATKChunk[numChunks]; + + // Read all chunk data. 
+ chunkData.limit(numChunks*LONG_SIZE_IN_BYTES*2); + long bytesRead = read(chunkData); + if(bytesRead != numChunks*LONG_SIZE_IN_BYTES*2) + throw new ReviewedGATKException("Unable to read all chunks from file"); + + // Prepare for reading. + chunkData.flip(); + + for(int i = 0; i < numChunks; i++) + chunks[i] = new GATKChunk(chunkData.getLong(),chunkData.getLong()); + + // Prepare chunk buffer for next read. + chunkData.flip(); + + BAMScheduleEntry nextScheduleEntry = new BAMScheduleEntry(start,stop); + nextScheduleEntry.addFileSpan(reader,new GATKBAMFileSpan(chunks)); + + // Reset the position of the iterator at the next contig. + currentPosition = position(); + + return nextScheduleEntry; + } + + /** + * Not supported. + */ + @Override + public void remove() { + throw new UnsupportedOperationException("Unable to remove from a BAMScheduleIterator"); + } + + } +} + +/** + * A single proto-shard to be processed. + */ +class BAMScheduleEntry { + /** + * Starting position for the genomic entry. + */ + public final int start; + + /** + * Ending position for the genomic entry. + */ + public final int stop; + + /** + * The spans representing the given region. + */ + public final Map fileSpans = new HashMap(); + + BAMScheduleEntry(final int start, final int stop) { + this.start = start; + this.stop = stop; + } + + /** + * Add a new file span to this schedule. + * @param reader Reader associated with the span. + * @param fileSpan Blocks to read in the given reader. + */ + public void addFileSpan(final SAMReaderID reader, final GATKBAMFileSpan fileSpan) { + fileSpans.put(reader,fileSpan); + } + + /** + * A naive merge operation. Merge the fileSpans in other into this, blowing up if conflicts are + * detected. Completely ignores merging start and stop. + * @param other Other schedule entry to merging into this one. 
+ */ + public void mergeInto(final BAMScheduleEntry other) { + final int thisSize = fileSpans.size(); + final int otherSize = other.fileSpans.size(); + fileSpans.putAll(other.fileSpans); + if(fileSpans.size() != thisSize+otherSize) + throw new ReviewedGATKException("Unable to handle overlaps when merging BAM schedule entries."); + } + + /** + * Returns true if the location of this bin tree is before the given position. + * @param locus Locus to test. + * @return True if this bin sits completely before the given locus; false otherwise. + */ + public boolean isBefore(final GenomeLoc locus) { + return stop < locus.getStart(); + } + + /** + * Checks overlap between this bin tree and other bin trees. + * @param position the position over which to detect overlap. + * @return True if the segment overlaps. False otherwise. + */ + public boolean overlaps(final GenomeLoc position) { + return !(position.getStop() < start || position.getStart() > stop); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java new file mode 100644 index 000000000..f916bc185 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java @@ -0,0 +1,321 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the 
Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.SAMSequenceRecord; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.*; + +/** + * Assign intervals to the most appropriate blocks, keeping as little as possible in memory at once. + */ +public class BAMScheduler implements Iterator { + private final SAMDataSource dataSource; + + private final Map indexFiles = new HashMap(); + + private FilePointer nextFilePointer = null; + + private GenomeLocSortedSet loci; + private PeekableIterator locusIterator; + private GenomeLoc currentLocus; + private IntervalMergingRule intervalMergingRule; + + /* + * Creates BAMScheduler using contigs from the given BAM data source. 
+ * + * @param dataSource BAM source + * @return non-null BAM scheduler + */ + public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) { + final BAMScheduler scheduler = new BAMScheduler(dataSource, IntervalMergingRule.ALL); + final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary()); + scheduler.populateFilteredIntervalList(intervals); + return scheduler; + } + + public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { + BAMScheduler scheduler = new BAMScheduler(dataSource, IntervalMergingRule.ALL); + scheduler.populateUnfilteredIntervalList(parser); + return scheduler; + } + + public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final IntervalMergingRule mergeRule, final GenomeLocSortedSet loci) { + BAMScheduler scheduler = new BAMScheduler(dataSource, mergeRule); + scheduler.populateFilteredIntervalList(loci); + return scheduler; + } + + + private BAMScheduler(final SAMDataSource dataSource, final IntervalMergingRule mergeRule) { + this.dataSource = dataSource; + this.intervalMergingRule = mergeRule; + for(SAMReaderID reader: dataSource.getReaderIDs()) { + GATKBAMIndex index = dataSource.getIndex(reader); + if(index != null) + indexFiles.put(reader,dataSource.getIndex(reader)); + } + } + + /** + * The consumer has asked for a bounded set of locations. Prepare an iterator over those locations. + * @param loci The list of locations to search and iterate over. + */ + private void populateFilteredIntervalList(final GenomeLocSortedSet loci) { + this.loci = loci; + if(!indexFiles.isEmpty()) { + // If index data is available, start up the iterator. + locusIterator = new PeekableIterator(loci.iterator()); + if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + advance(); + } + else { + // Otherwise, seed the iterator with a single file pointer over the entire region. 
+ nextFilePointer = generatePointerOverEntireFileset(); + for(GenomeLoc locus: loci) + nextFilePointer.addLocation(locus); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + } + } + + /** + * The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching + * from just before the start of the region to the end of the region. + */ + private void populateUnfilteredIntervalList(final GenomeLocParser parser) { + this.loci = new GenomeLocSortedSet(parser); + locusIterator = new PeekableIterator(Collections.emptyList().iterator()); + nextFilePointer = generatePointerOverEntireFileset(); + } + + /** + * Generate a span that runs from the end of the BAM header to the end of the fle. + * @return A file pointer over the specified region. + */ + private FilePointer generatePointerOverEntireFileset() { + FilePointer filePointer = new FilePointer(intervalMergingRule); + + // This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is + // the only FilePointer we will create. 
This allows us to have this FilePointer represent regions from + // multiple contigs + filePointer.setIsMonolithic(true); + + Map currentPosition; + + currentPosition = dataSource.getInitialReaderPositions(); + + for(SAMReaderID reader: dataSource.getReaderIDs()) + filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart())); + return filePointer; + } + + public boolean hasNext() { + return nextFilePointer != null; + } + + public FilePointer next() { + if(!hasNext()) + throw new NoSuchElementException("No next element available in interval sharder"); + FilePointer currentFilePointer = nextFilePointer; + nextFilePointer = null; + advance(); + + return currentFilePointer; + } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove FilePointers from an IntervalSharder"); + } + + private void advance() { + if(loci.isEmpty()) + return; + + while(nextFilePointer == null && currentLocus != null) { + // special case handling of the unmapped shard. + if(currentLocus == GenomeLoc.UNMAPPED) { + nextFilePointer = new FilePointer(intervalMergingRule, GenomeLoc.UNMAPPED); + for(SAMReaderID id: dataSource.getReaderIDs()) + nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin())); + currentLocus = null; + continue; + } + + nextFilePointer = new FilePointer(intervalMergingRule); + + int coveredRegionStart = 1; + int coveredRegionStop = Integer.MAX_VALUE; + GenomeLoc coveredRegion = null; + + BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus); + + // No overlapping data at all. 
+ if(scheduleEntry != null) { + coveredRegionStart = Math.max(coveredRegionStart,scheduleEntry.start); + coveredRegionStop = Math.min(coveredRegionStop,scheduleEntry.stop); + coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop); + + nextFilePointer.addFileSpans(scheduleEntry.fileSpans); + } + else { + // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. + for(SAMReaderID reader: indexFiles.keySet()) + nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); + } + + // Early exit if no bins were found. + if(coveredRegion == null) { + // for debugging only: maximum split is 16384. + nextFilePointer.addLocation(currentLocus); + currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; + continue; + } + + // Early exit if only part of the first interval was found. + if(currentLocus.startsBefore(coveredRegion)) { + int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart(); + GenomeLoc[] splitContigs = currentLocus.split(splitPoint); + nextFilePointer.addLocation(splitContigs[0]); + currentLocus = splitContigs[1]; + continue; + } + + // Define the initial range of the file pointer, aka the region where the locus currently being processed intersects the BAM list. + GenomeLoc initialLocation = currentLocus.intersect(coveredRegion); + nextFilePointer.addLocation(initialLocation); + + // See whether the BAM regions discovered overlap the next set of intervals in the interval list. If so, include every overlapping interval. + if(!nextFilePointer.locations.isEmpty()) { + while(locusIterator.hasNext() && locusIterator.peek().overlapsP(coveredRegion)) { + currentLocus = locusIterator.next(); + nextFilePointer.addLocation(currentLocus.intersect(coveredRegion)); + } + + // Chop off the uncovered portion of the locus. 
Since we know that the covered region overlaps the current locus, + // we can simplify the interval creation process to the end of the covered region to the stop of the given interval. + if(coveredRegionStop < currentLocus.getStop()) + currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop()); + else if(locusIterator.hasNext()) + currentLocus = locusIterator.next(); + else + currentLocus = null; + } + + } + } + + + /** + * The last reference sequence processed by this iterator. + */ + private Integer lastReferenceSequenceLoaded = null; + + /** + * The stateful iterator used to progress through the genoem. + */ + private PeekableIterator bamScheduleIterator = null; + + /** + * Clean up underlying BAMSchedule file handles. + */ + public void close() { + if(bamScheduleIterator != null) + bamScheduleIterator.close(); + } + + /** + * Get the next overlapping tree of bins associated with the given BAM file. + * @param currentLocus The actual locus for which to check overlap. + * @return The next schedule entry overlapping with the given list of loci. + */ + private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) { + // Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name. + // This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then + // we'll be using the correct contig index for the BAMs. + // TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing. 
+ SAMSequenceRecord currentContigSequenceRecord = dataSource.getHeader().getSequence(currentLocus.getContig()); + if ( currentContigSequenceRecord == null ) { + throw new UserException(String.format("Contig %s not present in sequence dictionary for merged BAM header: %s", + currentLocus.getContig(), + ReadUtils.prettyPrintSequenceRecords(dataSource.getHeader().getSequenceDictionary()))); + } + + final int currentContigIndex = currentContigSequenceRecord.getSequenceIndex(); + + // Stale reference sequence or first invocation. (Re)create the binTreeIterator. + if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) { + if(bamScheduleIterator != null) + bamScheduleIterator.close(); + lastReferenceSequenceLoaded = currentContigIndex; + + // Naive algorithm: find all elements in current contig for proper schedule creation. + List lociInContig = new LinkedList(); + for(GenomeLoc locus: loci) { + if (!GenomeLoc.isUnmapped(locus) && dataSource.getHeader().getSequence(locus.getContig()) == null) + throw new ReviewedGATKException("BAM file(s) do not have the contig: " + locus.getContig() + ". You are probably using a different reference than the one this file was aligned with"); + + if (!GenomeLoc.isUnmapped(locus) && dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded) + lociInContig.add(locus); + } + + bamScheduleIterator = new PeekableIterator(new BAMSchedule(dataSource,lociInContig)); + } + + if(!bamScheduleIterator.hasNext()) + return null; + + // Peek the iterator along until finding the first binTree at or following the current locus. + BAMScheduleEntry bamScheduleEntry = bamScheduleIterator.peek(); + while(bamScheduleEntry != null && bamScheduleEntry.isBefore(currentLocus)) { + bamScheduleIterator.next(); + bamScheduleEntry = bamScheduleIterator.hasNext() ? bamScheduleIterator.peek() : null; + } + + return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? 
bamScheduleEntry : null; + } + + /** + * Create a span from the given start point to the end of the file. + * @param startOfRegion Start of the region, in encoded coordinates (block start << 16 & block offset). + * @return A file span from the given point to the end of the file. + */ + private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) { + return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE)); + } + +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BGZFBlockLoadingDispatcher.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BGZFBlockLoadingDispatcher.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BGZFBlockLoadingDispatcher.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BGZFBlockLoadingDispatcher.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java new file mode 100644 index 000000000..125d4f731 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java @@ -0,0 +1,451 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or 
substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.util.BlockCompressedInputStream; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * Presents decompressed blocks to the SAMFileReader. + */ +public class BlockInputStream extends InputStream { + /** + * Mechanism for triggering block loads. + */ + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * The reader whose data is supplied by this input stream. + */ + private final SAMReaderID reader; + + /** + * Length of the input stream. + */ + private final long length; + + /** + * The latest error reported by an asynchronous block load. + */ + private Throwable error; + + /** + * Current accessPlan. + */ + private BAMAccessPlan accessPlan; + + /** + * A stream of compressed data blocks. + */ + private final ByteBuffer buffer; + + /** + * Offsets of the given blocks in the buffer. + */ + private LinkedList blockOffsets = new LinkedList(); + + /** + * Source positions of the given blocks in the buffer. 
+ */ + private LinkedList blockPositions = new LinkedList(); + + /** + * Provides a lock to wait for more data to arrive. + */ + private final Object lock = new Object(); + + /** + * An input stream to use when comparing data back to what it should look like. + */ + private final BlockCompressedInputStream validatingInputStream; + + /** + * Create a new block presenting input stream with a dedicated buffer. + * @param dispatcher the block loading messenger. + * @param reader the reader for which to load data. + * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream. + */ + BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) { + this.reader = reader; + this.length = reader.getSamFile().length(); + + buffer = ByteBuffer.wrap(new byte[64*1024]); + buffer.order(ByteOrder.LITTLE_ENDIAN); + + // The state of the buffer assumes that the range of data written into the buffer appears in the range + // [position,limit), while extra capacity exists in the range [limit,capacity) + buffer.limit(0); + + this.dispatcher = dispatcher; + // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream. + this.accessPlan = new BAMAccessPlan(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); + + // The block offsets / block positions guarantee that the ending offset/position in the data structure maps to + // the point in the file just following the last read. These two arrays should never be empty; initializing + // to 0 to match the position above. 
+ this.blockOffsets.add(0); + this.blockPositions.add(0L); + + try { + if(validate) { + System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this); + validatingInputStream = new BlockCompressedInputStream(reader.getSamFile()); + // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE. + // Poke the stream to start reading data. + validatingInputStream.available(); + } + else + validatingInputStream = null; + } + catch(IOException ex) { + throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); + } + } + + public long length() { + return length; + } + + public long getFilePointer() { + long filePointer; + synchronized(lock) { + // Find the current block within the input stream. + int blockIndex; + for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() > blockOffsets.get(blockIndex+1); blockIndex++) + ; + filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex)); + } + +// if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer()) +// throw new ReviewedGATKException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)", +// BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer()), +// BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer))); + + return filePointer; + } + + private void clearBuffers() { + this.accessPlan.reset(); + + // Buffer semantics say that outside of a lock, buffer should always be prepared for reading. + // Indicate no data to be read. 
+ buffer.clear(); + buffer.limit(0); + + // Clear everything except the last block offset / position + blockOffsets.clear(); + blockOffsets.add(0); + while(blockPositions.size() > 1) + blockPositions.removeFirst(); + } + + public boolean eof() { + synchronized(lock) { + // TODO: Handle multiple empty BGZF blocks at end of the file. + return accessPlan != null && (accessPlan.getBlockAddress() < 0 || accessPlan.getBlockAddress() >= length); + } + } + + /** + * Submits a new access plan for the given dataset and seeks to the given point. + * @param accessPlan The next seek point for BAM data in this reader. + */ + public void submitAccessPlan(final BAMAccessPlan accessPlan) { + //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress()); + this.accessPlan = accessPlan; + accessPlan.reset(); + + clearBuffers(); + + // Pull the iterator past any oddball chunks at the beginning of the shard (chunkEnd < chunkStart, empty chunks, etc). + // TODO: Don't pass these empty chunks in. + accessPlan.advancePosition(makeFilePointer(accessPlan.getBlockAddress(),0)); + + if(accessPlan.getBlockAddress() >= 0) { + waitForBufferFill(); + } + + if(validatingInputStream != null) { + try { + validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(),0)); + } + catch(IOException ex) { + throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); + } + } + + } + + + private void compactBuffer() { + // Compact buffer to maximize storage space. + int bytesToRemove = 0; + + // Look ahead to see if we can compact away the first blocks in the series. + while(blockOffsets.size() > 1 && buffer.position() >= blockOffsets.get(1)) { + blockOffsets.remove(); + blockPositions.remove(); + bytesToRemove = blockOffsets.peek(); + } + + // If we end up with an empty block at the end of the series, compact this as well. 
+ if(buffer.remaining() == 0 && blockOffsets.size() > 1 && buffer.position() >= blockOffsets.peek()) { + bytesToRemove += buffer.position(); + blockOffsets.remove(); + blockPositions.remove(); + } + + int finalBufferStart = buffer.position() - bytesToRemove; + int finalBufferSize = buffer.remaining(); + + // Position the buffer to remove the unneeded data, and compact it away. + buffer.position(bytesToRemove); + buffer.compact(); + + // Reset the limits for reading. + buffer.position(finalBufferStart); + buffer.limit(finalBufferStart+finalBufferSize); + + // Shift everything in the offset buffer down to accommodate the bytes removed from the buffer. + for(int i = 0; i < blockOffsets.size(); i++) + blockOffsets.set(i,blockOffsets.get(i)-bytesToRemove); + } + + /** + * Push contents of incomingBuffer into the end of this buffer. + * MUST be called from a thread that is NOT the reader thread. + * @param incomingBuffer The data being pushed into this input stream. + * @param accessPlan target access plan for the data. 
+ * @param filePosition the current position of the file pointer + */ + public void copyIntoBuffer(final ByteBuffer incomingBuffer, final BAMAccessPlan accessPlan, final long filePosition) { + synchronized(lock) { + try { + if(validatingInputStream != null) { + byte[] validBytes = new byte[incomingBuffer.remaining()]; + + byte[] currentBytes = new byte[incomingBuffer.remaining()]; + int pos = incomingBuffer.position(); + int lim = incomingBuffer.limit(); + incomingBuffer.get(currentBytes); + + incomingBuffer.limit(lim); + incomingBuffer.position(pos); + + long currentFilePointer = validatingInputStream.getFilePointer(); + validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(), 0)); + validatingInputStream.read(validBytes); + validatingInputStream.seek(currentFilePointer); + + if(!Arrays.equals(validBytes,currentBytes)) + throw new ReviewedGATKException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this)); + } + + compactBuffer(); + // Open up the buffer for more reading. + buffer.limit(buffer.capacity()); + + // Get the spans overlapping this particular block... + List spansOverlapping = accessPlan.getSpansOverlappingBlock(accessPlan.getBlockAddress(),filePosition); + + // ...and advance the block + this.accessPlan = accessPlan; + accessPlan.advancePosition(makeFilePointer(filePosition, 0)); + + if(buffer.remaining() < incomingBuffer.remaining()) + lock.wait(); + + final int bytesInIncomingBuffer = incomingBuffer.limit(); + + for(GATKChunk spanOverlapping: spansOverlapping) { + // Clear out the endcap tracking state and add in the starting position for this transfer. + blockOffsets.removeLast(); + blockOffsets.add(buffer.position()); + blockPositions.removeLast(); + blockPositions.add(spanOverlapping.getChunkStart()); + + // Stream the buffer into the data stream. + incomingBuffer.limit((spanOverlapping.getBlockEnd() > spanOverlapping.getBlockStart()) ? 
bytesInIncomingBuffer : spanOverlapping.getBlockOffsetEnd()); + incomingBuffer.position(spanOverlapping.getBlockOffsetStart()); + buffer.put(incomingBuffer); + + // Add the endcap for this transfer. + blockOffsets.add(buffer.position()); + blockPositions.add(spanOverlapping.getChunkEnd()); + } + + // Set up the buffer for reading. + buffer.flip(); + + lock.notify(); + } + catch(Exception ex) { + reportException(ex); + lock.notify(); + } + } + } + + void reportException(Throwable t) { + synchronized(lock) { + this.error = t; + lock.notify(); + } + } + + private void checkForErrors() { + synchronized(lock) { + if(error != null) { + ReviewedGATKException toThrow = new ReviewedGATKException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error); + toThrow.setStackTrace(error.getStackTrace()); + throw toThrow; + } + } + } + + /** + * Reads the next byte of data from the input stream. + * @return Next byte of data, from 0->255, as an int. + */ + @Override + public int read() { + byte[] singleByte = new byte[1]; + read(singleByte); + return singleByte[0]; + } + + /** + * Fills the given byte array to the extent possible. + * @param bytes byte array to be filled. + * @return The number of bytes actually read. + */ + @Override + public int read(byte[] bytes) { + return read(bytes,0,bytes.length); + } + + @Override + public int read(byte[] bytes, final int offset, final int length) { + int remaining = length; + synchronized(lock) { + while(remaining > 0) { + // Check for error conditions during last read. + checkForErrors(); + + // If completely out of space, queue up another buffer fill. + waitForBufferFill(); + + // Couldn't manage to load any data at all; abort and return what's available. 
+ if(buffer.remaining() == 0) + break; + + int numBytesToCopy = Math.min(buffer.remaining(),remaining); + buffer.get(bytes,length-remaining+offset,numBytesToCopy); + remaining -= numBytesToCopy; + + //if(remaining > 0) + // System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length); + // TODO: Assert that we don't copy across a block boundary + } + + // Notify any waiting threads that some of the contents of the buffer were removed. + if(length-remaining > 0) + lock.notify(); + } + +// if(validatingInputStream != null) { +// byte[] validBytes = new byte[length]; +// try { +// validatingInputStream.read(validBytes,offset,length); +// for(int i = offset; i < offset+length; i++) { +// if(bytes[i] != validBytes[i]) +// throw new ReviewedGATKException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i)); +// } +// } +// catch(IOException ex) { +// throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); +// } +// } + + // If any data was copied into the buffer, return the amount of data copied. + if(remaining < length) + return length - remaining; + + // Otherwise, return -1. 
+ return -1; + } + + public void close() { + if(validatingInputStream != null) { + try { + validatingInputStream.close(); + } + catch(IOException ex) { + throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); + } + } + } + + public String getSource() { + return reader.getSamFilePath(); + } + + private void waitForBufferFill() { + synchronized(lock) { + if(buffer.remaining() == 0 && !eof()) { + //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this); + dispatcher.queueBlockLoad(accessPlan); + try { + lock.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedGATKException("Interrupt occurred waiting for buffer to fill",ex); + } + } + } + } + + /** + * Create an encoded BAM file pointer given the address of a BGZF block and an offset. + * @param blockAddress Physical address on disk of a BGZF block. + * @param blockOffset Offset into the uncompressed data stored in the BGZF block. + * @return 64-bit pointer encoded according to the BAM spec. 
+ */ + public static long makeFilePointer(final long blockAddress, final int blockOffset) { + return blockAddress << 16 | blockOffset; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockLoader.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockLoader.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockLoader.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockLoader.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java new file mode 100644 index 000000000..7f6653888 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java @@ -0,0 +1,229 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.GATKException; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.io.FileInputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * Caches frequently used file handles. Right now, caches only a single file handle. + * TODO: Generalize to support arbitrary file handle caches. + */ +public class FileHandleCache { + /** + * The underlying data structure storing file handles. + */ + private final FileHandleStorage fileHandleStorage; + + /** + * How many file handles should be kept open at once. + */ + private final int cacheSize; + + /** + * A uniquifier: assign a unique ID to every instance of a file handle. + */ + private final Map keyCounter = new HashMap(); + + /** + * A shared lock, private so that outside users cannot notify it. + */ + private final Object lock = new Object(); + + /** + * Indicates how many file handles are outstanding at this point. + */ + private int numOutstandingFileHandles = 0; + + /** + * Create a new file handle cache of the given cache size. + * @param cacheSize how many readers to hold open at once. + */ + public FileHandleCache(final int cacheSize) { + this.cacheSize = cacheSize; + fileHandleStorage = new FileHandleStorage(); + } + + /** + * Retrieves or opens a file handle for the given reader ID. + * @param key The ke + * @return A file input stream from the cache, if available, or otherwise newly opened. 
+ */ + public FileInputStream claimFileInputStream(final SAMReaderID key) { + synchronized(lock) { + FileInputStream inputStream = findExistingEntry(key); + if(inputStream == null) { + try { + // If the cache is maxed out, wait for another file handle to emerge. + if(numOutstandingFileHandles >= cacheSize) + lock.wait(); + } + catch(InterruptedException ex) { + throw new ReviewedGATKException("Interrupted while waiting for a file handle"); + } + inputStream = openInputStream(key); + } + numOutstandingFileHandles++; + + //System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId()); + return inputStream; + } + } + + /** + * Releases the current reader and returns it to the cache. + * @param key The reader. + * @param inputStream The stream being used. + */ + public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) { + synchronized(lock) { + numOutstandingFileHandles--; + UniqueKey newID = allocateKey(key); + fileHandleStorage.put(newID,inputStream); + // Let any listeners know that another file handle has become available. + lock.notify(); + } + } + + /** + * Finds an existing entry in the storage mechanism. + * @param key Reader. + * @return a cached stream, if available. Otherwise, + */ + private FileInputStream findExistingEntry(final SAMReaderID key) { + int existingHandles = getMostRecentUniquifier(key); + + // See if any of the keys currently exist in the repository. + for(int i = 0; i <= existingHandles; i++) { + UniqueKey uniqueKey = new UniqueKey(key,i); + if(fileHandleStorage.containsKey(uniqueKey)) + return fileHandleStorage.remove(uniqueKey); + } + + return null; + } + + /** + * Gets the most recent uniquifier used for the given reader. + * @param reader Reader for which to determine uniqueness. 
+ * @return + */ + private int getMostRecentUniquifier(final SAMReaderID reader) { + if(keyCounter.containsKey(reader)) + return keyCounter.get(reader); + else return -1; + } + + private UniqueKey allocateKey(final SAMReaderID reader) { + int uniquifier = getMostRecentUniquifier(reader)+1; + keyCounter.put(reader,uniquifier); + return new UniqueKey(reader,uniquifier); + } + + private FileInputStream openInputStream(final SAMReaderID reader) { + try { + return new FileInputStream(reader.getSamFilePath()); + } + catch(IOException ex) { + throw new GATKException("Unable to open input file"); + } + } + + private void closeInputStream(final FileInputStream inputStream) { + try { + inputStream.close(); + } + catch(IOException ex) { + throw new GATKException("Unable to open input file"); + } + } + + /** + * Actually contains the file handles, purging them as they get too old. + */ + private class FileHandleStorage extends LinkedHashMap { + /** + * Remove the oldest entry + * @param entry Entry to consider removing. + * @return True if the cache size has been exceeded. False otherwise. + */ + @Override + protected boolean removeEldestEntry(Map.Entry entry) { + synchronized (lock) { + if(size() > cacheSize) { + keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1); + closeInputStream(entry.getValue()); + + return true; + } + } + return false; + } + } + + /** + * Uniquifies a key by adding a numerical uniquifier. + */ + private class UniqueKey { + /** + * The file handle's key. + */ + private final SAMReaderID key; + + /** + * A uniquifier, so that multiple of the same reader can exist in the cache. 
+ */ + private final int uniqueID; + + public UniqueKey(final SAMReaderID reader, final int uniqueID) { + this.key = reader; + this.uniqueID = uniqueID; + } + + @Override + public boolean equals(Object other) { + if(!(other instanceof UniqueKey)) + return false; + UniqueKey otherUniqueKey = (UniqueKey)other; + return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID; + } + + @Override + public int hashCode() { + return key.hashCode(); + } + } + + + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java new file mode 100644 index 000000000..4ea4aabf9 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java @@ -0,0 +1,437 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import htsjdk.samtools.SAMFileSpan; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.interval.IntervalUtils; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.*; + +/** + * Represents a small section of a BAM file, and every associated interval. + */ +public class FilePointer { + protected final SortedMap fileSpans = new TreeMap(); + protected final List locations = new ArrayList(); + protected final IntervalMergingRule intervalMergingRule; + + /** + * Does this file pointer point into an unmapped region? + */ + protected final boolean isRegionUnmapped; + + /** + * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will + * ever visit during this GATK run? If this is set to true, the engine will expect to see only this + * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals + * from more than one contig. + */ + private boolean isMonolithic = false; + + /** + * Index of the contig covered by this FilePointer. 
Only meaningful for non-monolithic, mapped FilePointers + */ + private Integer contigIndex = null; + + + public FilePointer( final IntervalMergingRule mergeRule, final List locations ) { + this.intervalMergingRule = mergeRule; + this.locations.addAll(locations); + this.isRegionUnmapped = checkUnmappedStatus(); + + validateAllLocations(); + if ( locations.size() > 0 ) { + contigIndex = locations.get(0).getContigIndex(); + } + } + + public FilePointer( final IntervalMergingRule mergeRule, final GenomeLoc... locations ) { + this(mergeRule, Arrays.asList(locations)); + } + + public FilePointer( final Map fileSpans, final IntervalMergingRule mergeRule, final List locations ) { + this(mergeRule, locations); + this.fileSpans.putAll(fileSpans); + } + + private boolean checkUnmappedStatus() { + boolean foundMapped = false, foundUnmapped = false; + + for( GenomeLoc location: locations ) { + if ( GenomeLoc.isUnmapped(location) ) + foundUnmapped = true; + else + foundMapped = true; + } + if ( foundMapped && foundUnmapped ) + throw new ReviewedGATKException("BUG: File pointers cannot be mixed mapped/unmapped."); + + return foundUnmapped; + } + + private void validateAllLocations() { + // Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction + if ( isRegionUnmapped || isMonolithic ) { + return; + } + + Integer previousContigIndex = null; + + for ( GenomeLoc location : locations ) { + if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) { + throw new ReviewedGATKException("Non-monolithic file pointers must contain intervals from at most one contig"); + } + + previousContigIndex = location.getContigIndex(); + } + } + + private void validateLocation( GenomeLoc location ) { + if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) { + throw new ReviewedGATKException("BUG: File pointers cannot be mixed mapped/unmapped."); + } + if ( ! isRegionUnmapped && ! 
isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) { + throw new ReviewedGATKException("Non-monolithic file pointers must contain intervals from at most one contig"); + } + } + + /** + * Returns an immutable view of this FilePointer's file spans + * + * @return an immutable view of this FilePointer's file spans + */ + public Map getFileSpans() { + return Collections.unmodifiableMap(fileSpans); + } + + /** + * Returns an immutable variant of the list of locations. + * @return + */ + public List getLocations() { + return Collections.unmodifiableList(locations); + } + + /** + * Returns the index of the contig into which this FilePointer points (a FilePointer can represent + * regions in at most one contig). + * + * @return the index of the contig into which this FilePointer points + */ + public int getContigIndex() { + return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; + } + + /** + * Returns the IntervalMergingRule used by this FilePointer to merge adjacent locations + * + * @return the IntervalMergingRule used by this FilePointer (never null) + */ + public IntervalMergingRule getIntervalMergingRule() { + return intervalMergingRule; + } + + /** + * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will + * ever visit during this GATK run? If this is set to true, the engine will expect to see only this + * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals + * from more than one contig. + * + * @return true if this FP is a monolithic FP representing all regions in all files, otherwise false + */ + public boolean isMonolithic() { + return isMonolithic; + } + + /** + * Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all + * regions in all files that we will ever visit, and is the only FP we will ever create. 
A monolithic + * FP may contain intervals from more than one contig. + * + * @param isMonolithic set this FP's monolithic status to this value + */ + public void setIsMonolithic( boolean isMonolithic ) { + this.isMonolithic = isMonolithic; + } + + @Override + public boolean equals(final Object other) { + if(!(other instanceof FilePointer)) + return false; + FilePointer otherFilePointer = (FilePointer)other; + + // intervals + if(this.locations.size() != otherFilePointer.locations.size()) + return false; + for(int i = 0; i < locations.size(); i++) { + if(!this.locations.get(i).equals(otherFilePointer.locations.get(i))) + return false; + } + + // fileSpans + if(this.fileSpans.size() != otherFilePointer.fileSpans.size()) + return false; + Iterator> thisEntries = this.fileSpans.entrySet().iterator(); + Iterator> otherEntries = otherFilePointer.fileSpans.entrySet().iterator(); + while(thisEntries.hasNext() || otherEntries.hasNext()) { + if(!thisEntries.next().equals(otherEntries.next())) + return false; + } + + return true; + } + + public void addLocation(final GenomeLoc location) { + validateLocation(location); + + this.locations.add(location); + if ( contigIndex == null ) { + contigIndex = location.getContigIndex(); + } + } + + public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) { + this.fileSpans.put(id,fileSpan); + } + + public void addFileSpans(final Map fileSpans) { + this.fileSpans.putAll(fileSpans); + } + + + /** + * Computes the size of this file span, in uncompressed bytes. + * @return Size of the file span. + */ + public long size() { + long size = 0L; + for(SAMFileSpan fileSpan: fileSpans.values()) + size += ((GATKBAMFileSpan)fileSpan).size(); + return size; + } + + /** + * Returns the difference in size between two filespans. + * @param other Other filespan against which to measure. + * @return The difference in size between the two file pointers. 
+ */ + public long minus(final FilePointer other) { + long difference = 0; + PeekableIterator> thisIterator = new PeekableIterator>(this.fileSpans.entrySet().iterator()); + PeekableIterator> otherIterator = new PeekableIterator>(other.fileSpans.entrySet().iterator()); + + while(thisIterator.hasNext()) { + // If there are no elements left in the 'other' iterator, spin out this iterator. + if(!otherIterator.hasNext()) { + GATKBAMFileSpan nextSpan = (GATKBAMFileSpan)thisIterator.next().getValue(); + difference += nextSpan.size(); + continue; + } + + // Otherwise, compare the latest value. + int compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey()); + + if(compareValue < 0) { + // This before other. + difference += ((GATKBAMFileSpan)thisIterator.next().getValue()).size(); + } + else if(compareValue > 0) { + // Other before this. + difference += ((GATKBAMFileSpan)otherIterator.next().getValue()).size(); + } + else { + // equality; difference the values. + GATKBAMFileSpan thisRegion = (GATKBAMFileSpan)thisIterator.next().getValue(); + GATKBAMFileSpan otherRegion = (GATKBAMFileSpan)otherIterator.next().getValue(); + difference += Math.abs(thisRegion.minus(otherRegion).size()); + } + } + return difference; + } + + /** + * Combines two file pointers into one. + * @param parser The genomelocparser to use when manipulating intervals. + * @param other File pointer to combine into this one. + * @return A completely new file pointer that is the combination of the two. 
+ */ + public FilePointer combine(final GenomeLocParser parser, final FilePointer other) { + FilePointer combined = new FilePointer(intervalMergingRule); + + List intervals = new ArrayList(); + intervals.addAll(locations); + intervals.addAll(other.locations); + for(GenomeLoc interval: IntervalUtils.sortAndMergeIntervals(parser,intervals,intervalMergingRule)) + combined.addLocation(interval); + + PeekableIterator> thisIterator = new PeekableIterator>(this.fileSpans.entrySet().iterator()); + PeekableIterator> otherIterator = new PeekableIterator>(other.fileSpans.entrySet().iterator()); + + while(thisIterator.hasNext() || otherIterator.hasNext()) { + int compareValue; + if(!otherIterator.hasNext()) { + compareValue = -1; + } + else if(!thisIterator.hasNext()) + compareValue = 1; + else + compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey()); + + // This before other. + if(compareValue < 0) + mergeElementsInto(combined,thisIterator); + // Other before this. + else if(compareValue > 0) + mergeElementsInto(combined,otherIterator); + // equality; union the values. + else + mergeElementsInto(combined,thisIterator,otherIterator); + } + return combined; + } + + /** + * Roll the next element in the iterator into the combined entry. + * @param combined Entry into which to roll the next element. + * @param iterators Sources of next elements. + */ + private void mergeElementsInto(final FilePointer combined, Iterator>... iterators) { + if(iterators.length == 0) + throw new ReviewedGATKException("Tried to add zero elements to an existing file pointer."); + Map.Entry initialElement = iterators[0].next(); + GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)initialElement.getValue(); + for(int i = 1; i < iterators.length; i++) + fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue()); + combined.addFileSpans(initialElement.getKey(),fileSpan); + } + + /** + * Efficiently generate the union of the n FilePointers passed in. 
Much more efficient than + * combining two FilePointers at a time using the combine() method above. + * + * IMPORTANT: the FilePointers to be unioned must either all represent regions on the + * same contig, or all be unmapped, since we cannot create FilePointers with a mix of + * contigs or with mixed mapped/unmapped regions. + * + * @param filePointers the FilePointers to union + * @param parser our GenomeLocParser + * @return the union of the FilePointers passed in + */ + public static FilePointer union( List filePointers, GenomeLocParser parser ) { + if ( filePointers == null || filePointers.isEmpty() ) { + return new FilePointer(IntervalMergingRule.ALL); + } + + Map> fileChunks = new HashMap>(); + List locations = new ArrayList(); + IntervalMergingRule mergeRule = filePointers.get(0).getIntervalMergingRule(); + + // First extract all intervals and file chunks from the FilePointers into unsorted, unmerged collections + for ( FilePointer filePointer : filePointers ) { + locations.addAll(filePointer.getLocations()); + if (mergeRule != filePointer.getIntervalMergingRule()) + throw new ReviewedGATKException("All FilePointers in FilePointer.union() must have use the same IntervalMergeRule"); + + for ( Map.Entry fileSpanEntry : filePointer.getFileSpans().entrySet() ) { + GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)fileSpanEntry.getValue(); + + if ( fileChunks.containsKey(fileSpanEntry.getKey()) ) { + fileChunks.get(fileSpanEntry.getKey()).addAll(fileSpan.getGATKChunks()); + } + else { + fileChunks.put(fileSpanEntry.getKey(), fileSpan.getGATKChunks()); + } + } + } + + // Now sort and merge the intervals + List sortedMergedLocations = new ArrayList(); + sortedMergedLocations.addAll(IntervalUtils.sortAndMergeIntervals(parser, locations, mergeRule)); + + // For each BAM file, convert from an unsorted, unmerged list of chunks to a GATKBAMFileSpan containing + // the sorted, merged union of the chunks for that file + Map mergedFileSpans = new HashMap(fileChunks.size()); + 
for ( Map.Entry> fileChunksEntry : fileChunks.entrySet() ) { + List unmergedChunks = fileChunksEntry.getValue(); + mergedFileSpans.put(fileChunksEntry.getKey(), + (new GATKBAMFileSpan(unmergedChunks.toArray(new GATKChunk[unmergedChunks.size()]))).union(new GATKBAMFileSpan())); + } + + return new FilePointer(mergedFileSpans, mergeRule, sortedMergedLocations); + } + + /** + * Returns true if any of the file spans in this FilePointer overlap their counterparts in + * the other FilePointer. "Overlap" is defined as having an overlapping extent (the region + * from the start of the first chunk to the end of the last chunk). + * + * @param other the FilePointer against which to check overlap with this FilePointer + * @return true if any file spans overlap their counterparts in other, otherwise false + */ + public boolean hasFileSpansOverlappingWith( FilePointer other ) { + for ( Map.Entry thisFilePointerEntry : fileSpans.entrySet() ) { + GATKBAMFileSpan thisFileSpan = new GATKBAMFileSpan(thisFilePointerEntry.getValue()); + + SAMFileSpan otherEntry = other.fileSpans.get(thisFilePointerEntry.getKey()); + if ( otherEntry == null ) { + continue; // no counterpart for this file span in other + } + GATKBAMFileSpan otherFileSpan = new GATKBAMFileSpan(otherEntry); + + if ( thisFileSpan.getExtent().overlaps(otherFileSpan.getExtent()) ) { + return true; + } + } + + return false; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("FilePointer:\n"); + builder.append("\tlocations = {"); + builder.append(Utils.join(";",locations)); + builder.append("}\n\tregions = \n"); + for(Map.Entry entry: fileSpans.entrySet()) { + builder.append(entry.getKey()); + builder.append("= {"); + builder.append(entry.getValue()); + builder.append("}"); + } + return builder.toString(); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndex.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexData.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexData.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexData.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexData.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIterator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalSharder.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalSharder.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalSharder.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalSharder.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java new file mode 100644 index 000000000..4714df9b7 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java @@ -0,0 +1,61 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.SAMFileSpan; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.List; +import java.util.Map; + +/** + * Handles locus shards of BAM information. + * @author aaron + * @version 1.0 + * @date Apr 7, 2009 + */ +public class LocusShard extends Shard { + /** + * Create a new locus shard, divided by index. 
+ * @param intervals List of intervals to process. + * @param fileSpans File spans associated with that interval. + */ + public LocusShard(GenomeLocParser parser, SAMDataSource dataSource, List intervals, Map fileSpans) { + super(parser, ShardType.LOCUS, intervals, dataSource, fileSpans, false); + } + + /** + * String representation of this shard. + * @return A string representation of the boundaries of this shard. + */ + @Override + public String toString() { + return Utils.join(";",getGenomeLocs()); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShardBalancer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShardBalancer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShardBalancer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShardBalancer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java new file mode 100644 index 000000000..d8ae3bf55 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java @@ -0,0 +1,271 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all 
copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.*; + +/** + * + * User: aaron + * Date: Apr 10, 2009 + * Time: 5:03:13 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + +/** + * Expresses a shard of read data in block format. + * + * @author mhanna + * @version 0.1 + */ +public class ReadShard extends Shard { + + /** + * Default read shard buffer size + */ + public static final int DEFAULT_MAX_READS = 10000; + + /** + * What is the maximum number of reads per BAM file which should go into a read shard. 
+ * + * TODO: this non-final static variable should either be made final or turned into an + * TODO: instance variable somewhere -- as both static and mutable it wreaks havoc + * TODO: with tests that use multiple instances of SAMDataSource (since SAMDataSource + * TODO: changes this value) + */ + public static int MAX_READS = DEFAULT_MAX_READS; + + /** + * The reads making up this shard. + */ + private final Collection reads = new ArrayList(MAX_READS); + + public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { + super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped); + } + + /** + * Sets the maximum number of reads buffered in a read shard. Implemented as a weirdly static interface + * until we know what effect tuning this parameter has. + * + * TODO: this mutable static interface is awful and breaks tests -- need to refactor + * + * @param bufferSize New maximum number + */ + static void setReadBufferSize(final int bufferSize) { + MAX_READS = bufferSize; + } + + /** + * What read buffer size are we using? + * + * @return + */ + public static int getReadBufferSize() { + return MAX_READS; + } + + /** + * Returns true if this shard is meant to buffer reads, rather + * than just holding pointers to their locations. + * @return True if this shard can buffer reads. False otherwise. + */ + public boolean buffersReads() { + return true; + } + + /** + * Returns true if the read buffer is currently full. + * @return True if this shard's buffer is full (and the shard can buffer reads). + */ + public boolean isBufferEmpty() { + return reads.size() == 0; + } + + /** + * Returns true if the read buffer is currently full. + * @return True if this shard's buffer is full (and the shard can buffer reads). + */ + public boolean isBufferFull() { + return reads.size() > ReadShard.MAX_READS; + } + + /** + * Adds a read to the read buffer. + * @param read Add a read to the internal shard buffer. 
+ */ + public void addRead(SAMRecord read) { + // DO NOT validate that the buffer is full. Paired read sharding will occasionally have to stuff another + // read or two into the buffer. + reads.add(read); + } + + /** + * Fills this shard's buffer with reads from the iterator passed in + * + * @param readIter Iterator from which to draw the reads to fill the shard + */ + @Override + public void fill( PeekableIterator readIter ) { + if( ! buffersReads() ) + throw new ReviewedGATKException("Attempting to fill a non-buffering shard."); + + SAMFileHeader.SortOrder sortOrder = getReadProperties().getSortOrder(); + SAMRecord read = null; + + while( ! isBufferFull() && readIter.hasNext() ) { + final SAMRecord nextRead = readIter.peek(); + if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) { + // only add reads to the shard if they are on the same contig + read = readIter.next(); + addRead(read); + } else { + break; + } + } + + // If the reads are sorted in coordinate order, ensure that all reads + // having the same alignment start become part of the same shard, to allow + // downsampling to work better across shard boundaries. Note that because our + // read stream has already been fed through the positional downsampler, which + // ensures that at each alignment start position there are no more than dcov + // reads, we're in no danger of accidentally creating a disproportionately huge + // shard + if ( sortOrder == SAMFileHeader.SortOrder.coordinate ) { + while ( readIter.hasNext() ) { + SAMRecord additionalRead = readIter.peek(); + + // Stop filling the shard as soon as we encounter a read having a different + // alignment start or contig from the last read added in the earlier loop + // above, or an unmapped read + if ( read == null || + additionalRead.getReadUnmappedFlag() || + ! 
additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) || + additionalRead.getAlignmentStart() != read.getAlignmentStart() ) { + break; + } + + addRead(readIter.next()); + } + } + + // If the reads are sorted in queryname order, ensure that all reads + // having the same queryname become part of the same shard. + if( sortOrder == SAMFileHeader.SortOrder.queryname ) { + while( readIter.hasNext() ) { + SAMRecord nextRead = readIter.peek(); + if( read == null || ! read.getReadName().equals(nextRead.getReadName()) ) + break; + addRead(readIter.next()); + } + } + } + + /** + * Creates an iterator over reads stored in this shard's read cache. + * @return + */ + public GATKSAMIterator iterator() { + return GATKSAMIteratorAdapter.adapt(reads.iterator()); + } + + /** + * String representation of this shard. + * @return A string representation of the boundaries of this shard. + */ + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + for(Map.Entry entry: getFileSpans().entrySet()) { + sb.append(entry.getKey()); + sb.append(": "); + sb.append(entry.getValue()); + sb.append(' '); + } + return sb.toString(); + } + + /** + * Get the full span from the start of the left most read to the end of the right most one + * + * Note this may be different than the getLocation() of the shard, as this reflects the + * targeted span, not the actual span of reads + * + * @return the genome loc representing the span of these reads on the genome + */ + public GenomeLoc getReadsSpan() { + if ( isUnmapped() || super.getGenomeLocs() == null || reads.isEmpty() ) + return super.getLocation(); + else { + int start = Integer.MAX_VALUE; + int stop = Integer.MIN_VALUE; + String contig = null; + boolean foundMapped = false; + + for ( final SAMRecord read : reads ) { + if ( contig != null && ! read.getReferenceName().equals(contig) ) + throw new ReviewedGATKException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. 
" + + "First contig is " + contig + " next read was " + read.getReferenceName() ); + contig = read.getReferenceName(); + + // Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates + // of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries, + // this shard might consist *only* of unmapped mates! We need to refrain from using the alignment + // starts/stops of these unmapped mates, and detect the case where the shard has been filled *only* + // with unmapped mates. + if ( ! read.getReadUnmappedFlag() ) { + foundMapped = true; + if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); + if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); + } + } + + assert contig != null; + + if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped + return GenomeLoc.UNMAPPED; + else + return parser.createGenomeLoc(contig, start, stop); + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java new file mode 100644 index 000000000..4c4759b26 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java @@ -0,0 +1,1180 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a 
copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.MergingSamRecordIterator; +import htsjdk.samtools.SamFileHeaderMerger; +import htsjdk.samtools.*; +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.samtools.util.RuntimeIOException; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.ReadMetrics; +import org.broadinstitute.gatk.engine.ReadProperties; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.filters.CountingFilteringIterator; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.*; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.SimpleTimer; +import org.broadinstitute.gatk.engine.iterators.ReadTransformingIterator; +import org.broadinstitute.gatk.utils.downsampling.*; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSamRecordFactory; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.io.File; +import java.util.*; +import java.util.concurrent.Callable; + +/** + * User: aaron + * Date: Mar 26, 2009 + * Time: 2:36:16 PM + *

+ * Converts shards to SAM iterators over the specified region + */ +public class SAMDataSource { + final private static GATKSamRecordFactory factory = new GATKSamRecordFactory(); + + /** Backing support for reads. */ + protected final ReadProperties readProperties; + + /** + * Runtime metrics of reads filtered, etc. + */ + private final ReadMetrics readMetrics; + + /** + * Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering. + */ + protected final GenomeLocParser genomeLocParser; + + /** + * Identifiers for the readers driving this data source. + */ + private final Collection readerIDs; + + /** + * How strict are the readers driving this data source. + */ + private final ValidationStringency validationStringency; + + /** + * Do we want to remove the program records from this data source? + */ + private final boolean removeProgramRecords; + + /** + * Store BAM indices for each reader present. + */ + private final Map bamIndices = new HashMap(); + + /** + * The merged header. + */ + private final SAMFileHeader mergedHeader; + + /** + * The constituent headers of the unmerged files. + */ + private final Map headers = new HashMap(); + + /** + * The sort order of the BAM files. Files without a sort order tag are assumed to be + * in coordinate order. + */ + private SAMFileHeader.SortOrder sortOrder = null; + + /** + * Whether the read groups in overlapping files collide. + */ + private final boolean hasReadGroupCollisions; + + /** + * Maps the SAM readers' merged read group ids to their original ids. Since merged read group ids + * are always unique, we can simply use a map here, no need to stratify by reader. + */ + private final ReadGroupMapping mergedToOriginalReadGroupMappings = new ReadGroupMapping(); + + /** + * Maps the SAM readers' original read group ids to their revised ids. 
This mapping must be stratified + * by readers, since there can be readgroup id collision: different bam files (readers) can list the + * same read group id, which will be disambiguated when these input streams are merged. + */ + private final Map originalToMergedReadGroupMappings = new HashMap(); + + /** + * Mapping from input file path to new sample name. Used only when doing on-the-fly sample renaming. + */ + private Map sampleRenameMap = null; + + /** our log, which we want to capture anything from this class */ + private static Logger logger = Logger.getLogger(SAMDataSource.class); + + /** + * A collection of readers driving the merging process. + */ + private final SAMResourcePool resourcePool; + + /** + * Asynchronously loads BGZF blocks. + */ + private final BGZFBlockLoadingDispatcher dispatcher; + + /** + * How are threads allocated. + */ + private final ThreadAllocation threadAllocation; + + /** + * How are adjacent intervals merged by the sharder? + */ + private final IntervalMergingRule intervalMergingRule; + + /** + * Static set of unsupported programs that create bam files. + * The key is the PG record ID and the value is the name of the tool that created it + */ + private static Map unsupportedPGs = new HashMap<>(); + static { + unsupportedPGs.put("GATK ReduceReads", "ReduceReads"); + } + + /** + * Create a new SAM data source given the supplied read metadata. + * + * For testing purposes + * + * @param samFiles list of reads files. + */ + public SAMDataSource(Collection samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) { + this( + samFiles, + threadAllocation, + numFileHandles, + genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + } + + /** + * See complete constructor. Does not enable BAQ by default. 
+ * + * For testing purposes + */ + public SAMDataSource( + Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, + GenomeLocParser genomeLocParser, + boolean useOriginalBaseQualities, + ValidationStringency strictness, + Integer readBufferSize, + DownsamplingMethod downsamplingMethod, + ValidationExclusion exclusionList, + Collection supplementalFilters, + boolean includeReadsWithDeletionAtLoci) { + this( samFiles, + threadAllocation, + numFileHandles, + genomeLocParser, + useOriginalBaseQualities, + strictness, + readBufferSize, + downsamplingMethod, + exclusionList, + supplementalFilters, + Collections.emptyList(), + includeReadsWithDeletionAtLoci, + (byte) -1, + false, + false, + null, + IntervalMergingRule.ALL); + } + + /** + * Create a new SAM data source given the supplied read metadata. + * @param samFiles list of reads files. + * @param useOriginalBaseQualities True if original base qualities should be used. + * @param strictness Stringency of reads file parsing. + * @param readBufferSize Number of reads to hold in memory per BAM. + * @param downsamplingMethod Method for downsampling reads at a given locus. + * @param exclusionList what safety checks we're willing to let slide + * @param supplementalFilters additional filters to dynamically apply. + * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method + * will explicitly list reads with deletion over the current reference base; otherwise, only observed + * bases will be seen in the pileups, and the deletions will be skipped silently. + * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. + * @param keepReadsInLIBS should we keep a unique list of reads in LIBS? + * @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming. + * Will be null if we're not doing sample renaming. 
+ * @param intervalMergingRule how are adjacent intervals merged by the sharder + */ + public SAMDataSource( + Collection samFiles, + ThreadAllocation threadAllocation, + Integer numFileHandles, + GenomeLocParser genomeLocParser, + boolean useOriginalBaseQualities, + ValidationStringency strictness, + Integer readBufferSize, + DownsamplingMethod downsamplingMethod, + ValidationExclusion exclusionList, + Collection supplementalFilters, + List readTransformers, + boolean includeReadsWithDeletionAtLoci, + byte defaultBaseQualities, + boolean removeProgramRecords, + final boolean keepReadsInLIBS, + final Map sampleRenameMap, + final IntervalMergingRule intervalMergingRule) { + + this.readMetrics = new ReadMetrics(); + this.genomeLocParser = genomeLocParser; + this.intervalMergingRule = intervalMergingRule; + + readerIDs = samFiles; + + this.threadAllocation = threadAllocation; + // TODO: Consider a borrowed-thread dispatcher implementation. + if(this.threadAllocation.getNumIOThreads() > 0) { + logger.info("Running in asynchronous I/O mode; number of threads = " + this.threadAllocation.getNumIOThreads()); + dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1); + } + else + dispatcher = null; + + validationStringency = strictness; + this.removeProgramRecords = removeProgramRecords; + if(readBufferSize != null) + ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests + else { + // Choose a sensible default for the read buffer size. + // Previously we we're picked 100000 reads per BAM per shard with a max cap of 250K reads in memory at once. 
+ // Now we are simply setting it to 100K reads + ReadShard.setReadBufferSize(100000); + } + + this.sampleRenameMap = sampleRenameMap; + + resourcePool = new SAMResourcePool(Integer.MAX_VALUE); + SAMReaders readers = resourcePool.getAvailableReaders(); + + // Determine the sort order. + for(SAMReaderID readerID: readerIDs) { + if (! readerID.getSamFile().canRead() ) + throw new UserException.CouldNotReadInputFile(readerID.getSamFile(),"file is not present or user does not have appropriate permissions. " + + "Please check that the file is present and readable and try again."); + + // Get the sort order, forcing it to coordinate if unsorted. + SAMFileReader reader = readers.getReader(readerID); + SAMFileHeader header = reader.getFileHeader(); + + headers.put(readerID,header); + + if ( header.getReadGroups().isEmpty() ) { + throw new UserException.MalformedBAM(readers.getReaderID(reader).getSamFile(), + "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups"); + } + + SAMFileHeader.SortOrder sortOrder = header.getSortOrder() != SAMFileHeader.SortOrder.unsorted ? header.getSortOrder() : SAMFileHeader.SortOrder.coordinate; + + // Validate that all input files are sorted in the same order. + if(this.sortOrder != null && this.sortOrder != sortOrder) + throw new UserException.MissortedBAM(String.format("Attempted to process mixed of files sorted as %s and %s.",this.sortOrder,sortOrder)); + + // Update the sort order. 
+ this.sortOrder = sortOrder; + } + + mergedHeader = readers.getMergedHeader(); + hasReadGroupCollisions = readers.hasReadGroupCollisions(); + + readProperties = new ReadProperties( + samFiles, + mergedHeader, + sortOrder, + useOriginalBaseQualities, + strictness, + downsamplingMethod, + exclusionList, + supplementalFilters, + readTransformers, + includeReadsWithDeletionAtLoci, + defaultBaseQualities, + keepReadsInLIBS); + + // cache the read group id (original) -> read group id (merged) + // and read group id (merged) -> read group id (original) mappings. + for(SAMReaderID id: readerIDs) { + SAMFileReader reader = readers.getReader(id); + + ReadGroupMapping mappingToMerged = new ReadGroupMapping(); + + List readGroups = reader.getFileHeader().getReadGroups(); + for(SAMReadGroupRecord readGroup: readGroups) { + if(hasReadGroupCollisions) { + mappingToMerged.put(readGroup.getReadGroupId(),readers.getReadGroupId(id,readGroup.getReadGroupId())); + mergedToOriginalReadGroupMappings.put(readers.getReadGroupId(id,readGroup.getReadGroupId()),readGroup.getReadGroupId()); + } else { + mappingToMerged.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); + mergedToOriginalReadGroupMappings.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); + } + } + + originalToMergedReadGroupMappings.put(id,mappingToMerged); + } + + for(SAMReaderID id: readerIDs) { + File indexFile = findIndexFile(id.getSamFile()); + if(indexFile != null) + bamIndices.put(id,new GATKBAMIndex(indexFile)); + } + + resourcePool.releaseReaders(readers); + } + + /** + * Checks whether the provided SAM header if from a reduced bam file. 
+ * @param header the SAM header for a given file + * @throws UserException if the header is from a reduced bam + */ + private void checkForUnsupportedBamFile(final SAMFileHeader header) { + for ( final SAMProgramRecord PGrecord : header.getProgramRecords() ) { + if ( unsupportedPGs.containsKey(PGrecord.getId()) ) + throw new UserException("The GATK no longer supports running off of BAMs produced by " + unsupportedPGs.get(PGrecord.getId())); + } + } + + public void close() { + SAMReaders readers = resourcePool.getAvailableReaders(); + for(SAMReaderID readerID: readerIDs) { + SAMFileReader reader = readers.getReader(readerID); + reader.close(); + } + } + + /** + * Returns Reads data structure containing information about the reads data sources placed in this pool as well as + * information about how they are downsampled, sorted, and filtered + * @return + */ + public ReadProperties getReadsInfo() { return readProperties; } + + /** + * Checks to see whether any reads files are supplying data. + * @return True if no reads files are supplying data to the traversal; false otherwise. + */ + public boolean isEmpty() { + return readProperties.getSAMReaderIDs().size() == 0; + } + + /** + * Gets the SAM file associated with a given reader ID. + * @param id The reader for which to retrieve the source file. + * @return the file actually associated with the id. + */ + public File getSAMFile(SAMReaderID id) { + return id.getSamFile(); + } + + /** + * Returns readers used by this data source. + * @return A list of SAM reader IDs. + */ + public Collection getReaderIDs() { + return readerIDs; + } + + /** + * Retrieves the id of the reader which built the given read. + * @param read The read to test. + * @return ID of the reader. + */ + public SAMReaderID getReaderID(SAMRecord read) { + return resourcePool.getReaderID(read.getFileSource().getReader()); + } + + /** + * Gets the merged header from the SAM file. + * @return The merged header. 
+ */ + public SAMFileHeader getHeader() { + return mergedHeader; + } + + public SAMFileHeader getHeader(SAMReaderID id) { + return headers.get(id); + } + + /** + * Gets the revised read group id mapped to this 'original' read group id. + * @param reader for which to grab a read group. + * @param originalReadGroupId ID of the original read group. + * @return Merged read group ID. + */ + public String getReadGroupId(final SAMReaderID reader, final String originalReadGroupId) { + return originalToMergedReadGroupMappings.get(reader).get(originalReadGroupId); + } + + /** + * Gets the original read group id (as it was specified in the original input bam file) that maps onto + * this 'merged' read group id. + * @param mergedReadGroupId 'merged' ID of the read group (as it is presented by the read received from merged input stream). + * @return Merged read group ID. + */ + public String getOriginalReadGroupId(final String mergedReadGroupId) { + return mergedToOriginalReadGroupMappings.get(mergedReadGroupId); + } + + /** + * True if all readers have an index. + * @return True if all readers have an index. + */ + public boolean hasIndex() { + return readerIDs.size() == bamIndices.size(); + } + + /** + * Gets the index for a particular reader. Always preloaded. + * @param id Id of the reader. + * @return The index. Will preload the index if necessary. + */ + public GATKBAMIndex getIndex(final SAMReaderID id) { + return bamIndices.get(id); + } + + /** + * Retrieves the sort order of the readers. + * @return Sort order. Can be unsorted, coordinate order, or query name order. + */ + public SAMFileHeader.SortOrder getSortOrder() { + return sortOrder; + } + + /** + * Gets the cumulative read metrics for shards already processed. + * @return Cumulative read metrics. 
+ */ + public ReadMetrics getCumulativeReadMetrics() { + // don't return a clone here because the engine uses a pointer to this object + return readMetrics; + } + + /** + * Incorporate the given read metrics into the cumulative read metrics. + * @param readMetrics The 'incremental' read metrics, to be incorporated into the cumulative metrics. + */ + public void incorporateReadMetrics(final ReadMetrics readMetrics) { + this.readMetrics.incrementMetrics(readMetrics); + } + + public GATKSAMIterator seek(Shard shard) { + if(shard.buffersReads()) { + return shard.iterator(); + } + else { + return getIterator(shard); + } + } + + /** + * Gets the reader associated with the given read. + * @param readers Available readers. + * @param read + * @return + */ + private SAMReaderID getReaderID(SAMReaders readers, SAMRecord read) { + for(SAMReaderID id: getReaderIDs()) { + if(readers.getReader(id) == read.getFileSource().getReader()) + return id; + } + throw new ReviewedGATKException("Unable to find id for reader associated with read " + read.getReadName()); + } + + /** + * Get the initial reader positions across all BAM files + * + * @return the start positions of the first chunk of reads for all BAM files + */ + protected Map getInitialReaderPositions() { + Map initialPositions = new HashMap(); + SAMReaders readers = resourcePool.getAvailableReaders(); + + for ( SAMReaderID id: getReaderIDs() ) { + initialPositions.put(id, new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads())); + } + + resourcePool.releaseReaders(readers); + return initialPositions; + } + + /** + * Get an iterator over the data types specified in the shard. + * + * @param shard The shard specifying the data limits. + * @return An iterator over the selected data. + */ + protected GATKSAMIterator getIterator( Shard shard ) { + return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard); + } + + /** + * Get an iterator over the data types specified in the shard. 
+ * @param readers Readers from which to load data. + * @param shard The shard specifying the data limits. + * @param enableVerification True to verify. For compatibility with old sharding strategy. + * @return An iterator over the selected data. + */ + private GATKSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { + // Set up merging to dynamically merge together multiple BAMs. + Map> iteratorMap = new HashMap>(); + + for(SAMReaderID id: getReaderIDs()) { + CloseableIterator iterator = null; + + // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin. + // TODO: Kill this check once we've proven that the design elements are gone. + if(shard.getFileSpans().get(id) == null) + throw new ReviewedGATKException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported."); + + try { + if(threadAllocation.getNumIOThreads() > 0) { + BlockInputStream inputStream = readers.getInputStream(id); + inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id))); + BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory); + codec.setInputStream(inputStream); + iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec); + } + else { + iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); + } + } catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes + throw new UserException.MalformedBAM(id.getSamFile(), e.getMessage()); + } + + iterator = new MalformedBAMErrorReformatingIterator(id.getSamFile(), iterator); + if(shard.getGenomeLocs().size() > 0) + iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); + + iteratorMap.put(readers.getReader(id), iterator); + } + + MergingSamRecordIterator mergingIterator = 
readers.createMergingIterator(iteratorMap); + + // The readMetrics object being passed in should be that of this dataSource and NOT the shard: the dataSource's + // metrics is intended to keep track of the reads seen (and hence passed to the CountingFilteringIterator when + // we apply the decorators), whereas the shard's metrics is used to keep track the "records" seen. + return applyDecoratingIterators(readMetrics, + enableVerification, + readProperties.useOriginalBaseQualities(), + new ReleasingIterator(readers, GATKSAMIteratorAdapter.adapt(mergingIterator)), + readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), + readProperties.getSupplementalFilters(), + readProperties.getReadTransformers(), + readProperties.defaultBaseQualities(), + shard instanceof LocusShard); + } + + private class BAMCodecIterator implements CloseableIterator { + private final BlockInputStream inputStream; + private final SAMFileReader reader; + private final BAMRecordCodec codec; + private SAMRecord nextRead; + + private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) { + this.inputStream = inputStream; + this.reader = reader; + this.codec = codec; + advance(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if(!hasNext()) + throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty"); + SAMRecord currentRead = nextRead; + advance(); + return currentRead; + } + + public void close() { + // NO-OP. 
+ } + + public void remove() { + throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator"); + } + + private void advance() { + final long startCoordinate = inputStream.getFilePointer(); + nextRead = codec.decode(); + final long stopCoordinate = inputStream.getFilePointer(); + + if(reader != null && nextRead != null) + PicardNamespaceUtils.setFileSource(nextRead, new SAMFileSource(reader, new GATKBAMFileSpan(new GATKChunk(startCoordinate, stopCoordinate)))); + } + } + + /** + * Filter reads based on user-specified criteria. + * + * @param readMetrics metrics to track when using this iterator. + * @param enableVerification Verify the order of reads. + * @param useOriginalBaseQualities True if original base qualities should be used. + * @param wrappedIterator the raw data source. + * @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this. + * @param supplementalFilters additional filters to apply to the reads. + * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. + * @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard + * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null. + */ + protected GATKSAMIterator applyDecoratingIterators(ReadMetrics readMetrics, + boolean enableVerification, + boolean useOriginalBaseQualities, + GATKSAMIterator wrappedIterator, + Boolean noValidationOfReadOrder, + Collection supplementalFilters, + List readTransformers, + byte defaultBaseQualities, + boolean isLocusBasedTraversal ) { + + // Always apply the ReadFormattingIterator before both ReadFilters and ReadTransformers. At a minimum, + // this will consolidate the cigar strings into canonical form. This has to be done before the read + // filtering, because not all read filters will behave correctly with things like zero-length cigar + // elements. 
If useOriginalBaseQualities is true or defaultBaseQualities >= 0, this iterator will also + // modify the base qualities. + wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); + + // Read Filters: these are applied BEFORE downsampling, so that we downsample within the set of reads + // that actually survive filtering. Otherwise we could get much less coverage than requested. + wrappedIterator = GATKSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); + + // Downsampling: + + // For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers + // will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding + // splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling + // of individual reads. + boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal && + readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readProperties.getDownsamplingMethod().toCoverage != null; + + // Apply downsampling iterators here only in cases where we know that LocusIteratorByState won't be + // doing any downsampling downstream of us + if ( ! assumeDownstreamLIBSDownsampling ) { + wrappedIterator = applyDownsamplingIterator(wrappedIterator); + } + + // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, + // verify the read ordering by applying a sort order iterator + if (!noValidationOfReadOrder && enableVerification) + wrappedIterator = new VerifyingSamIterator(wrappedIterator); + + // Read transformers: these are applied last, so that we don't bother transforming reads that get discarded + // by the read filters or downsampler. 
+ for ( final ReadTransformer readTransformer : readTransformers ) { + if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT ) + wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer); + } + + return wrappedIterator; + } + + protected GATKSAMIterator applyDownsamplingIterator( GATKSAMIterator wrappedIterator ) { + if ( readProperties.getDownsamplingMethod() == null || + readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) { + return wrappedIterator; + } + + if ( readProperties.getDownsamplingMethod().toFraction != null ) { + + // If we're downsampling to a fraction of reads, there's no point in paying the cost of + // splitting/re-assembling the read stream by sample to run the FractionalDownsampler on + // reads from each sample separately, since the result would be the same as running the + // FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator + // rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling + // was requested. + + return new DownsamplingReadsIterator(wrappedIterator, + new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction)); + } + else if ( readProperties.getDownsamplingMethod().toCoverage != null ) { + + // If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling + // the read stream to run the downsampler on the reads for each individual sample separately if + // BY_SAMPLE downsampling was requested. 
+ + if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { + return new PerSampleDownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage)); + } + else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { + return new DownsamplingReadsIterator(wrappedIterator, + new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage)); + } + } + + return wrappedIterator; + } + + + private class SAMResourcePool { + /** + * How many entries can be cached in this resource pool? + */ + private final int maxEntries; + + /** + * All iterators of this reference-ordered data. + */ + private List allResources = new ArrayList(); + + /** + * All iterators that are not currently in service. + */ + private List availableResources = new ArrayList(); + + public SAMResourcePool(final int maxEntries) { + this.maxEntries = maxEntries; + } + + /** + * Choose a set of readers from the pool to use for this query. When complete, + * @return + */ + public synchronized SAMReaders getAvailableReaders() { + if(availableResources.size() == 0) + createNewResource(); + SAMReaders readers = availableResources.get(0); + availableResources.remove(readers); + return readers; + } + + public synchronized void releaseReaders(SAMReaders readers) { + if(!allResources.contains(readers)) + throw new ReviewedGATKException("Tried to return readers from the pool that didn't originate in the pool."); + availableResources.add(readers); + } + + /** + * Gets the reader id for the given reader. + * @param reader Reader for which to determine the id. + * @return id of the given reader. 
+ */ + protected synchronized SAMReaderID getReaderID(SamReader reader) { + for(SAMReaders readers: allResources) { + SAMReaderID id = readers.getReaderID(reader); + if(id != null) + return id; + } + throw new ReviewedGATKException("No such reader id is available"); + } + + private synchronized void createNewResource() { + if(allResources.size() > maxEntries) + throw new ReviewedGATKException("Cannot create a new resource pool. All resources are in use."); + SAMReaders readers = new SAMReaders(readerIDs, validationStringency, removeProgramRecords); + allResources.add(readers); + availableResources.add(readers); + } + + } + + /** + * A collection of readers derived from a reads metadata structure. + */ + private class SAMReaders implements Iterable { + /** + * Cached representation of the merged header used to generate a merging iterator. + */ + private final SamFileHeaderMerger headerMerger; + + /** + * Internal storage for a map of id -> reader. + */ + private final Map readers = new LinkedHashMap(); + + /** + * The inptu streams backing + */ + private final Map inputStreams = new LinkedHashMap(); + + /** + * Derive a new set of readers from the Reads metadata. + * @param readerIDs reads to load. + * TODO: validationStringency is not used here + * @param validationStringency validation stringency. 
+ * @param removeProgramRecords indicate whether to clear program records from the readers + */ + public SAMReaders(Collection readerIDs, ValidationStringency validationStringency, boolean removeProgramRecords) { + final int totalNumberOfFiles = readerIDs.size(); + int readerNumber = 1; + final SimpleTimer timer = new SimpleTimer().start(); + + if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords in serial"); + final int tickSize = 50; + int nExecutedTotal = 0; + long lastTick = timer.currentTime(); + for(final SAMReaderID readerID: readerIDs) { + final ReaderInitializer init = new ReaderInitializer(readerID).call(); + + checkForUnsupportedBamFile(init.reader.getFileHeader()); + + if (removeProgramRecords) { + init.reader.getFileHeader().setProgramRecords(new ArrayList()); + } + + if (threadAllocation.getNumIOThreads() > 0) { + inputStreams.put(init.readerID, init.blockInputStream); // get from initializer + } + + logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.getSamFile())); + readers.put(init.readerID,init.reader); + if ( ++nExecutedTotal % tickSize == 0) { + double tickInSec = (timer.currentTime() - lastTick) / 1000.0; + printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec); + lastTick = timer.currentTime(); + } + } + + if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); + + Collection headers = new LinkedList(); + + // Examine the bam headers, perform any requested sample renaming on them, and add + // them to the list of headers to pass to the Picard SamFileHeaderMerger: + for ( final Map.Entry readerEntry : readers.entrySet() ) { + final SAMReaderID readerID = readerEntry.getKey(); + final SAMFileReader reader = readerEntry.getValue(); + final SAMFileHeader header = reader.getFileHeader(); + + // The remappedSampleName will be null if either no on-the-fly sample renaming was 
requested, + // or the user's sample rename map file didn't contain an entry for this bam file: + final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(readerID.getSamFilePath()) : null; + + // If we've been asked to rename the sample for this bam file, do so now. We'll check to + // make sure this bam only contains reads from one sample before proceeding. + // + // IMPORTANT: relies on the fact that the Picard SamFileHeaderMerger makes a copy of + // the existing read group attributes (including sample name) when merging + // headers, regardless of whether there are read group collisions or not. + if ( remappedSampleName != null ) { + remapSampleName(readerID, header, remappedSampleName); + } + + headers.add(header); + } + + headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true); + + // update all read groups to GATKSAMRecordReadGroups + final List gatkReadGroups = new LinkedList(); + for ( final SAMReadGroupRecord rg : headerMerger.getMergedHeader().getReadGroups() ) { + gatkReadGroups.add(new GATKSAMReadGroupRecord(rg)); + } + headerMerger.getMergedHeader().setReadGroups(gatkReadGroups); + } + + /** + * Changes the sample name in the read groups for the provided bam file header to match the + * remappedSampleName. Blows up with a UserException if the header contains more than one + * sample name. + * + * @param readerID ID for the bam file from which the provided header came from + * @param header The bam file header. Will be modified by this call. + * @param remappedSampleName New sample name to replace the existing sample attribute in the + * read groups for the header. 
+ */ + private void remapSampleName( final SAMReaderID readerID, final SAMFileHeader header, final String remappedSampleName ) { + String firstEncounteredSample = null; + + for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) { + final String thisReadGroupSample = readGroup.getSample(); + + if ( thisReadGroupSample == null ) { + throw new UserException(String.format("On-the fly sample renaming was requested for bam file %s, however this " + + "bam file contains a read group (id: %s) with a null sample attribute", + readerID.getSamFilePath(), readGroup.getId())); + } + else if ( firstEncounteredSample == null ) { + firstEncounteredSample = thisReadGroupSample; + } + else if ( ! firstEncounteredSample.equals(thisReadGroupSample) ) { + throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, " + + "however this bam file contains reads from more than one sample " + + "(encountered samples %s and %s in the bam header). The GATK requires that " + + "all bams for which on-the-fly sample renaming is requested " + + "contain reads from only a single sample per bam.", + readerID.getSamFilePath(), firstEncounteredSample, thisReadGroupSample)); + } + + readGroup.setSample(remappedSampleName); + } + } + + final private void printReaderPerformance(final int nExecutedTotal, + final int nExecutedInTick, + final int totalNumberOfFiles, + final SimpleTimer timer, + final double tickDurationInSec) { + final int pendingSize = totalNumberOfFiles - nExecutedTotal; + final double totalTimeInSeconds = timer.getElapsedTime(); + final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds); + final int nRemaining = pendingSize; + final double estTimeToComplete = pendingSize / nTasksPerSecond; + logger.info(String.format("Init %d BAMs in last %.2f s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. 
completion in %.2f s / %.2f m", + nExecutedInTick, tickDurationInSec, + nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond, + nRemaining, estTimeToComplete, estTimeToComplete / 60)); + } + + /** + * Return the header derived from the merging of these BAM files. + * @return the merged header. + */ + public SAMFileHeader getMergedHeader() { + return headerMerger.getMergedHeader(); + } + + /** + * Do multiple read groups collide in this dataset? + * @return True if multiple read groups collide; false otherwis. + */ + public boolean hasReadGroupCollisions() { + return headerMerger.hasReadGroupCollisions(); + } + + /** + * Get the newly mapped read group ID for the given read group. + * @param readerID Reader for which to discern the transformed ID. + * @param originalReadGroupID Original read group. + * @return Remapped read group. + */ + public String getReadGroupId(final SAMReaderID readerID, final String originalReadGroupID) { + SAMFileHeader header = readers.get(readerID).getFileHeader(); + return headerMerger.getReadGroupId(header,originalReadGroupID); + } + + /** + * Creates a new merging iterator from the given map, with the given header. + * @param iteratorMap A map of readers to iterators. + * @return An iterator which will merge those individual iterators. + */ + public MergingSamRecordIterator createMergingIterator(final Map> iteratorMap) { + return new MergingSamRecordIterator(headerMerger,iteratorMap,true); + } + + /** + * Retrieve the reader from the data structure. + * @param id The ID of the reader to retrieve. + * @return the reader associated with the given id. + */ + public SAMFileReader getReader(SAMReaderID id) { + if(!readers.containsKey(id)) + throw new NoSuchElementException("No reader is associated with id " + id); + return readers.get(id); + } + + /** + * Retrieve the input stream backing a reader. + * @param id The ID of the reader to retrieve. + * @return the reader associated with the given id. 
+ */ + public BlockInputStream getInputStream(final SAMReaderID id) { + return inputStreams.get(id); + } + + /** + * Searches for the reader id of this reader. + * @param reader Reader for which to search. + * @return The id associated the given reader, or null if the reader is not present in this collection. + */ + protected SAMReaderID getReaderID(SamReader reader) { + for(Map.Entry entry: readers.entrySet()) { + if(reader == entry.getValue()) + return entry.getKey(); + } + // Not found? return null. + return null; + } + + /** + * Returns an iterator over all readers in this structure. + * @return An iterator over readers. + */ + public Iterator iterator() { + return readers.values().iterator(); + } + + /** + * Returns whether any readers are present in this structure. + * @return + */ + public boolean isEmpty() { + return readers.isEmpty(); + } + } + + class ReaderInitializer implements Callable { + final SAMReaderID readerID; + BlockInputStream blockInputStream = null; + SAMFileReader reader; + + public ReaderInitializer(final SAMReaderID readerID) { + this.readerID = readerID; + } + + public ReaderInitializer call() { + final File indexFile = findIndexFile(readerID.getSamFile()); + try { + if (threadAllocation.getNumIOThreads() > 0) + blockInputStream = new BlockInputStream(dispatcher,readerID,false); + reader = new SAMFileReader(readerID.getSamFile(),indexFile,false); + } catch ( RuntimeIOException e ) { + throw new UserException.CouldNotReadInputFile(readerID.getSamFile(), e); + } catch ( SAMFormatException e ) { + throw new UserException.MalformedBAM(readerID.getSamFile(), e.getMessage()); + } + // Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files). + // Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case, + // just in case we want to change this behavior later. 
+ catch ( RuntimeException e ) { + throw new UserException.MalformedBAM(readerID.getSamFile(), e.getMessage()); + } + reader.setSAMRecordFactory(factory); + reader.enableFileSource(true); + reader.setValidationStringency(validationStringency); + return this; + } + } + + private class ReleasingIterator implements GATKSAMIterator { + /** + * The resource acting as the source of the data. + */ + private final SAMReaders resource; + + /** + * The iterator to wrap. + */ + private final GATKSAMIterator wrappedIterator; + + public ReleasingIterator(SAMReaders resource, GATKSAMIterator wrapped) { + this.resource = resource; + this.wrappedIterator = wrapped; + } + + public ReleasingIterator iterator() { + return this; + } + + public void remove() { + throw new UnsupportedOperationException("Can't remove from a GATKSAMIterator"); + } + + public void close() { + wrappedIterator.close(); + resourcePool.releaseReaders(resource); + } + + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public SAMRecord next() { + return wrappedIterator.next(); + } + } + + /** + * Maps read groups in the original SAMFileReaders to read groups in + */ + private class ReadGroupMapping extends HashMap {} + + /** + * Locates the index file alongside the given BAM, if present. + * @param bamFile The data file to use. + * @return A File object if the index file is present; null otherwise. + */ + private File findIndexFile(File bamFile) { + return SamFiles.findIndex(bamFile); + } + + /** + * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream + * will be as granular as possible given our current knowledge of the best ways to split up BAM files. + * @return An iterator that spans all reads in all BAM files. 
+ */ + public Iterable createShardIteratorOverAllReads(final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any + * read that has been assigned + * + * @param shardBalancer shard balancer object + * @return non-null initialized version of the shard balancer + */ + public Iterable createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) { + shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser); + return shardBalancer; + } + + /** + * Create a schedule for processing the initialized BAM file using the given interval list. + * The returned schedule should be as granular as possible. + * @param intervals The list of intervals for which to create the schedule. + * @return A granular iterator over file pointers. 
+ */ + public Iterable createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) { + if(intervals == null) + throw new ReviewedGATKException("Unable to create schedule from intervals; no intervals were provided."); + shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals,intervalMergingRule),genomeLocParser); + return shardBalancer; + } +} + + + diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java new file mode 100644 index 000000000..eb9ec480a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java @@ -0,0 +1,254 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.SAMFileSpan; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.engine.ReadMetrics; +import org.broadinstitute.gatk.engine.ReadProperties; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.HasGenomeLocation; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.util.Collections; +import java.util.List; +import java.util.Map; +/** + * + * User: aaron + * Date: Apr 10, 2009 + * Time: 5:00:27 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + +/** + * @author aaron + * @version 1.0 + * @date Apr 10, 2009 + *

+ * Interface Shard + *

+ * The base abstract class for shards. + */ +public abstract class Shard implements HasGenomeLocation { + public enum ShardType { + READ, LOCUS + } + + protected final GenomeLocParser parser; // incredibly annoying! + + /** + * What type of shard is this? Read or locus? + */ + protected final ShardType shardType; + + /** + * Locations. + */ + protected final List locs; + + /** + * Whether the current location is unmapped. + */ + private final boolean isUnmapped; + + /** + * Reads data, if applicable. + */ + private final SAMDataSource readsDataSource; + + /** + * The data backing the next chunks to deliver to the traversal engine. + */ + private final Map fileSpans; + + /** + * Lazy-calculated span of all of the genome locs in this shard + */ + private GenomeLoc spanningLocation = null; + + /** + * Statistics about which reads in this shards were used and which were filtered away. + */ + protected final ReadMetrics readMetrics = new ReadMetrics(); + + /** + * Whether this shard points to an unmapped region. + * Some shard types conceptually be unmapped (e.g. LocusShards). In + * this case, isUnmapped should always return false. + * @return True if this shard is unmapped. False otherwise. + */ + public boolean isUnmapped() { + return isUnmapped; + } + + public Shard(GenomeLocParser parser, + ShardType shardType, + List locs, + SAMDataSource readsDataSource, + Map fileSpans, + boolean isUnmapped) { + this.locs = locs; + this.parser = parser; + this.shardType = shardType; + this.readsDataSource = readsDataSource; + this.fileSpans = fileSpans; + this.isUnmapped = isUnmapped; + } + + /** + * If isUnmapped is true, than getGenomeLocs by + * definition will return a singleton list with a GenomeLoc.UNMAPPED + * + * Can return null, indicating that the entire genome is covered. + * + * @return the genome location represented by this shard + */ + public List getGenomeLocs() { + return locs; + } + + /** + * Get the list of chunks delimiting this shard. 
+ * @return a list of chunks that contain data for this shard. + */ + public Map getFileSpans() { + return Collections.unmodifiableMap(fileSpans); + } + + /** + * Returns the span of the genomeLocs comprising this shard + * @return a GenomeLoc that starts as the first position in getGenomeLocs() and stops at the stop of the last + * position in getGenomeLocs() + */ + public GenomeLoc getLocation() { + if ( spanningLocation == null ) { + if ( getGenomeLocs() == null ) + spanningLocation = GenomeLoc.WHOLE_GENOME; + else if ( getGenomeLocs().size() == 0 ) { + spanningLocation = getGenomeLocs().get(0); + } else { + int start = Integer.MAX_VALUE; + int stop = Integer.MIN_VALUE; + String contig = null; + + for ( GenomeLoc loc : getGenomeLocs() ) { + if ( GenomeLoc.isUnmapped(loc) ) + // special case the unmapped region marker, just abort out + return loc; + contig = loc.getContig(); + if ( loc.getStart() < start ) start = loc.getStart(); + if ( loc.getStop() > stop ) stop = loc.getStop(); + } + + spanningLocation = parser.createGenomeLoc(contig, start, stop); + } + } + + return spanningLocation; + } + + + /** + * what kind of shard do we return + * @return ShardType, indicating the type + */ + public ShardType getShardType() { + return shardType; + } + + /** + * Does any releasing / aggregation required when the shard is through being processed. + */ + public void close() { + readsDataSource.incorporateReadMetrics(readMetrics); + } + + /** + * Gets key read validation and filtering properties. + * @return set of read properties associated with this shard. + */ + public ReadProperties getReadProperties() { + return readsDataSource.getReadsInfo(); + } + + /** + * Gets the runtime metrics associated with this shard. + * Retrieves a storage space of metrics about number of reads included, filtered, etc. + * @return Storage space for metrics. 
+ */ + public ReadMetrics getReadMetrics() { + return readMetrics; + } + + /** + * Returns true if this shard is meant to buffer reads, rather + * than just holding pointers to their locations. + * @return True if this shard can buffer reads. False otherwise. + */ + public boolean buffersReads() { return false; } + + /** + * Returns true if the read buffer is currently full. + * @return True if this shard's buffer is full (and the shard can buffer reads). + */ + public boolean isBufferEmpty() { throw new UnsupportedOperationException("This shard does not buffer reads."); } + + /** + * Returns true if the read buffer is currently full. + * @return True if this shard's buffer is full (and the shard can buffer reads). + */ + public boolean isBufferFull() { throw new UnsupportedOperationException("This shard does not buffer reads."); } + + /** + * Adds a read to the read buffer. + * @param read Add a read to the internal shard buffer. + */ + public void addRead(SAMRecord read) { throw new UnsupportedOperationException("This shard does not buffer reads."); } + + /** + * Fills the shard with reads. Can only do this with shards that buffer reads + * @param readIter Iterator from which to draw the reads to fill the shard + */ + public void fill( PeekableIterator readIter ) { throw new UnsupportedOperationException("This shard does not buffer reads."); } + + /** + * Gets the iterator over the elements cached in the shard. 
+ * @return + */ + public GATKSAMIterator iterator() { throw new UnsupportedOperationException("This shard does not buffer reads."); } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ShardBalancer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ShardBalancer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ShardBalancer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ShardBalancer.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMFileStat.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java rename to 
public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/BAMTagRenamer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java new file mode 100644 index 000000000..231bbc4ef --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java @@ -0,0 +1,192 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.reads.utilities; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; +import org.broadinstitute.gatk.utils.commandline.Input; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.engine.datasources.reads.FilePointer; +import org.broadinstitute.gatk.engine.datasources.reads.IntervalSharder; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.interval.IntervalUtils; +import org.broadinstitute.gatk.utils.text.ListFileUtils; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.List; + +/** + * Traverses a region in a dataset looking for outliers. + */ +public class FindLargeShards extends CommandLineProgram { + private static Logger logger = Logger.getLogger(FindLargeShards.class); + + @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) + public List samFiles = new ArrayList(); + + @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) + public File referenceFile = null; + + @Input(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. 
Can be explicitly specified on the command line or in a file.",required=false) + public List intervals = null; + + @Output(required=false) + public PrintStream out = System.out; + + /** + * The square of the sum of all uncompressed data. Based on the BAM spec, the size of this could be + * up to (2^64)^2. + */ + private BigInteger sumOfSquares = BigInteger.valueOf(0); + + /** + * The running sum of all uncompressed data. Based on the BAM spec, the BAM must be less than Long.MAX_LONG + * when compressed -- in other words, the sum of the sizes of all BGZF blocks must be < 2^64. + */ + private BigInteger sum = BigInteger.valueOf(0); + + /** + * The number of shards viewed. + */ + private long numberOfShards; + + + @Override + public int execute() throws IOException { + // initialize reference + IndexedFastaSequenceFile refReader = new IndexedFastaSequenceFile(referenceFile); + GenomeLocParser genomeLocParser = new GenomeLocParser(refReader); + + // initialize reads + List bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser); + SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser); + + // intervals + final GenomeLocSortedSet intervalSortedSet; + if ( intervals != null ) + intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL); + else + intervalSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(refReader.getSequenceDictionary()); + + logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); + + IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet,IntervalMergingRule.ALL); + while(sharder.hasNext()) { + FilePointer filePointer = sharder.next(); + + // Size of the file pointer. 
+ final long size = filePointer.size(); + + BigInteger bigSize = BigInteger.valueOf(size); + sumOfSquares = sumOfSquares.add(bigSize.pow(2)); + sum = sum.add(bigSize); + numberOfShards++; + + if(numberOfShards % 1000 == 0) { + GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser); + logger.info(String.format("PROGRESS: Calculating mean and variance: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); + } + + } + + // Print out the stddev: (sum(x^2) - (1/N)*sum(x)^2)/N + long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue(); + long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue())); + logger.info(String.format("Number of shards: %d; mean uncompressed size = %d; stddev uncompressed size = %d%n",numberOfShards,mean,stddev)); + + // Crank through the shards again, this time reporting on the shards significantly larger than the mean. + long threshold = mean + stddev*5; + logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); + out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); + + sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet,IntervalMergingRule.ALL); + while(sharder.hasNext()) { + FilePointer filePointer = sharder.next(); + + // Bounding region. + GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser); + + // Size of the file pointer. 
+ final long size = filePointer.size(); + + numberOfShards++; + + if(filePointer.size() <= threshold) { + if(numberOfShards % 1000 == 0) + logger.info(String.format("PROGRESS: Searching for large shards: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); + continue; + } + + out.printf("%s\t%d\t%d\t%d%n",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size); + } + + return 0; + } + + private GenomeLoc getBoundingRegion(final FilePointer filePointer, final GenomeLocParser genomeLocParser) { + List regions = filePointer.getLocations(); + + // The region contained by this FilePointer. + final String contig = regions.get(0).getContig(); + final int start = regions.get(0).getStart(); + final int stop = regions.get(regions.size()-1).getStop(); + + return genomeLocParser.createGenomeLoc(contig,start,stop); + } + + /** + * Required main method implementation. + * @param argv Command-line argument text. + * @throws Exception on error. 
+ */ + public static void main(String[] argv) throws Exception { + int returnCode = 0; + try { + FindLargeShards instance = new FindLargeShards(); + start(instance, argv); + returnCode = 0; + } + catch(Exception ex) { + returnCode = 1; + ex.printStackTrace(); + throw ex; + } + finally { + System.exit(returnCode); + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBAMRegion.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/PrintBGZFBounds.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/UnzipSingleBlock.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/UnzipSingleBlock.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/UnzipSingleBlock.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/UnzipSingleBlock.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/package-info.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java new file mode 100644 index 000000000..6b7bf2187 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java @@ -0,0 +1,166 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reference; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.SAMSequenceRecord; +import org.broadinstitute.gatk.engine.datasources.reads.LocusShard; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; + +import java.io.File; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Loads reference data from fasta file + * Looks for fai and dict files, and tries to create them if they don't exist + */ +public class ReferenceDataSource { + private IndexedFastaSequenceFile reference; + + /** our log, which we want to capture anything from this class */ + protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); + + /** + * Create reference data source from fasta file + * @param fastaFile Fasta file to be used as reference + */ + public ReferenceDataSource(File fastaFile) { + reference = CachingIndexedFastaSequenceFile.checkAndCreate(fastaFile); + } + + /** + * Get indexed fasta file + * @return IndexedFastaSequenceFile that was created from file + */ + public IndexedFastaSequenceFile getReference() { + return this.reference; + } + + /** + * Creates an iterator for processing the entire reference. 
+ * @param readsDataSource the reads datasource to embed in the locus shard. + * @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param maxShardSize The maximum shard size which can be used to create this list. + * @return Creates a schedule for performing a traversal over the entire reference. + */ + public Iterable createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) { + List shards = new ArrayList(); + for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { + for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { + final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); + shards.add(new LocusShard(parser, + readsDataSource, + Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), + null)); + } + } + return shards; + } + + + public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) { + List shards = new ArrayList(); + + for(GenomeLoc interval: intervals) { + while(interval.size() > maxShardSize) { + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)), + null)); + interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); + } + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(interval), + null)); + } + + return shards; + } + + + /** + * Creates an iterator for processing the 
entire reference. + * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. + * @param intervals the list of intervals to use when processing the reference. + * @param targetShardSize the suggested - and maximum - shard size which can be used to create this list; we will merge intervals greedily so that we generate shards up to but not greater than the target size. + * @return Creates a schedule for performing a traversal over the entire reference. + */ +/* + public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int targetShardSize) { + final List shards = new ArrayList(); + final GenomeLocParser parser = intervals.getGenomeLocParser(); + LinkedList currentIntervals = new LinkedList(); + + for(GenomeLoc interval: intervals) { + // if the next interval is too big, we can safely shard currentInterval and then break down this one + if (interval.size() > targetShardSize) { + if (!currentIntervals.isEmpty()) + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); + while(interval.size() > targetShardSize) { + final GenomeLoc partialInterval = parser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getStart()+targetShardSize-1); + shards.add(createShardFromInterval(Collections.singletonList(partialInterval), readsDataSource, parser)); + interval = parser.createGenomeLoc(interval.getContig(), interval.getStart() + targetShardSize, interval.getStop()); + } + currentIntervals = new LinkedList(); + currentIntervals.add(interval); + } + // otherwise, we need to check whether we can merge this interval with currentInterval (and either shard currentInterval or merge accordingly) + else { + if (currentIntervals.isEmpty()) { + currentIntervals.add(interval); + } + else { + if 
(currentIntervals.getLast().compareContigs(interval) != 0 || interval.getStop() - currentIntervals.getLast().getStart() + 1 > targetShardSize) { + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); + currentIntervals = new LinkedList(); + } + currentIntervals.add(interval); + } + } + } + if (!currentIntervals.isEmpty()) + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); + return shards; + } + + private static Shard createShardFromInterval(final List intervals, final SAMDataSource readsDataSource, final GenomeLocParser parser) { + //logger.debug("Adding shard " + interval); + return new LocusShard(parser, + readsDataSource, + intervals, + null); + } +*/ +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/DataStreamSegment.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/DataStreamSegment.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/DataStreamSegment.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/DataStreamSegment.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/EntireStream.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/EntireStream.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/EntireStream.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/EntireStream.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/MappedStreamSegment.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/MappedStreamSegment.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/MappedStreamSegment.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/MappedStreamSegment.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java new file mode 100644 index 000000000..6920ba242 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java @@ -0,0 +1,153 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.rmd; + +import htsjdk.samtools.SAMSequenceDictionary; +import org.broadinstitute.gatk.utils.refdata.SeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrack; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.gatk.utils.refdata.utils.FlashBackIterator; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.List; + +/** + * A pool of reference-ordered data iterators. + */ +class ReferenceOrderedDataPool extends ResourcePool { + // the reference-ordered data itself. + private final RMDTriplet fileDescriptor; + + // our tribble track builder + private final RMDTrackBuilder builder; + + /** + * The header from this RMD, if present. + */ + private final Object header; + + /** + * The sequence dictionary from this ROD. If no sequence dictionary is present, this dictionary will be the same as the reference's. 
+ */ + private final SAMSequenceDictionary sequenceDictionary; + + boolean flashbackData = false; + public ReferenceOrderedDataPool(RMDTriplet fileDescriptor,RMDTrackBuilder builder,SAMSequenceDictionary sequenceDictionary, GenomeLocParser genomeLocParser,boolean flashbackData) { + super(sequenceDictionary,genomeLocParser); + this.fileDescriptor = fileDescriptor; + this.builder = builder; + this.flashbackData = flashbackData; + + // prepopulate one RMDTrack + LocationAwareSeekableRODIterator iterator = createNewResource(); + this.addNewResource(iterator); + + // Pull the proper header and sequence dictionary from the prepopulated track. + this.header = iterator.getHeader(); + this.sequenceDictionary = iterator.getSequenceDictionary(); + } + + /** + * Gets the header used by this resource pool. + * @return Header used by this resource pool. + */ + public Object getHeader() { + return header; + } + + /** + * Gets the sequence dictionary built into the ROD index file. + * @return Sequence dictionary from the index file. + */ + public SAMSequenceDictionary getSequenceDictionary() { + return sequenceDictionary; + } + + /** + * Create a new iterator from the existing reference-ordered data. This new iterator is expected + * to be completely independent of any other iterator. + * @return The newly created resource. + */ + public LocationAwareSeekableRODIterator createNewResource() { + if(numIterators() > 0) + throw new ReviewedGATKException("BUG: Tried to create multiple iterators over streaming ROD interface"); + RMDTrack track = builder.createInstanceOfTrack(fileDescriptor); + LocationAwareSeekableRODIterator iter = new SeekableRODIterator(track.getHeader(),track.getSequenceDictionary(),referenceSequenceDictionary,genomeLocParser,track.getIterator()); + return (flashbackData) ? new FlashBackIterator(iter) : iter; + } + + /** + * Finds the best existing ROD iterator from the pool. 
In this case, the best existing ROD is defined as + * the first one encountered that is at or before the given position. + * @param segment @{inheritedDoc} + * @param resources @{inheritedDoc} + * @return @{inheritedDoc} + */ + public LocationAwareSeekableRODIterator selectBestExistingResource( DataStreamSegment segment, List resources ) { + if(segment instanceof MappedStreamSegment) { + GenomeLoc position = ((MappedStreamSegment)segment).getLocation(); + + for( LocationAwareSeekableRODIterator RODIterator : resources ) { + + if( (RODIterator.position() == null && RODIterator.hasNext()) || + (RODIterator.position() != null && RODIterator.position().isBefore(position)) ) + return RODIterator; + if (RODIterator.position() != null && RODIterator instanceof FlashBackIterator && ((FlashBackIterator)RODIterator).canFlashBackTo(position)) { + ((FlashBackIterator)RODIterator).flashBackTo(position); + return RODIterator; + } + + } + return null; + } + else if(segment instanceof EntireStream) { + // Asking for a segment over the entire stream, so by definition, there is no best existing resource. + // Force the system to create a new one. + return null; + } + else { + throw new ReviewedGATKException("Unable to find a ROD iterator for segments of type " + segment.getClass()); + } + } + + /** + * In this case, the iterator is the resource. Pass it through. 
+ */ + public LocationAwareSeekableRODIterator createIteratorFromResource( DataStreamSegment segment, LocationAwareSeekableRODIterator resource ) { + return resource; + } + + /** + * kill the buffers in the iterator + */ + public void closeResource( LocationAwareSeekableRODIterator resource ) { + if (resource instanceof FlashBackIterator) ((FlashBackIterator)resource).close(); + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java new file mode 100644 index 000000000..c88b9b7f2 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java @@ -0,0 +1,256 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.rmd; + +import htsjdk.samtools.SAMSequenceDictionary; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.refdata.SeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrack; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.lang.reflect.Type; +import java.util.List; + +/** + * A data source which provides a single type of reference-ordered data. + */ +public class ReferenceOrderedDataSource { + /** + * The reference-ordered data itself. + */ + private final RMDTriplet fileDescriptor; + + /** + * The header associated with this VCF, if any. + */ + private final Object header; + + /** + * The private sequence dictionary associated with this RMD. + */ + private final SAMSequenceDictionary sequenceDictionary; + + /** + * The builder to use when constructing new reference-ordered data readers. + */ + private final RMDTrackBuilder builder; + + /** + * A pool of iterators for navigating through the genome. + */ + private final ResourcePool iteratorPool; + + /** + * Create a new reference-ordered data source. 
+ */ + public ReferenceOrderedDataSource(RMDTriplet fileDescriptor, + RMDTrackBuilder builder, + SAMSequenceDictionary referenceSequenceDictionary, + GenomeLocParser genomeLocParser, + boolean flashbackData ) { + this.fileDescriptor = fileDescriptor; + this.builder = builder; + + // TODO: Unify the two blocks of code below by creating a ReferenceOrderedDataPool base class of a coherent type (not RMDTrack for one and SeekableIterator for the other). + if (fileDescriptor.getStorageType() != RMDTriplet.RMDStorageType.STREAM) { + iteratorPool = new ReferenceOrderedQueryDataPool(fileDescriptor, + builder, + referenceSequenceDictionary, + genomeLocParser); + this.header = ((ReferenceOrderedQueryDataPool)iteratorPool).getHeader(); + this.sequenceDictionary = ((ReferenceOrderedQueryDataPool)iteratorPool).getSequenceDictionary(); + } + else { + iteratorPool = new ReferenceOrderedDataPool(fileDescriptor, + builder, + referenceSequenceDictionary, + genomeLocParser, + flashbackData); + this.header = ((ReferenceOrderedDataPool)iteratorPool).getHeader(); + this.sequenceDictionary = ((ReferenceOrderedDataPool)iteratorPool).getSequenceDictionary(); + } + } + + /** + * Return the name of the underlying reference-ordered data. + * @return Name of the underlying rod. + */ + public String getName() { + return fileDescriptor.getName(); + } + + public Class getType() { + return builder.getFeatureManager().getByTriplet(fileDescriptor).getCodecClass(); + } + + public Class getRecordType() { + return builder.getFeatureManager().getByTriplet(fileDescriptor).getFeatureClass(); + } + + public File getFile() { + return new File(fileDescriptor.getFile()); + } + + public Object getHeader() { + return header; + } + + public Tags getTags() { + return fileDescriptor.getTags(); + } + + public String getTagValue( final String key ) { + return fileDescriptor.getTags().getValue( key ); + } + + + /** + * Retrieves the sequence dictionary created by this ROD. 
+ * @return + */ + public SAMSequenceDictionary getSequenceDictionary() { + return sequenceDictionary; + } + + /** + * helper function for determining if we are the same track based on name and record type + * + * @param name the name to match + * @param type the type to match + * + * @return true on a match, false if the name or type is different + */ + public boolean matchesNameAndRecordType(String name, Type type) { + return (name.equals(fileDescriptor.getName()) && (type.getClass().isAssignableFrom(getType().getClass()))); + } + + /** + * Seek to the specified position and return an iterator through the data. + * + * @param loc GenomeLoc that points to the selected position. + * + * @return Iterator through the data. + */ + public LocationAwareSeekableRODIterator seek(GenomeLoc loc) { + DataStreamSegment dataStreamSegment = loc != null ? new MappedStreamSegment(loc) : new EntireStream(); + return iteratorPool.iterator(dataStreamSegment); + } + + + /** + * Close the specified iterator, returning it to the pool. + * @param iterator Iterator to close. + */ + public void close( LocationAwareSeekableRODIterator iterator ) { + iteratorPool.release(iterator); + } + +} + +/** + * a data pool for the new query based RODs + */ +class ReferenceOrderedQueryDataPool extends ResourcePool { + // the reference-ordered data itself. + private final RMDTriplet fileDescriptor; + + // our tribble track builder + private final RMDTrackBuilder builder; + + /** + * The header from this RMD, if present. + */ + private final Object header; + + /** + * The sequence dictionary from this ROD. If no sequence dictionary is present, this dictionary will be the same as the reference's. 
+ */ + private final SAMSequenceDictionary sequenceDictionary; + + public ReferenceOrderedQueryDataPool(RMDTriplet fileDescriptor, RMDTrackBuilder builder, SAMSequenceDictionary referenceSequenceDictionary, GenomeLocParser genomeLocParser) { + super(referenceSequenceDictionary,genomeLocParser); + this.fileDescriptor = fileDescriptor; + this.builder = builder; + + // prepopulate one RMDTrack + RMDTrack track = builder.createInstanceOfTrack(fileDescriptor); + this.addNewResource(track); + + // Pull the proper header and sequence dictionary from the prepopulated track. + this.header = track.getHeader(); + this.sequenceDictionary = track.getSequenceDictionary(); + } + + public Object getHeader() { + return header; + } + + public SAMSequenceDictionary getSequenceDictionary() { + return sequenceDictionary; + } + + @Override + protected RMDTrack createNewResource() { + return builder.createInstanceOfTrack(fileDescriptor); + } + + @Override + protected RMDTrack selectBestExistingResource(DataStreamSegment segment, List availableResources) { + for (RMDTrack reader : availableResources) + if (reader != null) return reader; + return null; + } + + @Override + protected LocationAwareSeekableRODIterator createIteratorFromResource(DataStreamSegment position, RMDTrack track) { + try { + if (position instanceof MappedStreamSegment) { + GenomeLoc pos = ((MappedStreamSegment) position).locus; + return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.query(pos)); + } else { + return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.getIterator()); + } + } catch (FileNotFoundException e) { + throw new UserException.CouldNotReadInputFile(fileDescriptor.getName(), "it could not be found"); + } catch (IOException e) { + throw new ReviewedGATKException("Unable to create iterator for rod named " + fileDescriptor.getName(),e); + } + } + + @Override + protected void closeResource(RMDTrack 
track) { + track.close(); + } +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ResourcePool.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ResourcePool.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ResourcePool.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ResourcePool.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/Accumulator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/Accumulator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/Accumulator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/Accumulator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroScheduler.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroSchedulerMBean.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroSchedulerMBean.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroSchedulerMBean.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/HierarchicalMicroSchedulerMBean.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java new file mode 100644 index 000000000..fc68b9c7a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java @@ -0,0 +1,130 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.executive; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.providers.ShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.engine.io.DirectOutputTracker; +import org.broadinstitute.gatk.engine.io.OutputTracker; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.traversals.TraversalEngine; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; + +import java.util.Collection; + + +/** A micro-scheduling manager for single-threaded execution of a traversal. */ +public class LinearMicroScheduler extends MicroScheduler { + + /** + * A direct output tracker for directly managing output. + */ + private DirectOutputTracker outputTracker = new DirectOutputTracker(); + + /** + * Create a new linear microscheduler to process the given reads and reference. + * + * @param walker Walker for the traversal. + * @param reads Reads file(s) to process. + * @param reference Reference for driving the traversal. + * @param rods Reference-ordered data. 
+ */ + protected LinearMicroScheduler(final GenomeAnalysisEngine engine, + final Walker walker, + final SAMDataSource reads, + final IndexedFastaSequenceFile reference, + final Collection rods, + final ThreadAllocation threadAllocation) { + super(engine, walker, reads, reference, rods, threadAllocation); + + if ( threadAllocation.monitorThreadEfficiency() ) + setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor()); + } + + /** + * Run this traversal over the specified subsection of the dataset. + * + * @param walker Computation to perform over dataset. + * @param shardStrategy A strategy for sharding the data. + */ + public Object execute(Walker walker, Iterable shardStrategy) { + super.startingExecution(); + walker.initialize(); + Accumulator accumulator = Accumulator.create(engine,walker); + + boolean done = walker.isDone(); + int counter = 0; + + final TraversalEngine traversalEngine = borrowTraversalEngine(this); + for (Shard shard : shardStrategy ) { + if ( abortExecution() || done || shard == null ) // we ran out of shards that aren't owned + break; + + if(shard.getShardType() == Shard.ShardType.LOCUS) { + WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), + getReadIterator(shard), shard.getGenomeLocs(), ReadUtils.getSAMFileSamples(engine.getSAMFileHeader())); + for(WindowMaker.WindowMakerIterator iterator: windowMaker) { + ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods); + Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); + accumulator.accumulate(dataProvider,result); + dataProvider.close(); + if ( walker.isDone() ) break; + } + windowMaker.close(); + } + else { + ShardDataProvider dataProvider = new ReadShardDataProvider(shard,engine.getGenomeLocParser(),getReadIterator(shard),reference,rods); + Object result = traversalEngine.traverse(walker, dataProvider, 
accumulator.getReduceInit()); + accumulator.accumulate(dataProvider,result); + dataProvider.close(); + } + + done = walker.isDone(); + } + + Object result = accumulator.finishTraversal(); + + outputTracker.close(); + returnTraversalEngine(this, traversalEngine); + cleanup(); + executionIsDone(); + + return accumulator; + } + + /** + * @{inheritDoc} + */ + public OutputTracker getOutputTracker() { return outputTracker; } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java new file mode 100644 index 000000000..f9660a94a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java @@ -0,0 +1,463 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.executive; + +import com.google.java.contract.Ensures; +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.ReadMetrics; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.engine.io.OutputTracker; +import org.broadinstitute.gatk.engine.iterators.NullSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.traversals.*; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.AutoFormattingTime; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; +import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; + +import javax.management.JMException; +import javax.management.MBeanServer; +import javax.management.ObjectName; +import java.io.File; +import java.lang.management.ManagementFactory; +import java.util.*; + + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Apr 26, 2009 + * Time: 12:37:23 PM + * + * General base class for all scheduling algorithms + * Shards and schedules data in manageable chunks. + * + * Creates N TraversalEngines for each data thread for the MicroScheduler. This is necessary + * because in the HMS case you have multiple threads executing a traversal engine independently, and + * these engines may need to create separate resources for efficiency or implementation reasons. 
For example, + * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific. + * So each HMS thread needs to have it's own distinct copy of the traversal engine if it wants to have + * N data threads x M nano threads => N * M threads total. These are borrowed from this microscheduler + * and returned when done. Also allows us to tracks all created traversal engines so this microscheduler + * can properly shut them all down when the scheduling is done. + * + */ +public abstract class MicroScheduler implements MicroSchedulerMBean { + protected static final Logger logger = Logger.getLogger(MicroScheduler.class); + + /** + * The list of all Traversal engines we've created in this micro scheduler + */ + final List allCreatedTraversalEngines = new LinkedList(); + + /** + * All available engines. Engines are borrowed and returned when a subclass is actually + * going to execute the engine on some data. This allows us to have N copies for + * N data parallel executions, but without the dangerous code of having local + * ThreadLocal variables. + */ + final LinkedList availableTraversalEngines = new LinkedList(); + + /** + * Engines that have been allocated to a key already. + */ + final HashMap allocatedTraversalEngines = new HashMap(); + + /** + * Counts the number of instances of the class that are currently alive. + */ + private static int instanceNumber = 0; + + /** + * The engine invoking this scheduler. + */ + protected final GenomeAnalysisEngine engine; + + protected final IndexedFastaSequenceFile reference; + + private final SAMDataSource reads; + protected final Collection rods; + + private final MBeanServer mBeanServer; + private final ObjectName mBeanName; + + /** + * Threading efficiency monitor for tracking the resource utilization of the GATK + * + * may be null + */ + ThreadEfficiencyMonitor threadEfficiencyMonitor = null; + + /** + * MicroScheduler factory function. 
Create a microscheduler appropriate for reducing the + * selected walker. + * + * @param walker Which walker to use. + * @param reads the informations associated with the reads + * @param reference the reference file + * @param rods the rods to include in the traversal + * @param threadAllocation Number of threads to utilize. + * + * @return The best-fit microscheduler. + */ + public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { + if ( threadAllocation.isRunningInParallelMode() ) { + logger.info(String.format("Running the GATK in parallel mode with %d total threads, " + + "%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine", + threadAllocation.getTotalNumThreads(), + threadAllocation.getNumCPUThreadsPerDataThread(), + threadAllocation.getNumDataThreads(), + Runtime.getRuntime().availableProcessors())); + if ( threadAllocation.getTotalNumThreads() > Runtime.getRuntime().availableProcessors() ) + logger.warn(String.format("Number of requested GATK threads %d is more than the number of " + + "available processors on this machine %d", threadAllocation.getTotalNumThreads(), + Runtime.getRuntime().availableProcessors())); + } + + if ( threadAllocation.getNumDataThreads() > 1 ) { + if (walker.isReduceByInterval()) + throw new UserException.BadArgumentValue("nt", String.format("This run of %s is set up to aggregate results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option or check if this tool has an option to disable per-interval calculations.", engine.getWalkerName(walker.getClass()))); + + if ( ! (walker instanceof TreeReducible) ) { + throw badNT("nt", engine, walker); + } + } + + if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! 
(walker instanceof NanoSchedulable) ) { + throw badNT("nct", engine, walker); + } + + if ( threadAllocation.getNumDataThreads() > 1 ) { + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); + } else { + return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); + } + } + + private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { + throw new UserException.BadArgumentValue(parallelArg, + String.format("The analysis %s currently does not support parallel execution with %s. " + + "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); + } + + /** + * Create a microscheduler given the reads and reference. + * + * @param walker the walker to execute with + * @param reads The reads. + * @param reference The reference. + * @param rods the rods to include in the traversal + * @param threadAllocation the allocation of threads to use in the underlying traversal + */ + protected MicroScheduler(final GenomeAnalysisEngine engine, + final Walker walker, + final SAMDataSource reads, + final IndexedFastaSequenceFile reference, + final Collection rods, + final ThreadAllocation threadAllocation) { + this.engine = engine; + this.reads = reads; + this.reference = reference; + this.rods = rods; + + final File progressLogFile = engine.getArguments() == null ? null : engine.getArguments().performanceLog; + + // Creates uninitialized TraversalEngines appropriate for walker and threadAllocation, + // and adds it to the list of created engines for later shutdown. 
+ for ( int i = 0; i < threadAllocation.getNumDataThreads(); i++ ) { + final TraversalEngine traversalEngine = createTraversalEngine(walker, threadAllocation); + allCreatedTraversalEngines.add(traversalEngine); + availableTraversalEngines.add(traversalEngine); + } + + // Create the progress meter, and register it with the analysis engine + engine.registerProgressMeter(new ProgressMeter(progressLogFile, + availableTraversalEngines.peek().getTraversalUnits(), + engine.getRegionsOfGenomeBeingProcessed())); + + // Now that we have a progress meter, go through and initialize the traversal engines + for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines ) + traversalEngine.initialize(engine, walker, engine.getProgressMeter()); + + // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. + // To get around this limitation and since we have no job identifier at this point, register a simple counter that + // will count the number of instances of this object that have been created in this JVM. 
+ int thisInstance = instanceNumber++; + mBeanServer = ManagementFactory.getPlatformMBeanServer(); + try { + mBeanName = new ObjectName("org.broadinstitute.gatk.engine.executive:type=MicroScheduler,instanceNumber="+thisInstance); + mBeanServer.registerMBean(this, mBeanName); + } + catch (JMException ex) { + throw new ReviewedGATKException("Unable to register microscheduler with JMX", ex); + } + } + + /** + * Really make us a traversal engine of the appropriate type for walker and thread allocation + * + * @return a non-null uninitialized traversal engine + */ + @Ensures("result != null") + private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { + if (walker instanceof ReadWalker) { + return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); + } else if (walker instanceof LocusWalker) { + return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); + } else if (walker instanceof DuplicateWalker) { + return new TraverseDuplicates(); + } else if (walker instanceof ReadPairWalker) { + return new TraverseReadPairs(); + } else if (walker instanceof ActiveRegionWalker) { + return new TraverseActiveRegions(threadAllocation.getNumCPUThreadsPerDataThread()); + } else { + throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); + } + } + + + /** + * Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one + * + * @return the monitor, or null if none is active + */ + public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { + return threadEfficiencyMonitor; + } + + /** + * Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses + * + * @param threadEfficiencyMonitor + */ + public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) { + this.threadEfficiencyMonitor = threadEfficiencyMonitor; + } + + /** + * Should we 
stop all execution work and exit gracefully? + * + * Returns true in the case where some external signal or time limit has been received, indicating + * that this GATK shouldn't continue executing. This isn't a kill signal, it is really a "shutdown + * gracefully at the next opportunity" signal. Concrete implementations of the MicroScheduler + * examine this value as often as reasonable and, if it returns true, stop what they are doing + * at the next available opportunity, shutdown their resources, call notify done, and return. + * + * @return true if we should abort execution, or false otherwise + */ + protected boolean abortExecution() { + final boolean abort = engine.exceedsRuntimeLimit(); + if ( abort ) { + final AutoFormattingTime aft = new AutoFormattingTime(engine.getRuntimeLimitInNanoseconds(), -1, 4); + logger.info("Aborting execution (cleanly) because the runtime has exceeded the requested maximum " + aft); + } + return abort; + } + + /** + * Walks a walker over the given list of intervals. + * + * @param walker Computation to perform over dataset. + * @param shardStrategy A strategy for sharding the data. + * + * @return the return type of the walker + */ + public abstract Object execute(Walker walker, Iterable shardStrategy); + + /** + * Tells this MicroScheduler that the execution of one of the subclass of this object as started + * + * Must be called when the implementation of execute actually starts up + * + * Currently only starts the progress meter timer running, but other start up activities could be incorporated + */ + protected void startingExecution() { + engine.getProgressMeter().start(); + } + + /** + * Retrieves the object responsible for tracking and managing output. + * @return An output tracker, for loading data in and extracting results. Will not be null. + */ + public abstract OutputTracker getOutputTracker(); + + /** + * Gets the an iterator over the given reads, which will iterate over the reads in the given shard. 
+ * @param shard the shard to use when querying reads. + * @return an iterator over the reads specified in the shard. + */ + protected GATKSAMIterator getReadIterator(Shard shard) { + return (!reads.isEmpty()) ? reads.seek(shard) : new NullSAMIterator(); + } + + /** + * Must be called by subclasses when execute is done + */ + protected void executionIsDone() { + engine.getProgressMeter().notifyDone(engine.getCumulativeMetrics().getNumIterations()); + printReadFilteringStats(); + shutdownTraversalEngines(); + + // Print out the threading efficiency of this HMS, if state monitoring is enabled + if ( threadEfficiencyMonitor != null ) { + // include the master thread information + threadEfficiencyMonitor.threadIsDone(Thread.currentThread()); + threadEfficiencyMonitor.printUsageInformation(logger); + } + } + + /** + * Shutdown all of the created engines, and clear the list of created engines, dropping + * pointers to the traversal engines + */ + public synchronized void shutdownTraversalEngines() { + for ( final TraversalEngine te : allCreatedTraversalEngines) + te.shutdown(); + + allCreatedTraversalEngines.clear(); + availableTraversalEngines.clear(); + } + + /** + * Prints out information about number of reads observed and filtering, if any reads were used in the traversal + * + * Looks like: + * + * INFO 10:40:47,370 MicroScheduler - 22 reads were filtered out during traversal out of 101 total (21.78%) + * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing BadMateFilter + * INFO 10:40:47,370 MicroScheduler - -> 20 reads (19.80% of total) failing DuplicateReadFilter + * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing FailsVendorQualityCheckFilter + */ + private void printReadFilteringStats() { + final ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); + if ( cumulativeMetrics.getNumReadsSeen() > 0 ) { + // count up the number of skipped reads by summing over all filters + long nSkippedReads = 0L; + for ( final 
long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) + nSkippedReads += countsByFilter; + + logger.info(String.format("%d reads were filtered out during the traversal out of approximately %d total reads (%.2f%%)", + nSkippedReads, + cumulativeMetrics.getNumReadsSeen(), + 100.0 * MathUtils.ratio(nSkippedReads, cumulativeMetrics.getNumReadsSeen()))); + + for ( final Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { + long count = filterCounts.getValue(); + logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", + count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); + } + } + } + + /** + * Gets the engine that created this microscheduler. + * @return The engine owning this microscheduler. + */ + public GenomeAnalysisEngine getEngine() { return engine; } + + /** + * Returns data source maintained by this scheduler + * @return + */ + public SAMDataSource getSAMDataSource() { return reads; } + + /** + * Returns the reference maintained by this scheduler. + * @return The reference maintained by this scheduler. + */ + public IndexedFastaSequenceFile getReference() { return reference; } + + protected void cleanup() { + try { + mBeanServer.unregisterMBean(mBeanName); + } + catch (JMException ex) { + throw new ReviewedGATKException("Unable to unregister microscheduler with JMX", ex); + } + } + + /** + * Returns a traversal engine suitable for use, associated with key + * + * Key is an arbitrary object that is used to retrieve the same traversal + * engine over and over. This can be important in the case where the + * traversal engine has data associated with it in some other context, + * and we need to ensure that the context always sees the same traversal + * engine. This happens in the HierarchicalMicroScheduler, where you want + * the a thread executing traversals to retrieve the same engine each time, + * as outputs are tracked w.r.t. that engine. 
+ * + * If no engine is associated with key yet, pops the next available engine + * from the available ones maintained by this + * microscheduler. Note that it's a runtime error to pop a traversal engine + * from this scheduler if there are none available. Callers that + * once pop'd an engine for use must return it with returnTraversalEngine + * + * @param key the key to associate with this engine + * @return a non-null TraversalEngine suitable for execution in this scheduler + */ + @Ensures("result != null") + protected synchronized TraversalEngine borrowTraversalEngine(final Object key) { + if ( key == null ) throw new IllegalArgumentException("key cannot be null"); + + final TraversalEngine engine = allocatedTraversalEngines.get(key); + if ( engine == null ) { + if ( availableTraversalEngines.isEmpty() ) + throw new IllegalStateException("no traversal engines were available"); + allocatedTraversalEngines.put(key, availableTraversalEngines.pop()); + return allocatedTraversalEngines.get(key); + } else { + return engine; + } + } + + /** + * Return a borrowed traversal engine to this MicroScheduler, for later use + * in another traversal execution + * + * @param key the key used to id the engine, provided to the borrowTraversalEngine function + * @param traversalEngine the borrowed traversal engine. Must have been previously borrowed. + */ + protected synchronized void returnTraversalEngine(final Object key, final TraversalEngine traversalEngine) { + if ( traversalEngine == null ) + throw new IllegalArgumentException("Attempting to push a null traversal engine"); + if ( ! allCreatedTraversalEngines.contains(traversalEngine) ) + throw new IllegalArgumentException("Attempting to push a traversal engine not created by this MicroScheduler" + engine); + if ( ! 
allocatedTraversalEngines.containsKey(key) ) + throw new IllegalArgumentException("No traversal engine was never checked out with key " + key); + + // note there's nothing to actually do here, but a function implementation + // might want to do something + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/MicroSchedulerMBean.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroSchedulerMBean.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/MicroSchedulerMBean.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/MicroSchedulerMBean.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/OutputMergeTask.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/OutputMergeTask.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/OutputMergeTask.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/OutputMergeTask.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/ReduceTree.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/ReduceTree.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/ReduceTree.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/ReduceTree.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/ShardTraverser.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/ShardTraverser.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/ShardTraverser.java rename to 
public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/ShardTraverser.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/TreeReducer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/TreeReducer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/TreeReducer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/TreeReducer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java new file mode 100644 index 000000000..496178d88 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java @@ -0,0 +1,218 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.executive; + +import htsjdk.samtools.util.PeekableIterator; +import org.broadinstitute.gatk.engine.ReadProperties; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecordIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.locusiterator.LocusIterator; +import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; + +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; + +/** + * Transforms an iterator of reads which overlap the given interval list into an iterator of covered single-base loci + * completely contained within the interval list. To do this, it creates a LocusIteratorByState which will emit a single-bp + * locus for every base covered by the read iterator, then uses the WindowMakerIterator.advance() to filter down that stream of + * loci to only those covered by the given interval list. 
+ * + * Example: + * Incoming stream of reads: A:chr20:1-5, B:chr20:2-6, C:chr20:2-7, D:chr20:3-8, E:chr20:5-10 + * Incoming intervals: chr20:3-7 + * + * Locus iterator by state will produce the following stream of data: + * chr1:1 {A}, chr1:2 {A,B,C}, chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, + * chr1:6 {B,C,D,E}, chr1:7 {C,D,E}, chr1:8 {D,E}, chr1:9 {E}, chr1:10 {E} + * + * WindowMakerIterator will then filter the incoming stream, emitting the following stream: + * chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, chr1:6 {B,C,D,E}, chr1:7 {C,D,E} + * + * @author mhanna + * @version 0.1 + */ +public class WindowMaker implements Iterable, Iterator { + /** + * Source information for iteration. + */ + private final ReadProperties sourceInfo; + + /** + * Hold the read iterator so that it can be closed later. + */ + private final GATKSAMRecordIterator readIterator; + + /** + * The data source for reads. Will probably come directly from the BAM file. + */ + private final PeekableIterator sourceIterator; + + /** + * Stores the sequence of intervals that the windowmaker should be tracking. + */ + private final PeekableIterator intervalIterator; + + /** + * In the case of monolithic sharding, this case returns whether the only shard has been generated. + */ + private boolean shardGenerated = false; + + /** + * The alignment context to return from this shard's iterator. Lazy implementation: the iterator will not find the + * currentAlignmentContext until absolutely required to do so. If currentAlignmentContext is null and advance() + * doesn't populate it, no more elements are available. If currentAlignmentContext is non-null, currentAlignmentContext + * should be returned by next(). + */ + private AlignmentContext currentAlignmentContext; + + /** + * Create a new window maker with the given iterator as a data source, covering + * the given intervals. + * @param iterator The data source for this window. 
+ * @param intervals The set of intervals over which to traverse. + * @param sampleNames The complete set of sample names in the reads in shard + */ + + private final LocusIteratorByState libs; + + public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator iterator, List intervals, Collection sampleNames) { + this.sourceInfo = shard.getReadProperties(); + this.readIterator = new GATKSAMRecordIterator(iterator); + + this.libs = new LocusIteratorByState(readIterator, + sourceInfo.getDownsamplingMethod(), sourceInfo.includeReadsWithDeletionAtLoci(), + sourceInfo.keepUniqueReadListInLIBS(), genomeLocParser,sampleNames); + this.sourceIterator = new PeekableIterator(libs); + + this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; + } + + public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator iterator, List intervals ) { + this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + } + + public Iterator iterator() { + return this; + } + + public boolean hasNext() { + return (intervalIterator != null && intervalIterator.hasNext()) || !shardGenerated; + } + + public WindowMakerIterator next() { + shardGenerated = true; + return new WindowMakerIterator(intervalIterator != null ? intervalIterator.next() : null); + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from a window maker."); + } + + public void close() { + this.readIterator.close(); + } + + public class WindowMakerIterator extends LocusIterator { + /** + * The locus for which this iterator is currently returning reads. 
+ */ + private final GenomeLoc locus; + + public WindowMakerIterator(GenomeLoc locus) { + this.locus = locus; + advance(); + } + + public ReadProperties getSourceInfo() { + return sourceInfo; + } + + public GenomeLoc getLocus() { + return locus; + } + + public WindowMakerIterator iterator() { + return this; + } + + public boolean hasNext() { + advance(); + return currentAlignmentContext != null; + } + + public AlignmentContext next() { + if(!hasNext()) throw new NoSuchElementException("WindowMakerIterator is out of elements for this interval."); + + // Consume this alignment context. + AlignmentContext toReturn = currentAlignmentContext; + currentAlignmentContext = null; + + // Return the current element. + return toReturn; + } + + private void advance() { + // Need to find the next element that is not past shard boundaries. If we travel past the edge of + // shard boundaries, stop and let the next interval pick it up. + while(currentAlignmentContext == null && sourceIterator.hasNext()) { + // Advance the iterator and try again. + AlignmentContext candidateAlignmentContext = sourceIterator.peek(); + + if(locus == null) { + // No filter present. Return everything that LocusIteratorByState provides us. + currentAlignmentContext = sourceIterator.next(); + } + else if(locus.isPast(candidateAlignmentContext.getLocation())) + // Found a locus before the current window; claim this alignment context and throw it away. + sourceIterator.next(); + else if(locus.containsP(candidateAlignmentContext.getLocation())) { + // Found a locus within the current window; claim this alignment context and call it the next entry. + currentAlignmentContext = sourceIterator.next(); + } + else if(locus.isBefore(candidateAlignmentContext.getLocation())) { + // Whoops. Skipped passed the end of the region. Iteration for this window is complete. Do + // not claim this alignment context in case it is part of the next shard. 
+ break; + } + else + throw new ReviewedGATKException("BUG: filtering locus does not contain, is not before, and is not past the given alignment context"); + } + } + + @Override + public LocusIteratorByState getLIBS() { + return libs; + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/executive/package-info.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java new file mode 100644 index 000000000..f0e889a63 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BAQReadTransformer.java @@ -0,0 +1,75 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.WalkerManager; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.walkers.BAQMode; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.baq.BAQ; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +/** + * Applies Heng's BAQ calculation to a stream of incoming reads + */ +public class BAQReadTransformer extends ReadTransformer { + private BAQ baqHMM; + private IndexedFastaSequenceFile refReader; + private BAQ.CalculationMode cmode; + private BAQ.QualityMode qmode; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + final BAQMode mode = WalkerManager.getWalkerAnnotation(walker, BAQMode.class); + this.refReader = engine.getReferenceDataSource().getReference(); + this.cmode = engine.getArguments().BAQMode; + this.qmode = mode.QualityMode(); + baqHMM = new BAQ(engine.getArguments().BAQGOP); + + if ( qmode == BAQ.QualityMode.DONT_MODIFY ) + throw new ReviewedGATKException("BUG: shouldn't create BAQ transformer with quality mode DONT_MODIFY"); + + if ( mode.ApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN && enabled() ) + throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + cmode + " was requested."); + + return 
mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return cmode != BAQ.CalculationMode.OFF; + } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { + baqHMM.baqRead(read, refReader, cmode, qmode); + return read; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadCigarFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/BadMateFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/CountingFilteringIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/CountingFilteringIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/CountingFilteringIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/CountingFilteringIterator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java rename to 
public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/DuplicateReadFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FailsVendorQualityCheckFilter.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java new file mode 100644 index 000000000..90d8a3fd8 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java @@ -0,0 +1,106 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import org.broadinstitute.gatk.utils.classloader.PluginManager; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.help.GATKDocUtils; +import org.broadinstitute.gatk.utils.help.HelpConstants; + +import java.util.Collection; +import java.util.List; + +/** + * Manage filters and filter options. Any requests for basic filtering classes + * should ultimately be made through this class. + * + * @author mhanna + * @version 0.1 + */ +public class FilterManager extends PluginManager { + public FilterManager() { + super(ReadFilter.class,"filter","Filter"); + } + + /** + * Instantiate a filter of the given type. Along the way, scream bloody murder if + * the filter is not available. + * @param filterType The type of the filter + * @return The filter + */ + public ReadFilter createFilterByType(Class filterType) { + return this.createByName(getName(filterType)); + } + + public Collection> getValues() { + return this.getPlugins(); + } + + /** + * Rather than use the default error message, print out a list of read filters as well. + * @param pluginCategory - string, the category of the plugin (e.g. read filter) + * @param pluginName - string, what we were trying to match (but failed to) + * @return - A wall of text with the default message, followed by a listing of available read filters + */ + @Override + protected String formatErrorMessage(String pluginCategory, String pluginName) { + List> availableFilters = this.getPluginsImplementing(ReadFilter.class); + + + return String.format("Read filter %s not found. 
Available read filters:%n%n%s%n%n%s",pluginName, + userFriendlyListofReadFilters(availableFilters), + "Please consult the GATK Documentation (" + HelpConstants.GATK_DOCS_URL + ") for more information."); + } + + /** + * Rather than use the default exception, return a MalformedReadFilterException. + * @param errorMessage error message from formatErrorMessage() + * @return - A MalformedReadFilterException with errorMessage + */ + @Override + protected UserException createMalformedArgumentException(final String errorMessage) { + return new UserException.MalformedReadFilterException(errorMessage); + } + + private String userFriendlyListofReadFilters(List> filters) { + final String headName = "FilterName", headDoc = "Documentation"; + int longestNameLength = -1; + for ( Class < ? extends ReadFilter> filter : filters ) { + longestNameLength = Math.max(longestNameLength,this.getName(filter).length()); + } + String format = " %"+longestNameLength+"s %s%n"; + + StringBuilder listBuilder = new StringBuilder(); + listBuilder.append(String.format(format,headName,headDoc)); + for ( Class filter : filters ) { + String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter); + String filterName = this.getName(filter); + listBuilder.append(String.format(format,filterName,helpLink)); + } + + return listBuilder.toString(); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/LibraryReadFilter.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java new file 
mode 100644 index 000000000..6488a857a --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java @@ -0,0 +1,258 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.ReadProperties; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +/** + * Filter out malformed reads. 
+ * + * @author mhanna + * @version 0.1 + */ +public class MalformedReadFilter extends ReadFilter { + + + private static final String FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME = "filter_reads_with_N_cigar" ; + + private SAMFileHeader header; + + @Argument(fullName = FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME, shortName = "filterRNC", doc = "filter out reads with CIGAR containing the N operator, instead of stop processing and report an error.", required = false) + boolean filterReadsWithNCigar = false; + + + @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false) + boolean filterMismatchingBaseAndQuals = false; + + @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "if a read has no stored bases (i.e. a '*'), filter out the read instead of blowing up.", required = false) + boolean filterBasesNotStored = false; + + /** + * Indicates the applicable validation exclusions + */ + private boolean allowNCigars; + + @Override + public void initialize(final GenomeAnalysisEngine engine) { + header = engine.getSAMFileHeader(); + ValidationExclusion validationExclusions = null; + final SAMDataSource rds = engine.getReadsDataSource(); + if (rds != null) { + final ReadProperties rps = rds.getReadsInfo(); + if (rps != null) { + validationExclusions = rps.getValidationExclusionList(); + } + } + if (validationExclusions == null) { + allowNCigars = false; + } else { + allowNCigars = validationExclusions.contains(ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS); + } + } + + public boolean filterOut(final SAMRecord read) { + // slowly changing the behavior to blow up first and filtering out if a parameter is explicitly provided + return !checkInvalidAlignmentStart(read) || + !checkInvalidAlignmentEnd(read) || + !checkAlignmentDisagreesWithHeader(this.header,read) || + !checkHasReadGroup(read) || 
+ !checkMismatchingBasesAndQuals(read, filterMismatchingBaseAndQuals) || + !checkCigarDisagreesWithAlignment(read) || + !checkSeqStored(read, filterBasesNotStored) || + !checkCigarIsSupported(read,filterReadsWithNCigar,allowNCigars); + } + + private static boolean checkHasReadGroup(final SAMRecord read) { + if ( read.getReadGroup() == null ) { + // there are 2 possibilities: either the RG tag is missing or it is not defined in the header + final String rgID = (String)read.getAttribute(SAMTagUtil.getSingleton().RG); + if ( rgID == null ) + throw new UserException.ReadMissingReadGroup(read); + throw new UserException.ReadHasUndefinedReadGroup(read, rgID); + } + return true; + } + + /** + * Check for the case in which the alignment start is inconsistent with the read unmapped flag. + * @param read The read to validate. + * @return true if read start is valid, false otherwise. + */ + private static boolean checkInvalidAlignmentStart(final SAMRecord read ) { + // read is not flagged as 'unmapped', but alignment start is NO_ALIGNMENT_START + if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) + return false; + // Read is not flagged as 'unmapped', but alignment start is -1 + if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == -1 ) + return false; + return true; + } + + /** + * Check for invalid end of alignments. + * @param read The read to validate. + * @return true if read end is valid, false otherwise. + */ + private static boolean checkInvalidAlignmentEnd(final SAMRecord read ) { + // Alignment aligns to negative number of bases in the reference. + if( !read.getReadUnmappedFlag() && read.getAlignmentEnd() != -1 && (read.getAlignmentEnd()-read.getAlignmentStart()+1)<0 ) + return false; + return true; + } + + /** + * Check to ensure that the alignment makes sense based on the contents of the header. + * @param header The SAM file header. + * @param read The read to verify. 
+ * @return true if alignment agrees with header, false othrewise. + */ + private static boolean checkAlignmentDisagreesWithHeader(final SAMFileHeader header, final SAMRecord read ) { + // Read is aligned to nonexistent contig + if( read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) + return false; + final SAMSequenceRecord contigHeader = header.getSequence( read.getReferenceIndex() ); + // Read is aligned to a point after the end of the contig + if( !read.getReadUnmappedFlag() && read.getAlignmentStart() > contigHeader.getSequenceLength() ) + return false; + return true; + } + + /** + * Check for inconsistencies between the cigar string and the + * @param read The read to validate. + * @return true if cigar agrees with alignment, false otherwise. + */ + private static boolean checkCigarDisagreesWithAlignment(final SAMRecord read) { + // Read has a valid alignment start, but the CIGAR string is empty + if( !read.getReadUnmappedFlag() && + read.getAlignmentStart() != -1 && + read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START && + read.getAlignmentBlocks().size() < 0 ) + return false; + return true; + } + + /** + * Check for unsupported CIGAR operators. + * Currently the N operator is not supported. + * @param read The read to validate. + * @param filterReadsWithNCigar whether the offending read should just + * be silently filtered or not. + * @param allowNCigars whether reads that contain N operators in their CIGARs + * can be processed or an exception should be thrown instead. + * @throws UserException.UnsupportedCigarOperatorException + * if {@link #filterReadsWithNCigar} is false and + * the input read has some unsupported operation. + * @return true if the read CIGAR operations are + * fully supported, otherwise false, as long as + * no exception has been thrown. 
+ */ + private static boolean checkCigarIsSupported(final SAMRecord read, final boolean filterReadsWithNCigar, final boolean allowNCigars) { + if( containsNOperator(read)) { + if (! filterReadsWithNCigar && !allowNCigars) { + throw new UserException.UnsupportedCigarOperatorException( + CigarOperator.N,read, + "Perhaps you are" + + " trying to use RNA-Seq data?" + + " While we are currently actively working to" + + " support this data type unfortunately the" + + " GATK cannot be used with this data in its" + + " current form. You have the option of either" + + " filtering out all reads with operator " + + CigarOperator.N + " in their CIGAR string" + + " (please add --" + + FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME + + " to your command line) or" + + " assume the risk of processing those reads as they" + + " are including the pertinent unsafe flag (please add -U" + + ' ' + ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS + + " to your command line). Notice however that if you were" + + " to choose the latter, an unspecified subset of the" + + " analytical outputs of an unspecified subset of the tools" + + " will become unpredictable. Consequently the GATK team" + + " might well not be able to provide you with the usual support" + + " with any issue regarding any output"); + } + return ! filterReadsWithNCigar; + } + return true; + } + + private static boolean containsNOperator(final SAMRecord read) { + final Cigar cigar = read.getCigar(); + if (cigar == null) { + return false; + } + for (final CigarElement ce : cigar.getCigarElements()) { + if (ce.getOperator() == CigarOperator.N) { + return true; + } + } + return false; + } + + /** + * Check if the read has the same number of bases and base qualities + * @param read the read to validate + * @return true if they have the same number. False otherwise. 
+ */ + private static boolean checkMismatchingBasesAndQuals(final SAMRecord read, final boolean filterMismatchingBaseAndQuals) { + final boolean result; + if (read.getReadLength() == read.getBaseQualities().length) + result = true; + else if (filterMismatchingBaseAndQuals) + result = false; + else + throw new UserException.MalformedBAM(read, + String.format("BAM file has a read with mismatching number of bases and base qualities. Offender: %s [%d bases] [%d quals].%s", + read.getReadName(), read.getReadLength(), read.getBaseQualities().length, + read.getBaseQualities().length == 0 ? " You can use --defaultBaseQualities to assign a default base quality for all reads, but this can be dangerous in you don't know what you are doing." : "")); + + return result; + } + + /** + * Check if the read has its base sequence stored + * @param read the read to validate + * @return true if the sequence is stored and false otherwise ("*" in the SEQ field). + */ + protected static boolean checkSeqStored(final SAMRecord read, final boolean filterBasesNotStored) { + + if ( read.getReadBases() != SAMRecord.NULL_SEQUENCE ) + return true; + + if ( filterBasesNotStored ) + return false; + + throw new UserException.MalformedBAM(read, String.format("the BAM file has a read with no stored bases (i.e. it uses '*') which is not supported in the GATK; see the --filter_bases_not_stored argument. 
Offender: %s", read.getReadName())); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityUnavailableFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MappingQualityZeroFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MateSameStrandFilter.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MaxInsertSizeFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/MissingReadGroupFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NoOriginalQualityScoresFilter.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NotPrimaryAlignmentFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/Platform454Filter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilterHelper.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilterHelper.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilterHelper.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/PlatformUnitFilterHelper.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadLengthFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadNameFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReadStrandFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignMappingQualityFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/ReassignOneMappingQualityFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java rename to 
public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SampleFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/SingleReadGroupFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/UnmappedReadFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/package-info.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/BySampleSAMFileWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/BySampleSAMFileWriter.java new file mode 100644 index 000000000..6dd8833b8 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/BySampleSAMFileWriter.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated 
documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMProgramRecord; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.HashMap; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. 
+ * User: carneiro + * Date: Nov 13 + */ +public class BySampleSAMFileWriter extends NWaySAMFileWriter { + + private final Map sampleToWriterMap; + + public BySampleSAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + super(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, pRecord, keep_records); + + sampleToWriterMap = new HashMap(toolkit.getSAMFileHeader().getReadGroups().size() * 2); + + for (SAMReaderID readerID : toolkit.getReadsDataSource().getReaderIDs()) { + for (SAMReadGroupRecord rg : toolkit.getReadsDataSource().getHeader(readerID).getReadGroups()) { + String sample = rg.getSample(); + if (sampleToWriterMap.containsKey(sample) && sampleToWriterMap.get(sample) != readerID) { + throw new ReviewedGATKException("The same sample appears in multiple files, this input cannot be multiplexed using the BySampleSAMFileWriter, try NWaySAMFileWriter instead."); + } + else { + sampleToWriterMap.put(sample, readerID); + } + } + } + } + + @Override + public void addAlignment(SAMRecord samRecord) { + super.addAlignment(samRecord, sampleToWriterMap.get(samRecord.getReadGroup().getSample())); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/DirectOutputTracker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/DirectOutputTracker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/DirectOutputTracker.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/DirectOutputTracker.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/FastqFileWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/FastqFileWriter.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/FastqFileWriter.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/FastqFileWriter.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/NWaySAMFileWriter.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/NWaySAMFileWriter.java new file mode 100644 index 000000000..011963ecc --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/NWaySAMFileWriter.java @@ -0,0 +1,257 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.io; + +import htsjdk.samtools.*; +import htsjdk.samtools.util.ProgressLoggerInterface; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.utils.exceptions.GATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.text.TextFormattingUtils; + +import java.io.File; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: May 31, 2011 + * Time: 3:52:49 PM + * To change this template use File | Settings | File Templates. + */ +public class NWaySAMFileWriter implements SAMFileWriter { + + private Map writerMap = null; + private boolean presorted ; + GenomeAnalysisEngine toolkit; + boolean KEEP_ALL_PG_RECORDS = false; + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + this.presorted = presorted; + this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; + writerMap = new HashMap(); + setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { + this.presorted = presorted; + this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; + writerMap = new HashMap(); + setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5) { + this(toolkit, in2out, order, presorted, 
indexOnTheFly, generateMD5, null,false); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5) { + this(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, null,false); + } + + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets + * up the writer with the header and presorted status. + * + * @param originalHeader original header + * @param programRecord the program record for this program + */ + public static SAMFileHeader setupWriter(final SAMFileHeader originalHeader, final SAMProgramRecord programRecord) { + final SAMFileHeader header = originalHeader.clone(); + final List oldRecords = header.getProgramRecords(); + final List newRecords = new ArrayList(oldRecords.size()+1); + for ( SAMProgramRecord record : oldRecords ) + if ( (programRecord != null && !record.getId().startsWith(programRecord.getId()))) + newRecords.add(record); + + if (programRecord != null) { + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + } + return header; + } + + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and returns + * the new header to be added to the BAM writer. 
+ * + * @param toolkit the engine + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + * @return a pre-filled header for the bam writer + */ + public static SAMFileHeader setupWriter(final GenomeAnalysisEngine toolkit, final SAMFileHeader originalHeader, final Object walker, final String PROGRAM_RECORD_NAME) { + final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); + return setupWriter(originalHeader, programRecord); + } + + /** + * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets + * up the writer with the header and presorted status. + * + * @param writer BAM file writer + * @param toolkit the engine + * @param preSorted whether or not the writer can assume reads are going to be added are already sorted + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + */ + public static void setupWriter(GATKSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, Object walker, String PROGRAM_RECORD_NAME) { + SAMFileHeader header = setupWriter(toolkit, originalHeader, walker, PROGRAM_RECORD_NAME); + writer.writeHeader(header); + writer.setPresorted(preSorted); + } + + /** + * Creates a program record (@PG) tag + * + * @param toolkit the engine + * @param walker the walker object (so we can extract the command line) + * @param PROGRAM_RECORD_NAME the name for the PG tag + * @return a program record for the tool + */ + public static SAMProgramRecord createProgramRecord(GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) { + final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); + final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("GATKText"); + try { + final String version = 
headerInfo.getString("org.broadinstitute.gatk.engine.version"); + programRecord.setProgramVersion(version); + } catch (MissingResourceException e) { + // couldn't care less if the resource is missing... + } + programRecord.setCommandLine(toolkit.createApproximateCommandLineArgumentString(toolkit, walker)); + return programRecord; + } + + /** + * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved + * from toolkit). The in2out map must contain an entry for each input filename and map it + * onto a unique output file name. + * @param toolkit + * @param in2out + */ + public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { + if ( in2out==null ) throw new GATKException("input-output bam filename map for n-way-out writing is NULL"); + for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { + + String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); + + String outName; + if ( ! in2out.containsKey(fName) ) + throw new UserException.BadInput("Input-output bam filename map does not contain an entry for the input file "+fName); + outName = in2out.get(fName); + + if ( writerMap.containsKey( rid ) ) + throw new GATKException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+ + "map file likely contains multiple entries for this input file"); + + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + } + + /** + * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved + * from toolkit). The output file names will be generated automatically by stripping ".sam" or ".bam" off the + * input file name and adding ext instead (e.g. ".cleaned.bam"). + * onto a unique output file name. 
+ * @param toolkit + * @param ext + */ + public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { + for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { + + String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); + + String outName; + int pos ; + if ( fName.toUpperCase().endsWith(".BAM") ) pos = fName.toUpperCase().lastIndexOf(".BAM"); + else { + if ( fName.toUpperCase().endsWith(".SAM") ) pos = fName.toUpperCase().lastIndexOf(".SAM"); + else throw new UserException.BadInput("Input file name "+fName+" does not end with .sam or .bam"); + } + String prefix = fName.substring(0,pos); + outName = prefix+ext; + + if ( writerMap.containsKey( rid ) ) + throw new GATKException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); + } + + } + + private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, + boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { + File f = new File(outName); + SAMFileHeader header = setupWriter(toolkit.getSAMFileHeader(id), programRecord); + SAMFileWriterFactory factory = new SAMFileWriterFactory(); + factory.setCreateIndex(indexOnTheFly); + factory.setCreateMd5File(generateMD5); + SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f); + writerMap.put(id,sw); + } + + public Collection getWriters() { + return writerMap.values(); + } + + public void addAlignment(SAMRecord samRecord) { + final SAMReaderID id = toolkit.getReaderIDForRead(samRecord); + String rg = samRecord.getStringAttribute("RG"); + if ( rg != null ) { + String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg); + samRecord.setAttribute("RG",rg_orig); + } + addAlignment(samRecord, id); + } + + public void 
addAlignment(SAMRecord samRecord, SAMReaderID readerID) { + writerMap.get(readerID).addAlignment(samRecord); + } + + public SAMFileHeader getFileHeader() { + return toolkit.getSAMFileHeader(); + } + + public void close() { + for ( SAMFileWriter w : writerMap.values() ) w.close(); + } + + @Override + public void setProgressLogger(final ProgressLoggerInterface logger) { + for (final SAMFileWriter writer: writerMap.values()) { + writer.setProgressLogger(logger); + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/OutputTracker.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/ThreadGroupOutputTracker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/ThreadGroupOutputTracker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/ThreadGroupOutputTracker.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/ThreadGroupOutputTracker.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/OutputStreamStorage.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/OutputStreamStorage.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/OutputStreamStorage.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/OutputStreamStorage.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/SAMFileWriterStorage.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/Storage.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/Storage.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/Storage.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/Storage.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/StorageFactory.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/StorageFactory.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/StorageFactory.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/StorageFactory.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java new file mode 100644 index 000000000..a54d2ffac --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java @@ -0,0 +1,228 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, 
sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.storage; + +import htsjdk.samtools.util.BlockCompressedOutputStream; +import org.apache.log4j.Logger; +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.Feature; +import htsjdk.tribble.FeatureCodec; +import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub; +import org.broadinstitute.gatk.utils.refdata.tracks.FeatureManager; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import htsjdk.variant.bcf2.BCF2Utils; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; +import htsjdk.variant.vcf.VCFHeader; + +import java.io.*; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.List; + +/** + * Provides temporary and permanent storage for genotypes in VCF format. 
+ * + * @author mhanna + * @version 0.1 + */ +public class VariantContextWriterStorage implements Storage, VariantContextWriter { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class); + + private final static int BUFFER_SIZE = 1048576; + + protected final File file; + protected OutputStream stream; + protected final VariantContextWriter writer; + boolean closed = false; + + /** + * Constructs an object which will write directly into the output file provided by the stub. + * Intentionally delaying the writing of the header -- this should be filled in by the walker. + * + * Respecs the isCompressed() request in stub, so if isCompressed() is true then this + * will create a storage output that dumps output to a BlockCompressedOutputStream. + * + * @param stub Stub to use when constructing the output file. + */ + public VariantContextWriterStorage(VariantContextWriterStub stub) { + if ( stub.getOutputFile() != null ) { + this.file = stub.getOutputFile(); + writer = vcfWriterToFile(stub,stub.getOutputFile(),true,true); + } + else if ( stub.getOutputStream() != null ) { + this.file = null; + this.stream = stub.getOutputStream(); + writer = VariantContextWriterFactory.create(stream, + stub.getMasterSequenceDictionary(), stub.getWriterOptions(false)); + } + else + throw new ReviewedGATKException("Unable to create target to which to write; storage was provided with neither a file nor a stream."); + } + + /** + * Constructs an object which will redirect into a different file. + * + * Note that this function does not respect the isCompressed() request from the stub, in order + * to ensure that tmp. files can be read back in by the Tribble system, and merged with the mergeInto function. + * + * @param stub Stub to use when synthesizing file / header info. + * @param tempFile File into which to direct the output data. 
+ */ + public VariantContextWriterStorage(VariantContextWriterStub stub, File tempFile) { + //logger.debug("Creating temporary output file " + tempFile.getAbsolutePath() + " for VariantContext output."); + this.file = tempFile; + this.writer = vcfWriterToFile(stub, file, false, false); + writer.writeHeader(stub.getVCFHeader()); + } + + /** + * common initialization routine for multiple constructors + * @param stub Stub to use when constructing the output file. + * @param file Target file into which to write VCF records. + * @param indexOnTheFly true to index the file on the fly. NOTE: will be forced to false for compressed files. + * @param allowCompressed if false, we won't compress the output, even if the stub requests it. Critical + * for creating temp. output files that will be subsequently merged, as these do not + * support compressed output + * @return A VCF writer for use with this class + */ + private VariantContextWriter vcfWriterToFile(final VariantContextWriterStub stub, + final File file, + final boolean indexOnTheFly, + final boolean allowCompressed) { + try { + // we cannot merge compressed outputs, so don't compress if allowCompressed is false, + // which is the case when we have a temporary output file for later merging + if ( allowCompressed && stub.isCompressed() ) + stream = new BlockCompressedOutputStream(file); + else + stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE)); + } + catch(IOException ex) { + throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex); + } + + EnumSet options = stub.getWriterOptions(indexOnTheFly); + VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options); + + // if the stub says to test BCF, create a secondary writer to BCF and an 2 way out writer to send to both + // TODO -- remove me when argument generateShadowBCF is removed + if ( 
stub.alsoWriteBCFForTest() && ! VariantContextWriterFactory.isBCFOutput(file, options)) { + final File bcfFile = BCF2Utils.shadowBCF(file); + if ( bcfFile != null ) { + FileOutputStream bcfStream; + try { + bcfStream = new FileOutputStream(bcfFile); + } catch (FileNotFoundException e) { + throw new RuntimeException(bcfFile + ": Unable to create BCF writer", e); + } + + VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, bcfStream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options); + writer = new TestWriter(writer, bcfWriter); + } + } + + return writer; + } + + private final static class TestWriter implements VariantContextWriter { + final List writers; + + private TestWriter(final VariantContextWriter ... writers) { + this.writers = Arrays.asList(writers); + } + + @Override + public void writeHeader(final VCFHeader header) { + for ( final VariantContextWriter writer : writers ) writer.writeHeader(header); + } + + @Override + public void close() { + for ( final VariantContextWriter writer : writers ) writer.close(); + } + + @Override + public void add(final VariantContext vc) { + for ( final VariantContextWriter writer : writers ) writer.add(vc); + } + } + + public void add(VariantContext vc) { + if ( closed ) throw new ReviewedGATKException("Attempting to write to a closed VariantContextWriterStorage " + vc.getStart() + " storage=" + this); + writer.add(vc); + } + + /** + * initialize this VCF header + * + * @param header the header + */ + public void writeHeader(VCFHeader header) { + writer.writeHeader(header); + } + + /** + * Close the VCF storage object. + */ + public void close() { + writer.close(); + closed = true; + } + + public void mergeInto(VariantContextWriterStorage target) { + try { + if ( ! closed ) + throw new ReviewedGATKException("Writer not closed, but we are merging into the file!"); + final String targetFilePath = target.file != null ? 
target.file.getAbsolutePath() : "/dev/stdin"; + logger.debug(String.format("Merging VariantContextWriterStorage from %s into %s", file.getAbsolutePath(), targetFilePath)); + + // use the feature manager to determine the right codec for the tmp file + // that way we don't assume it's a specific type + final FeatureManager.FeatureDescriptor fd = new FeatureManager().getByFiletype(file); + if ( fd == null ) + throw new UserException.LocalParallelizationProblem(file); + + final FeatureCodec codec = fd.getCodec(); + final AbstractFeatureReader source = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), codec, false); + + for ( final Feature vc : source.iterator() ) { + target.writer.add((VariantContext) vc); + } + + source.close(); + file.delete(); // this should be last to aid in debugging when the process fails + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e); + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamArgumentTypeDescriptor.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamArgumentTypeDescriptor.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamArgumentTypeDescriptor.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamArgumentTypeDescriptor.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamStub.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamStub.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamStub.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/OutputStreamStub.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileReaderArgumentTypeDescriptor.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileReaderArgumentTypeDescriptor.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileReaderArgumentTypeDescriptor.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileReaderArgumentTypeDescriptor.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java new file mode 100644 index 000000000..dc4824c08 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java @@ -0,0 +1,106 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.stubs; + +import htsjdk.samtools.SAMFileWriter; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.io.OutputStream; +import java.lang.reflect.Type; + +/** + * Insert a SAMFileWriterStub instead of a full-fledged concrete OutputStream implementations. + */ +public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { + + /** + * The engine into which output stubs should be fed. + */ + private final GenomeAnalysisEngine engine; + + /** + * The default location to which data should be written if the user specifies no such location. + */ + private final OutputStream defaultOutputStream; + + /** + * Create a new SAMFileWriter argument, notifying the given engine when that argument has been created. + * @param engine Engine to add SAMFileWriter output to. 
+ * @param defaultOutputStream the target for the data + */ + public SAMFileWriterArgumentTypeDescriptor( GenomeAnalysisEngine engine, OutputStream defaultOutputStream ) { + this.engine = engine; + this.defaultOutputStream = defaultOutputStream; + } + + @Override + public boolean supports( Class type ) { + return SAMFileWriter.class.equals(type) || GATKSAMFileWriter.class.equals(type); + } + + @Override + public boolean createsTypeDefault(ArgumentSource source) { + return !source.isRequired() && source.defaultsToStdout(); + } + + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "stdout"; + } + + @Override + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { + if(source.isRequired() || !source.defaultsToStdout()) + throw new ReviewedGATKException("BUG: tried to create type default for argument type descriptor that can't support a type default."); + SAMFileWriterStub stub = new SAMFileWriterStub(engine,defaultOutputStream); + engine.addOutput(stub); + return stub; + } + + @Override + public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { + // Extract all possible parameters that could be passed to a BAM file writer? + ArgumentDefinition bamArgumentDefinition = createDefaultArgumentDefinition(source); + ArgumentMatchValue writerFileName = getArgumentValue( bamArgumentDefinition, matches ); + + // Create the stub + SAMFileWriterStub stub = null; // stub = new SAMFileWriterStub(engine, defaultOutputStream); + + if (writerFileName != null && writerFileName.asFile() != null ) { + stub = new SAMFileWriterStub(engine, writerFileName.asFile()); + + // WARNING: Side effects required by engine! 
+ parsingEngine.addTags(stub,getArgumentTags(matches)); + engine.addOutput(stub); + } + + return stub; + } + +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java new file mode 100644 index 000000000..bc4c0f5d3 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java @@ -0,0 +1,357 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.io.stubs; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFileWriter; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.ProgressLoggerInterface; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.engine.io.OutputTracker; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.utils.baq.BAQ; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.io.File; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.List; + +/** + * A stub for routing and management of SAM file reading and writing. + * + * @author mhanna + * @version 0.1 + */ +public class SAMFileWriterStub implements Stub, GATKSAMFileWriter { + /** + * Engine to use for collecting attributes for the output SAM file. + */ + private final GenomeAnalysisEngine engine; + + /** + * A header supplied by the user that overrides the merged header from the input BAM. + */ + private SAMFileHeader headerOverride = null; + + /** + * The sam file that this stub should write to. Should be passed along to + * whatever happens to create the StreamConnector. + */ + private final File samFile; + + /** + * The target output stream, to be used in place of the SAM file. + */ + private final OutputStream samOutputStream; + + /** + * The validation stringency to apply when reading this file. + */ + private Integer compressionLevel = null; + + /** + * Should the GATK index the output BAM on-the-fly? + */ + private boolean indexOnTheFly = false; + + /** + * Should the GATK generate an md5 for the output BAM? 
+ */ + private boolean generateMD5 = false; + + /** + * Should this BAM be presorted? + */ + private boolean presorted = true; + + /** + * How many records should the BAM writer store in RAM while + * sorting the BAM on-the-fly? + */ + private Integer maxRecordsInRam = null; + + /** + * Connects this stub with an external stream capable of serving the + * requests of the consumer of this stub. + */ + private OutputTracker outputTracker = null; + + /** + * Has the write started? If so, throw an exception if someone tries to + * change write parameters to the file (compression level, presorted flag, + * header, etc). + */ + private boolean writeStarted = false; + + + /** + * HMM for BAQ, if needed + */ + BAQ baqHMM = new BAQ(); + + /** + * Should we simplify the BAM file while writing it out? + */ + private boolean simplifyBAM = false; + + private List onOutputReadTransformers = null; + + /** + * Create a new stub given the requested SAM file and compression level. + * @param engine source of header data, maybe other data about input files. + * @param samFile SAM file to (ultimately) create. + */ + public SAMFileWriterStub( GenomeAnalysisEngine engine, File samFile ) { + this(engine, samFile, null); + } + + /** + * Create a new stub given the requested SAM file and compression level. + * @param engine source of header data, maybe other data about input files. + * @param stream Output stream to which data should be written. 
+ */ + public SAMFileWriterStub( GenomeAnalysisEngine engine, OutputStream stream ) { + this(engine, null, stream); + } + + private SAMFileWriterStub(final GenomeAnalysisEngine engine, final File samFile, final OutputStream stream) { + this.engine = engine; + this.samFile = samFile; + this.samOutputStream = stream; + } + + /** + * Creates a SAMFileWriter using all of the features currently set in the engine (command line arguments, ReadTransformers, etc) + * @param file the filename to write to + * @param engine the engine + * @return a SAMFileWriter with the correct options set + */ + public static SAMFileWriter createSAMFileWriter(final String file, final GenomeAnalysisEngine engine) { + final SAMFileWriterStub output = new SAMFileWriterStub(engine, new File(file)); + output.processArguments(engine.getArguments()); + return output; + } + + /** + * As {@link #createSAMFileWriter(String, org.broadinstitute.gatk.engine.GenomeAnalysisEngine)}, but also sets the header + */ + public static SAMFileWriter createSAMFileWriter(final String file, final GenomeAnalysisEngine engine, final SAMFileHeader header) { + final SAMFileWriterStub output = (SAMFileWriterStub) createSAMFileWriter(file, engine); + output.writeHeader(header); + return output; + } + + /** + * Retrieves the SAM file to (ultimately) be created. + * @return The SAM file. Must not be null. + */ + public File getOutputFile() { + return samFile; + } + + public boolean simplifyBAM() { + return simplifyBAM; + } + + public void setSimplifyBAM(boolean v) { + simplifyBAM = v; + } + + public OutputStream getOutputStream() { + return samOutputStream; + } + + /** + * Retrieves the header to use when creating the new SAM file. + * @return header to use when creating the new SAM file. + */ + public SAMFileHeader getFileHeader() { + return headerOverride != null ? headerOverride : engine.getSAMFileHeader(); + } + + /** + * Retrieves the desired compression level for + * @return The current compression level. 
Could be null if the user doesn't care. + */ + public Integer getCompressionLevel() { + return compressionLevel; + } + + /** + * Sets the desired compression level. + * @param compressionLevel The suggested compression level. + */ + public void setCompressionLevel( Integer compressionLevel ) { + if(writeStarted) + throw new ReviewedGATKException("Attempted to change the compression level of a file with alignments already in it."); + this.compressionLevel = compressionLevel; + } + + /** + * Gets whether to index this output stream on-the-fly. + * @return True means create an index. False means skip index creation. + */ + public Boolean getIndexOnTheFly() { + return indexOnTheFly; + } + + /** + * Controls whether to index this output stream on-the-fly. + * @param indexOnTheFly True means create an index. False means skip index creation. + */ + public void setIndexOnTheFly( boolean indexOnTheFly ) { + if(writeStarted) + throw new UserException("Attempted to index a BAM on the fly of a file with alignments already in it."); + this.indexOnTheFly = indexOnTheFly; + } + + /** + * Gets whether to generate an md5 on-the-fly for this BAM. + * @return True generates the md5. False means skip writing the file. + */ + public Boolean getGenerateMD5() { + return generateMD5; + } + + /** + * Gets whether to generate an md5 on-the-fly for this BAM. + * @param generateMD5 True generates the md5. False means skip writing the file. + */ + public void setGenerateMD5(boolean generateMD5) { + if(writeStarted) + throw new UserException("Attempted to turn on md5 generation for BAM file with alignments already in it."); + this.generateMD5 = generateMD5; + } + + /** + * Whether the BAM file to create is actually presorted. + * @return True if the BAM file is presorted. False otherwise. + */ + public boolean isPresorted() { + return this.presorted; + } + + /** + * Set Whether the BAM file to create is actually presorted. + * @param presorted True if the BAM file is presorted. False otherwise. 
+ */ + public void setPresorted(boolean presorted) { + if(writeStarted) + throw new ReviewedGATKException("Attempted to change the presorted state of a file with alignments already in it."); + this.presorted = presorted; + } + + /** + * Get the maximum number of reads to hold in RAM when sorting a BAM on-the-fly. + * @return Max records in RAM, or null if unset. + */ + public Integer getMaxRecordsInRam() { + return this.maxRecordsInRam; + } + + /** + * Sets the maximum number of reads to hold in RAM when sorting a BAM on-the-fly. + * @param maxRecordsInRam Max number of records in RAM. + */ + public void setMaxRecordsInRam(int maxRecordsInRam) { + if(writeStarted) + throw new ReviewedGATKException("Attempted to change the max records in RAM of a file with alignments already in it."); + this.maxRecordsInRam = maxRecordsInRam; + } + + /** + * Registers the given streamConnector with this stub. + * @param outputTracker The connector used to provide an appropriate stream. + */ + public void register( OutputTracker outputTracker ) { + this.outputTracker = outputTracker; + } + + @Override + public void processArguments( final GATKArgumentCollection argumentCollection ) { + if (argumentCollection.bamCompression != null) + setCompressionLevel(argumentCollection.bamCompression); + setGenerateMD5(argumentCollection.enableBAMmd5); + setIndexOnTheFly(!argumentCollection.disableBAMIndexing); + setSimplifyBAM(argumentCollection.simplifyBAM); + + } + + /** + * Use the given header as the target for this writer. + * @param header The header to write. 
+ */ + public void writeHeader(SAMFileHeader header) { + if(writeStarted) + throw new ReviewedGATKException("Attempted to change the header of a file with alignments already in it."); + this.headerOverride = header; + } + + private void initializeReadTransformers() { + this.onOutputReadTransformers = new ArrayList<>(engine.getReadTransformers().size()); + for ( final ReadTransformer transformer : engine.getReadTransformers() ) { + if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_OUTPUT ) + onOutputReadTransformers.add(transformer); + } + } + + /** + * @{inheritDoc} + */ + public void addAlignment( final SAMRecord readIn ) { + if ( onOutputReadTransformers == null ) + initializeReadTransformers(); + + GATKSAMRecord workingRead = (GATKSAMRecord)readIn; + + // run on output read transformers + for ( final ReadTransformer transform : onOutputReadTransformers ) + workingRead = transform.apply(workingRead); + + writeStarted = true; + outputTracker.getStorage(this).addAlignment(workingRead); + } + + /** + * @{inheritDoc} + */ + public void close() { + outputTracker.getStorage(this).close(); + } + + /** + * @throws java.lang.UnsupportedOperationException No progress logging in this implementation. 
+ */ + @Override + public void setProgressLogger(final ProgressLoggerInterface logger) { + throw new UnsupportedOperationException("Progress logging not supported"); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/Stub.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/Stub.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/Stub.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/Stub.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java new file mode 100644 index 000000000..68163850d --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java @@ -0,0 +1,138 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.stubs; + +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.io.File; +import java.io.OutputStream; +import java.lang.reflect.Type; +import java.util.Collection; + +/** + * Injects new command-line arguments into the system providing support for the genotype writer. + * + * @author mhanna + * @version 0.1 + */ +public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { + + /** + * The engine into which output stubs should be fed. + */ + private final GenomeAnalysisEngine engine; + + /** + * The default location to which data should be written if the user specifies no such location. + */ + private final OutputStream defaultOutputStream; + + /** + * The sources into which arguments were injected. + */ + private final Collection argumentSources; + + /** + * Create a new GenotypeWriter argument, notifying the given engine when that argument has been created. + * @param engine the engine to be notified. + * @param defaultOutputStream the default output stream to be written to if nothing else is specified. + * @param argumentSources sources from which command-line arguments should be derived. + */ + public VCFWriterArgumentTypeDescriptor(GenomeAnalysisEngine engine, OutputStream defaultOutputStream, Collection argumentSources) { + this.engine = engine; + this.defaultOutputStream = defaultOutputStream; + this.argumentSources = argumentSources; + } + + /** + * Reports whether this ArgumentTypeDescriptor supports the given type. 
+ * @param type The type to check. + * @return True if the argument is a GenotypeWriter. + */ + @Override + public boolean supports( Class type ) { + return VariantContextWriter.class.equals(type); + } + + /** + * This command-line argument descriptor does want to override the provided default value. + * @return true always. + */ + @Override + public boolean createsTypeDefault(ArgumentSource source) { + return !source.isRequired() && source.defaultsToStdout(); + } + + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "stdout"; + } + + @Override + public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { + if(source.isRequired() || !source.defaultsToStdout()) + throw new ReviewedGATKException("BUG: tried to create type default for argument type descriptor that can't support a type default."); + VariantContextWriterStub stub = new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); + engine.addOutput(stub); + return stub; + } + + /** + * Convert the given argument matches into a single object suitable for feeding into the ArgumentSource. + * @param source Source for this argument. + * @param type not used + * @param matches Matches that match with this argument. + * @return Transform from the matches into the associated argument. + */ + @Override + public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { + ArgumentDefinition defaultArgumentDefinition = createDefaultArgumentDefinition(source); + // Get the filename for the genotype file, if it exists. If not, we'll need to send output to out. + ArgumentMatchValue writerFileName = getArgumentValue(defaultArgumentDefinition,matches); + File writerFile = writerFileName != null ? 
writerFileName.asFile() : null; + + // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object; + // therefore, the user must have failed to specify a type default + if(writerFile == null && source.isRequired()) + throw new MissingArgumentValueException(defaultArgumentDefinition); + + // Create a stub for the given object. + final VariantContextWriterStub stub = (writerFile != null) + ? new VariantContextWriterStub(engine, writerFile, argumentSources) + : new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); + + stub.setCompressed(isCompressed(writerFileName == null ? null: writerFileName.asString())); + + // WARNING: Side effects required by engine! + parsingEngine.addTags(stub,getArgumentTags(matches)); + engine.addOutput(stub); + + return stub; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java new file mode 100644 index 000000000..c9202b03c --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java @@ -0,0 +1,301 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.stubs; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.tribble.index.IndexCreator; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.engine.io.OutputTracker; +import org.broadinstitute.gatk.engine.GATKVCFUtils; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.Options; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; + +import java.io.File; +import java.io.OutputStream; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collection; +import java.util.EnumSet; +import java.util.List; + +/** + * A stub for routing and management of genotype reading and writing. + * + * @author ebanks + * @version 0.1 + */ +public class VariantContextWriterStub implements Stub, VariantContextWriter { + public final static boolean UPDATE_CONTIG_HEADERS = true; + + /** + * The engine, central to the GATK's processing. + */ + private final GenomeAnalysisEngine engine; + + /** + * The file that this stub should write to. Should be mutually + * exclusive with genotypeStream. + */ + private final File genotypeFile; + + /** + * The output stream to which stub data should be written. 
Will be + * mutually exclusive with genotypeFile. + */ + private final PrintStream genotypeStream; + + /** + * A hack: push the argument sources into the VCF header so that the VCF header + * can rebuild the command-line arguments. + */ + private final Collection argumentSources; + + /** + * Which IndexCreator to use + */ + private final IndexCreator indexCreator; + + /** + * The cached VCF header (initialized to null) + */ + private VCFHeader vcfHeader = null; + + /** + * Should we emit a compressed output stream? + */ + private boolean isCompressed = false; + + /** + * Should the header be written out? A hidden argument. + */ + private boolean skipWritingCommandLineHeader = false; + + /** + * Should we not write genotypes even when provided? + */ + private boolean doNotWriteGenotypes = false; + + /** + * Should we force BCF writing regardless of the file extension? + */ + private boolean forceBCF = false; + + /** + * Should we write all of the fields in the FORMAT field, even if missing fields could be trimmed? + */ + private boolean writeFullFormatField = false; + + /** + * Connects this stub with an external stream capable of serving the + * requests of the consumer of this stub. + */ + protected OutputTracker outputTracker = null; + + /** + * Create a new stub given the requested file. + * + * @param engine engine. + * @param genotypeFile file to (ultimately) create. + * @param argumentSources sources. + */ + public VariantContextWriterStub(GenomeAnalysisEngine engine, File genotypeFile, Collection argumentSources) { + this.engine = engine; + this.genotypeFile = genotypeFile; + this.genotypeStream = null; + this.indexCreator = GATKVCFUtils.getIndexCreator(engine.getArguments().variant_index_type, engine.getArguments().variant_index_parameter, genotypeFile); + this.argumentSources = argumentSources; + } + + /** + * Create a new stub given the requested file. + * + * @param engine engine. + * @param genotypeStream stream to (ultimately) write. 
+ * @param argumentSources sources. + */ + public VariantContextWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, Collection argumentSources) { + this.engine = engine; + this.genotypeFile = null; + this.genotypeStream = new PrintStream(genotypeStream); + this.indexCreator = null; + this.argumentSources = argumentSources; + } + + /** + * Retrieves the file to (ultimately) be created. + * @return The file. Can be null if genotypeStream is not. + */ + public File getOutputFile() { + return genotypeFile; + } + + /** + * Retrieves the output stream to which to (ultimately) write. + * @return The file. Can be null if genotypeFile is not. + */ + public OutputStream getOutputStream() { + return genotypeStream; + } + + public boolean isCompressed() { + return isCompressed; + } + + public void setCompressed(final boolean compressed) { + isCompressed = compressed; + } + + public void setSkipWritingCommandLineHeader(final boolean skipWritingCommandLineHeader) { + this.skipWritingCommandLineHeader = skipWritingCommandLineHeader; + } + + public void setDoNotWriteGenotypes(final boolean doNotWriteGenotypes) { + this.doNotWriteGenotypes = doNotWriteGenotypes; + } + + public void setForceBCF(final boolean forceBCF) { + this.forceBCF = forceBCF; + } + + public void setWriteFullFormatField(final boolean writeFullFormatField) { + this.writeFullFormatField = writeFullFormatField; + } + + public IndexCreator getIndexCreator() { + return indexCreator; + } + + /** + * Gets the master sequence dictionary from the engine associated with this stub + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return the master sequence dictionary from the engine associated with this stub + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return engine.getMasterSequenceDictionary(); + } + + public EnumSet getWriterOptions() { + return getWriterOptions(false); + } + + public EnumSet getWriterOptions(boolean indexOnTheFly) { + final List options = new 
ArrayList<>(); + + if ( doNotWriteGenotypes ) options.add(Options.DO_NOT_WRITE_GENOTYPES); + if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER); + if ( indexOnTheFly) options.add(Options.INDEX_ON_THE_FLY); + if ( writeFullFormatField ) options.add(Options.WRITE_FULL_FORMAT_FIELD); + + if ( forceBCF || (getOutputFile() != null && VariantContextWriterFactory.isBCFOutput(getOutputFile())) ) + options.add(Options.FORCE_BCF); + + return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options); + } + + /** + * Retrieves the header to use when creating the new file. + * @return header to use when creating the new file. + */ + public VCFHeader getVCFHeader() { + return vcfHeader; + } + + /** + * Registers the given streamConnector with this stub. + * @param outputTracker The connector used to provide an appropriate stream. + */ + public void register( OutputTracker outputTracker ) { + this.outputTracker = outputTracker; + } + + @Override + public void processArguments( final GATKArgumentCollection argumentCollection ) { + setDoNotWriteGenotypes(argumentCollection.sitesOnlyVCF); + setSkipWritingCommandLineHeader(argumentCollection.disableCommandLineInVCF); + setForceBCF(argumentCollection.forceBCFOutput); + setWriteFullFormatField(argumentCollection.neverTrimVCFFormatField); + } + + public void writeHeader(VCFHeader header) { + vcfHeader = header; + + if ( header.isWriteEngineHeaders() ) { + // skip writing the command line header if requested + if ( ! 
skipWritingCommandLineHeader && header.isWriteCommandLine() ) { + // Always add the header line, as the current format allows multiple entries + final VCFHeaderLine commandLineArgHeaderLine = GATKVCFUtils.getCommandLineArgumentHeaderLine(engine, argumentSources); + vcfHeader.addMetaDataLine(commandLineArgHeaderLine); + } + + if ( UPDATE_CONTIG_HEADERS ) + vcfHeader = GATKVCFUtils.withUpdatedContigs(vcfHeader, engine); + } + + outputTracker.getStorage(this).writeHeader(vcfHeader); + } + + /** + * @{inheritDoc} + */ + public void add(VariantContext vc) { + outputTracker.getStorage(this).add(vc); + } + + /** + * @{inheritDoc} + */ + public void close() { + outputTracker.getStorage(this).close(); + } + + /** + * Gets a string representation of this object. + * @return a string representation of this object. + */ + @Override + public String toString() { + return getClass().getName(); + } + + /** + * Should we also write a BCF file alongside our VCF file for testing + * + * TODO -- remove me when argument generateShadowBCF is removed + * + * @return + */ + public boolean alsoWriteBCFForTest() { + return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded + ! 
isCompressed() && // for non-compressed outputs + getOutputFile() != null && // that are going to disk + engine.getArguments().generateShadowBCF; // and we actually want to do it + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java new file mode 100644 index 000000000..ecce811f9 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java @@ -0,0 +1,160 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.MergingSamRecordIterator; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +import java.util.Iterator; + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 14, 2009 + *

+ * Class BoundedReadIterator + *

+ * This class implements a read iterator that is bounded by the number of reads + * it will produce over the iteration. + */ +public class BoundedReadIterator implements GATKSAMIterator { + + // the genome loc we're bounding + final private long readCount; + private long currentCount = 0; + + // the iterator we want to decorate + private final GATKSAMIterator iterator; + + // our unmapped read flag + private boolean doNotUseThatUnmappedReadPile = false; + + /** + * The next read that we've buffered. Null indicates that there's + * nothing in the buffer (not that there isn't a next read). + */ + private SAMRecord record = null; + + /** + * constructor + * @param iter + * @param readCount + */ + public BoundedReadIterator(GATKSAMIterator iter, long readCount) { + this.iterator = iter; + this.readCount = readCount; + } + + public void useUnmappedReads(boolean useThem) { + this.doNotUseThatUnmappedReadPile = useThem; + } + + public SAMFileHeader getHeader() { + // todo: this is bad, we need an iterface out there for samrecords that supports getting the header, + // regardless of the merging + if (iterator instanceof MergingSamRecordIterator) + return ((MergingSamRecordIterator)iterator).getMergedHeader(); + else + return null; + } + + /** + * Do we have a next? 
If the iterator has a read and we're not over the read + * count, then yes + * @return + */ + public boolean hasNext() { + if( record != null ) + return true; + + if (iterator.hasNext() && currentCount < readCount) { + record = iterator.next(); + ++currentCount; + if (record.getAlignmentStart() == 0 && doNotUseThatUnmappedReadPile) { + return false; + } + return true; + } else { + return false; + } + } + + /** + * get the next SAMRecord + * @return SAMRecord representing the next read + */ + public SAMRecord next() { + SAMRecord cached = record; + record = null; + return cached; + } + + /** + * this is unsupported on SAMRecord iterators + */ + public void remove() { + throw new UnsupportedOperationException("You cannot use an iterator to remove a SAMRecord"); + } + + /** + * close the iterator + */ + public void close() { + iterator.close(); + } + + public Iterator iterator() { + return this; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GenomeLocusIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/GenomeLocusIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GenomeLocusIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/GenomeLocusIterator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/IterableIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/IterableIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/IterableIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/IterableIterator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/MalformedBAMErrorReformatingIterator.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/MalformedBAMErrorReformatingIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/MalformedBAMErrorReformatingIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/MalformedBAMErrorReformatingIterator.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityReadTransformer.java new file mode 100644 index 000000000..ea2e081c7 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityReadTransformer.java @@ -0,0 +1,94 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.iterators; + +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +/** + * Checks for and errors out (or fixes if requested) when it detects reads with base qualities that are not encoded with + * phred-scaled quality scores. Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at + * Q64. The idea here is simple: if we are asked to fix the scores then we just subtract 31 from every quality score. + * Otherwise, we randomly sample reads (for efficiency) and error out if we encounter a qual that's too high. + */ +public class MisencodedBaseQualityReadTransformer extends ReadTransformer { + + private static final int samplingFrequency = 1000; // sample 1 read for every 1000 encountered + private static final int encodingFixValue = 31; // Illumina_64 - PHRED_33 + + private boolean disabled; + private boolean fixQuals; + protected static int currentReadCounter = 0; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + fixQuals = engine.getArguments().FIX_MISENCODED_QUALS; + disabled = !fixQuals && engine.getArguments().ALLOW_POTENTIALLY_MISENCODED_QUALS; + + return ReadTransformer.ApplicationTime.ON_INPUT; + } + + @Override + public boolean enabled() { + return !disabled; + } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { + if ( fixQuals ) + return fixMisencodedQuals(read); + + checkForMisencodedQuals(read); + return read; + } + + protected static GATKSAMRecord fixMisencodedQuals(final GATKSAMRecord read) { + final byte[] quals = read.getBaseQualities(); + for ( int i = 0; i < quals.length; i++ ) { + quals[i] -= encodingFixValue; + if ( quals[i] < 0 ) + throw new 
UserException.BadInput("while fixing mis-encoded base qualities we encountered a read that was correctly encoded; we cannot handle such a mixture of reads so unfortunately the BAM must be fixed with some other tool"); + } + read.setBaseQualities(quals); + return read; + } + + protected static void checkForMisencodedQuals(final GATKSAMRecord read) { + // sample reads randomly for checking + if ( ++currentReadCounter >= samplingFrequency ) { + currentReadCounter = 0; + + final byte[] quals = read.getBaseQualities(); + for ( final byte qual : quals ) { + if ( qual > QualityUtils.MAX_REASONABLE_Q_SCORE ) + throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score of " + (int)qual); + } + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java new file mode 100644 index 000000000..ca53fcf1d --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java @@ -0,0 +1,58 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +import java.util.Iterator; +import java.util.NoSuchElementException; +/** + * User: hanna + * Date: May 19, 2009 + * Time: 6:47:16 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * A placeholder for an iterator with no data. + */ +public class NullSAMIterator implements GATKSAMIterator { + public NullSAMIterator() {} + + public Iterator iterator() { return this; } + public void close() { /* NO-OP */ } + + public boolean hasNext() { return false; } + public SAMRecord next() { throw new NoSuchElementException("No next element is available."); } + public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKSAMIterator"); } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PeekingIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/PeekingIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PeekingIterator.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/PeekingIterator.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java new file mode 100644 index 000000000..a79d592f7 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java @@ -0,0 +1,106 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.CloseableIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +/** + * Iterates through a list of elements, tracking the number of elements it has seen. + * @author hanna + * @version 0.1 + */ +public class PositionTrackingIterator implements GATKSAMIterator { + /** + * The iterator being tracked. + */ + private CloseableIterator iterator; + + /** + * Current position within the tracked iterator. 
+ */ + private long position; + + /** + * Retrieves the current position of the iterator. The 'current position' of the iterator is defined as + * the coordinate of the read that will be returned if next() is called. + * @return The current position of the iterator. + */ + public long getPosition() { + return position; + } + + /** + * Create a new iterator wrapping the given position, assuming that the reader is position reads + * into the sequence. + * @param iterator Iterator to wraps. + * @param position Non-negative position where the iterator currently sits. + */ + public PositionTrackingIterator(CloseableIterator iterator, long position ) { + this.iterator = iterator; + this.position = position; + } + + /** + * {@inheritDoc} + */ + public boolean hasNext() { + return iterator.hasNext(); + } + + /** + * Try to get the next read in the list. If a next read is available, increment the position. + * @return next read in the list, if available. + */ + public SAMRecord next() { + try { + return iterator.next(); + } + finally { + position++; + } + } + + /** + * {@inheritDoc} + */ + public GATKSAMIterator iterator() { + return this; + } + + /** + * {@inheritDoc} + */ + public void close() { + iterator.close(); + } + + /** + * {@inheritDoc} + */ + public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKSAMIterator"); } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/RNAReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/RNAReadTransformer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/RNAReadTransformer.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/RNAReadTransformer.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java new file mode 100644 index 000000000..7a3ca935f --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java @@ -0,0 +1,141 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.sam.AlignmentUtils; + +/** + * An iterator which does post-processing of a read, including potentially wrapping + * the read in something with a compatible interface or replacing the read entirely. + * + * @author mhanna + * @version 0.1 + */ +public class ReadFormattingIterator implements GATKSAMIterator { + /** + * Logger. 
+ */ + final protected static Logger logger = Logger.getLogger(ReadFormattingIterator.class); + + /** + * Iterator to which to pass + */ + private GATKSAMIterator wrappedIterator; + + /** + * True if original base qualities should be used. + */ + private final boolean useOriginalBaseQualities; + + /** + * Positive if there is a default Base Quality value to fill in the reads with. + */ + private final byte defaultBaseQualities; + + + /** + * Decorate the given iterator inside a ReadWrappingIterator. + * @param wrappedIterator iterator + * @param useOriginalBaseQualities true if original base qualities should be used + * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. + */ + public ReadFormattingIterator(GATKSAMIterator wrappedIterator, boolean useOriginalBaseQualities, byte defaultBaseQualities) { + this.wrappedIterator = wrappedIterator; + this.useOriginalBaseQualities = useOriginalBaseQualities; + this.defaultBaseQualities = defaultBaseQualities; + + } + + /** + * Convenience function for use in foreach loops. Dangerous because it does not actually + * reset the iterator. + * @return An iterator through the current data stream. + */ + public GATKSAMIterator iterator() { + // NOTE: this iterator doesn't perform any kind of reset operation; it just returns itself. + // can we do something better? Do we really have to provide support for the Iterable interface? + return this; + } + + /** + * Close this iterator. + */ + public void close() { + wrappedIterator.close(); + } + + /** + * Does the iterator contain more values? + * @return True if there are more left to return, false otherwise. + */ + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + /** + * Get the next value in the sequence. + * @return Next value in the sequence. By convention, a NoSuchElementException should be thrown if + * no next exists. 
+ */ + public SAMRecord next() { + SAMRecord rec = wrappedIterator.next(); + + // Always consolidate the cigar string into canonical form, collapsing zero-length / repeated cigar elements. + // Downstream code (like LocusIteratorByState) cannot necessarily handle non-consolidated cigar strings. + rec.setCigar(AlignmentUtils.consolidateCigar(rec.getCigar())); + + // if we are using default quals, check if we need them, and add if necessary. + // 1. we need if reads are lacking or have incomplete quality scores + // 2. we add if defaultBaseQualities has a positive value + if (defaultBaseQualities >= 0) { + byte reads [] = rec.getReadBases(); + byte quals [] = rec.getBaseQualities(); + if (quals == null || quals.length < reads.length) { + byte new_quals [] = new byte [reads.length]; + for (int i=0; i iterator() { return this; } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIterator.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIterator.java new file mode 100644 index 000000000..8721779bf --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIterator.java @@ -0,0 +1,90 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +import java.util.Iterator; + +/** + * Verifies that the incoming stream of reads is correctly sorted + */ +public class VerifyingSamIterator implements GATKSAMIterator { + GATKSAMIterator it; + SAMRecord last = null; + boolean checkOrderP = true; + + public VerifyingSamIterator(GATKSAMIterator it) { + this.it = it; + } + + public boolean hasNext() { return this.it.hasNext(); } + public SAMRecord next() { + + SAMRecord cur = it.next(); + if ( last != null ) + verifyRecord(last, cur); + if ( ! 
cur.getReadUnmappedFlag() ) + last = cur; + return cur; + } + + private void verifyRecord( final SAMRecord last, final SAMRecord cur ) { + if ( checkOrderP && isOutOfOrder(last, cur) ) { + this.last = null; + throw new UserException.MissortedBAM(String.format("reads are out of order:%nlast:%n%s%ncurrent:%n%s%n", last.format(), cur.format()) ); + } + } + + private boolean isOutOfOrder( final SAMRecord last, final SAMRecord cur ) { + if ( last == null || cur.getReadUnmappedFlag() ) + return false; + else { + if(last.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || last.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) + throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",last.format())); + if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START) + throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format())); + + return (last.getReferenceIndex() > cur.getReferenceIndex()) || + (last.getReferenceIndex().equals(cur.getReferenceIndex()) && + last.getAlignmentStart() > cur.getAlignmentStart()); + } + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + public void close() { + it.close(); + } + + public Iterator iterator() { + return this; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/iterators/package-info.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java new file mode 100644 index 000000000..3bd174442 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java @@ -0,0 +1,786 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.phonehome; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.engine.crypt.CryptUtils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.io.IOUtils; +import org.broadinstitute.gatk.utils.io.Resource; +import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; +import org.jets3t.service.S3Service; +import org.jets3t.service.S3ServiceException; +import org.jets3t.service.impl.rest.httpclient.RestS3Service; +import org.jets3t.service.model.S3Object; +import org.jets3t.service.security.AWSCredentials; +import org.simpleframework.xml.Element; +import org.simpleframework.xml.Serializer; +import org.simpleframework.xml.core.Persister; + +import java.io.*; +import java.security.NoSuchAlgorithmException; +import java.security.PublicKey; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.Arrays; +import java.util.Date; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + + +/** + * A detailed description of a GATK run, and error if applicable. Simply create a GATKRunReport + * with the constructor, providing the walker that was run and the fully instantiated GenomeAnalysisEngine + * after the run finishes and the GATKRunReport will collect all of the report information + * into this object. Call postReport to write out the report, as an XML document, to either STDOUT, + * a file (in which case the output is gzipped), or with no arguments the report will be posted to the + * GATK run report database. 
+ * + * @author depristo + * @since 2010 + */ +public class GATKRunReport { + protected static final String REPORT_BUCKET_NAME = "broad.gsa.gatk.run.reports"; + protected static final String TEST_REPORT_BUCKET_NAME = "broad.gsa.gatk.run.reports.test"; + protected final static String AWS_ACCESS_KEY_MD5 = "34d4a26eb2062b3f06e833b28f9a38c6"; + protected final static String AWS_SECRET_KEY_MD5 = "83f2332eec99ef1d7425d5dc5d4b514a"; + + private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy/MM/dd HH.mm.ss"); + + /** + * our log + */ + protected static final Logger logger = Logger.getLogger(GATKRunReport.class); + + /** + * Default value for the number of milliseconds before an S3 put operation is timed-out. + * Can be overridden via a constructor argument. + */ + private static final long S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS = 30 * 1000; + + /** + * Number of milliseconds before an S3 put operation is timed-out. + */ + private long s3PutTimeOutInMilliseconds = S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS; + + // ----------------------------------------------------------------- + // elements captured for the report + // ----------------------------------------------------------------- + + @Element(required = false, name = "id") + private String id; + + @Element(required = false, name = "exception") + private GATKRunReportException mException; + + @Element(required = true, name = "start-time") + private String startTime = "ND"; + + @Element(required = true, name = "end-time") + private String endTime; + + @Element(required = true, name = "run-time") + private long runTime = 0; + + @Element(required = true, name = "walker-name") + private String walkerName; + + @Element(required = true, name = "svn-version") + private String svnVersion; + + @Element(required = true, name = "total-memory") + private long totalMemory; + + @Element(required = true, name = "max-memory") + private long maxMemory; + + @Element(required = true, name = "user-name") + private String 
userName; + + @Element(required = true, name = "host-name") + private String hostName; + + @Element(required = true, name = "java") + private String javaVersion; + + @Element(required = true, name = "machine") + private String machine; + + @Element(required = true, name = "iterations") + private long nIterations; + + @Element(required = true, name = "tag") + private String tag; + + @Element(required = true, name = "num-threads") + private int numThreads; + @Element(required = true, name = "percent-time-running") + private String percentTimeRunning; + @Element(required = true, name = "percent-time-waiting") + private String percentTimeWaiting; + @Element(required = true, name = "percent-time-blocking") + private String percentTimeBlocking; + @Element(required = true, name = "percent-time-waiting-for-io") + private String percentTimeWaitingForIO; + + /** The error message, if one occurred, or null if none did */ + public String errorMessage = null; + /** The error that occurred, if one did, or null if none did */ + public Throwable errorThrown = null; + + /** + * How should the GATK report its usage? + */ + public enum PhoneHomeOption { + /** Disable phone home */ + NO_ET, + /** Forces the report to go to S3 */ + AWS, + /** Force output to STDOUT. 
For debugging only */ + STDOUT + } + + /** + * To allow us to deserial reports from XML + */ + private GATKRunReport() { } + + /** + * Read a GATKRunReport from the serialized XML representation in String reportAsXML + * @param stream an input stream containing a serialized XML report + * @return a reconstituted GATKRunReport from reportAsXML + * @throws Exception if parsing fails for any reason + */ + @Ensures("result != null") + protected static GATKRunReport deserializeReport(final InputStream stream) throws Exception { + final Serializer serializer = new Persister(); + return serializer.read(GATKRunReport.class, stream); + } + + /** + * Create a new GATKRunReport from a report on S3 + * + * Assumes that s3Object has already been written to S3, and this function merely + * fetches it from S3 and deserializes it. The access keys must have permission to + * GetObject from S3. + * + * @param downloaderAccessKey AWS access key with permission to GetObject from bucketName + * @param downloaderSecretKey AWS secret key with permission to GetObject from bucketName + * @param bucketName the name of the bucket holding the report + * @param s3Object the s3Object we wrote to S3 in bucketName that we want to get back and decode + * @return a deserialized report derived from s3://bucketName/s3Object.getName() + * @throws Exception + */ + @Ensures("result != null") + protected static GATKRunReport deserializeReport(final String downloaderAccessKey, + final String downloaderSecretKey, + final String bucketName, + final S3Object s3Object) throws Exception { + final S3Service s3Service = initializeAWSService(downloaderAccessKey, downloaderSecretKey); + + // Retrieve the whole data object we created previously + final S3Object objectComplete = s3Service.getObject(bucketName, s3Object.getName()); + + // Read the data from the object's DataInputStream using a loop, and print it out. 
+ return deserializeReport(new GZIPInputStream(objectComplete.getDataInputStream())); + } + + /** + * Create a new RunReport and population all of the fields with values from the walker and engine. + * Allows the S3 put timeout to be explicitly set. + * + * @param walker the GATK walker that we ran + * @param e the exception caused by running this walker, or null if we completed successfully + * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc + * @param type the GATK phone home setting + * @param s3PutTimeOutInMilliseconds number of milliseconds to wait before timing out an S3 put operation + */ + public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type, + final long s3PutTimeOutInMilliseconds) { + this(walker, e, engine, type); + this.s3PutTimeOutInMilliseconds = s3PutTimeOutInMilliseconds; + } + + /** + * Create a new RunReport and population all of the fields with values from the walker and engine. + * Leaves the S3 put timeout set to the default value of S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS. + * + * @param walker the GATK walker that we ran + * @param e the exception caused by running this walker, or null if we completed successfully + * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc + * @param type the GATK phone home setting + */ + public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type) { + if ( type == PhoneHomeOption.NO_ET ) + throw new ReviewedGATKException("Trying to create a run report when type is NO_ET!"); + + logger.debug("Aggregating data for run report"); + + // what did we run? 
+ id = org.apache.commons.lang.RandomStringUtils.randomAlphanumeric(32); + walkerName = engine.getWalkerName(walker.getClass()); + svnVersion = CommandLineGATK.getVersionNumber(); + + // runtime performance metrics + Date end = new java.util.Date(); + endTime = DATE_FORMAT.format(end); + if ( engine.getStartTime() != null ) { // made it this far during initialization + startTime = DATE_FORMAT.format(engine.getStartTime()); + runTime = (end.getTime() - engine.getStartTime().getTime()) / 1000L; // difference in seconds + } + + // deal with memory usage + Runtime.getRuntime().gc(); // call GC so totalMemory is ~ used memory + maxMemory = Runtime.getRuntime().maxMemory(); + totalMemory = Runtime.getRuntime().totalMemory(); + + // we can only do some operations if an error hasn't occurred + if ( engine.getCumulativeMetrics() != null ) { + // it's possible we aborted so early that these data structures arent initialized + nIterations = engine.getCumulativeMetrics().getNumIterations(); + } + + tag = engine.getArguments().tag; + + // user and hostname -- information about the runner of the GATK + userName = System.getProperty("user.name"); + hostName = Utils.resolveHostname(); + + // basic java information + javaVersion = Utils.join("-", Arrays.asList(System.getProperty("java.vendor"), System.getProperty("java.version"))); + machine = Utils.join("-", Arrays.asList(System.getProperty("os.name"), System.getProperty("os.arch"))); + + // if there was an exception, capture it + this.mException = e == null ? 
null : new GATKRunReportException(e); + + numThreads = engine.getTotalNumberOfThreads(); + percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU); + percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING); + percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING); + percentTimeWaitingForIO = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING_FOR_IO); + } + + /** + * Get the random alpha-numeric ID of this GATKRunReport + * @return a non-null string ID + */ + @Ensures("result != null") + public String getID() { + return id; + } + + /** + * Return a string representing the percent of time the GATK spent in state, if possible. Otherwise return NA + * + * @param engine the GATK engine whose threading efficiency info we will use + * @param state the state whose occupancy we wish to know + * @return a string representation of the percent occupancy of state, or NA is not possible + */ + @Requires({"engine != null", "state != null"}) + @Ensures("result != null") + private String getThreadEfficiencyPercent(final GenomeAnalysisEngine engine, final ThreadEfficiencyMonitor.State state) { + final ThreadEfficiencyMonitor tem = engine.getThreadEfficiencyMonitor(); + return tem == null ? 
"NA" : String.format("%.2f", tem.getStatePercent(state)); + } + + /** + * Get a filename (no path) appropriate for this report + * + * @return a non-null string filename + */ + @Ensures("result != null") + protected String getReportFileName() { + return getID() + ".report.xml.gz"; + } + + // --------------------------------------------------------------------------- + // + // Main public interface method for posting reports + // + // --------------------------------------------------------------------------- + + /** + * Post this GATK report to the destination implied by the PhoneHomeOption type + * + * Guaranteed to never throw an exception (exception noted below) and to return + * with a reasonable (~10 seconds) time regardless of successful writing of the report. + * + * @throws IllegalArgumentException if type == null + * @param type the type of phoning home we want to do + * @return true if a report was successfully written, false otherwise + */ + public boolean postReport(final PhoneHomeOption type) { + if ( type == null ) throw new IllegalArgumentException("type cannot be null"); + + logger.debug("Posting report of type " + type); + switch (type) { + case NO_ET: // don't do anything + return false; + case AWS: + wentToAWS = true; + return postReportToAWSS3() != null; + case STDOUT: + return postReportToStream(System.out); + default: + exceptDuringRunReport("BUG: unexpected PhoneHomeOption "); + return false; + } + } + + // --------------------------------------------------------------------------- + // + // Code for sending reports to local files + // + // --------------------------------------------------------------------------- + + /** + * Write an XML representation of this report to the stream, throwing a GATKException if the marshalling + * fails for any reason. 
+ * + * @param stream an output stream to write the report to + */ + @Requires("stream != null") + protected boolean postReportToStream(final OutputStream stream) { + final Serializer serializer = new Persister(); + try { + serializer.write(this, stream); + return true; + } catch (Exception e) { + return false; + } + } + + // --------------------------------------------------------------------------- + // + // Code for sending reports to s3 + // + // --------------------------------------------------------------------------- + + /** + * Get the name of the S3 bucket where we should upload this report + * + * @return the string name of the s3 bucket + */ + @Ensures("result != null") + protected String getS3ReportBucket() { + return s3ReportBucket; + } + + /** + * Decrypts encrypted AWS key from encryptedKeySource + * @param encryptedKeySource a file containing an encrypted AWS key + * @return a decrypted AWS key as a String + */ + @Ensures("result != null") + public static String decryptAWSKey(final File encryptedKeySource) throws FileNotFoundException { + if ( encryptedKeySource == null ) throw new IllegalArgumentException("encryptedKeySource cannot be null"); + return decryptAWSKey(new FileInputStream(encryptedKeySource)); + } + + /** + * @see #decryptAWSKey(java.io.File) but with input from an inputstream + */ + @Requires("encryptedKeySource != null") + @Ensures("result != null") + private static String decryptAWSKey(final InputStream encryptedKeySource) { + final PublicKey key = CryptUtils.loadGATKDistributedPublicKey(); + final byte[] fromDisk = IOUtils.readStreamIntoByteArray(encryptedKeySource); + final byte[] decrypted = CryptUtils.decryptData(fromDisk, key); + return new String(decrypted); + } + + /** + * Get the decrypted AWS key sorted in the resource directories of name + * @param name the name of the file containing the needed AWS key + * @return a non-null GATK + */ + @Requires("name != null") + @Ensures("result != null") + private static String 
getAWSKey(final String name) { + final Resource resource = new Resource(name, GATKRunReport.class); + return decryptAWSKey(resource.getResourceContentsAsStream()); + } + + /** + * Get the AWS access key for the GATK user + * @return a non-null AWS access key for the GATK user + */ + @Ensures("result != null") + protected static String getAWSUploadAccessKey() { + return getAWSKey("resources/GATK_AWS_access.key"); + } + + /** + * Get the AWS secret key for the GATK user + * @return a non-null AWS secret key for the GATK user + */ + @Ensures("result != null") + protected static String getAWSUploadSecretKey() { + return getAWSKey("resources/GATK_AWS_secret.key"); + } + + /** + * Check that the AWS keys can be decrypted and are what we expect them to be + * + * @throws ReviewedGATKException if anything goes wrong + */ + public static void checkAWSAreValid() { + try { + final String accessKeyMD5 = Utils.calcMD5(getAWSUploadAccessKey()); + final String secretKeyMD5 = Utils.calcMD5(getAWSUploadSecretKey()); + + if ( ! AWS_ACCESS_KEY_MD5.equals(accessKeyMD5) ) { + throw new ReviewedGATKException("Invalid AWS access key found, expected MD5 " + AWS_ACCESS_KEY_MD5 + " but got " + accessKeyMD5); + } + if ( ! 
AWS_SECRET_KEY_MD5.equals(secretKeyMD5) ) { + throw new ReviewedGATKException("Invalid AWS secret key found, expected MD5 " + AWS_SECRET_KEY_MD5 + " but got " + secretKeyMD5); + } + + } catch ( Exception e ) { + throw new ReviewedGATKException("Couldn't decrypt AWS keys, something is wrong with the GATK distribution"); + } + } + + /** + * Get an initialized S3Service for use in communicating with AWS/s3 + * + * @param awsAccessKey our AWS access key to use + * @param awsSecretKey our AWS secret key to use + * @return an initialized S3Service object that can be immediately used to interact with S3 + * @throws S3ServiceException + */ + @Requires({"awsAccessKey != null", "awsSecretKey != null"}) + @Ensures("result != null") + protected static S3Service initializeAWSService(final String awsAccessKey, final String awsSecretKey) throws S3ServiceException { + // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP + // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t. + final AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey); + return new RestS3Service(awsCredentials); + } + + /** + * A runnable that pushes this GATKReport up to s3. + * + * Should be run in a separate thread so we can time it out if something is taking too long + */ + private class S3PutRunnable implements Runnable { + /** Was the upload operation successful? 
*/ + public final AtomicBoolean isSuccess; + /** The name of this report */ + private final String filename; + /** The contents of this report */ + private final byte[] contents; + + /** The s3Object that we created to upload, or null if it failed */ + public S3Object s3Object = null; + + @Requires({"filename != null", "contents != null"}) + public S3PutRunnable(final String filename, final byte[] contents){ + this.isSuccess = new AtomicBoolean(); + this.filename = filename; + this.contents = contents; + } + + public void run() { + try { + switch ( awsMode ) { + case FAIL_WITH_EXCEPTION: + throw new IllegalStateException("We are throwing an exception for testing purposes"); + case TIMEOUT: + try { + Thread.sleep(s3PutTimeOutInMilliseconds * 100); + } catch ( InterruptedException e ) { + // supposed to be empty + } + break; + case NORMAL: + // IAM GATK user credentials -- only right is to PutObject into broad.gsa.gatk.run.reports bucket + final S3Service s3Service = initializeAWSService(getAWSUploadAccessKey(), getAWSUploadSecretKey()); + + // Create an S3Object based on a file, with Content-Length set automatically and + // Content-Type set based on the file's extension (using the Mimetypes utility class) + final S3Object fileObject = new S3Object(filename, contents); + //logger.info("Created S3Object" + fileObject); + //logger.info("Uploading " + localFile + " to AWS bucket"); + s3Object = s3Service.putObject(getS3ReportBucket(), fileObject); + isSuccess.set(true); + break; + default: + throw new IllegalStateException("Unexpected AWS exception"); + } + } catch ( S3ServiceException e ) { + exceptDuringRunReport("S3 exception occurred", e); + } catch ( NoSuchAlgorithmException e ) { + exceptDuringRunReport("Couldn't calculate MD5", e); + } catch ( IOException e ) { + exceptDuringRunReport("Couldn't read report file", e); + } catch ( Exception e ) { + exceptDuringRunReport("An unexpected exception occurred during posting", e); + } + } + } + + /** + * Post this GATK 
report to the AWS s3 GATK_Run_Report log + * + * @return the s3Object pointing to our pushed report, or null if we failed to push + */ + protected S3Object postReportToAWSS3() { + // modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html + this.hostName = Utils.resolveHostname(); // we want to fill in the host name + final String key = getReportFileName(); + logger.debug("Generating GATK report to AWS S3 with key " + key); + + try { + // create an byte output stream so we can capture the output as a byte[] + final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(8096); + final OutputStream outputStream = new GZIPOutputStream(byteStream); + postReportToStream(outputStream); + outputStream.close(); + final byte[] report = byteStream.toByteArray(); + + // stop us from printing the annoying, and meaningless, mime types warning + final Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class); + mimeTypeLogger.setLevel(Level.FATAL); + + // Set the S3 upload on its own thread with timeout: + final S3PutRunnable s3run = new S3PutRunnable(key,report); + final Thread s3thread = new Thread(s3run); + s3thread.setDaemon(true); + s3thread.setName("S3Put-Thread"); + s3thread.start(); + + s3thread.join(s3PutTimeOutInMilliseconds); + + if(s3thread.isAlive()){ + s3thread.interrupt(); + exceptDuringRunReport("Run statistics report upload to AWS S3 timed-out"); + } else if(s3run.isSuccess.get()) { + logger.info("Uploaded run statistics report to AWS S3"); + logger.debug("Uploaded to AWS: " + s3run.s3Object); + return s3run.s3Object; + } else { + // an exception occurred, the thread should have already invoked the exceptDuringRunReport function + } + } catch ( IOException e ) { + exceptDuringRunReport("Couldn't read report file", e); + } catch ( InterruptedException e) { + exceptDuringRunReport("Run statistics report upload interrupted", e); + } + + return null; + } + + // 
--------------------------------------------------------------------------- + // + // Error handling code + // + // --------------------------------------------------------------------------- + + /** + * Note that an exception occurred during creating or writing this report + * @param msg the message to print + * @param e the exception that occurred + */ + @Ensures("exceptionOccurredDuringPost()") + private void exceptDuringRunReport(final String msg, final Throwable e) { + this.errorMessage = msg; + this.errorThrown = e; + logger.debug("A problem occurred during GATK run reporting [*** everything is fine, but no report could be generated; please do not post this to the support forum ***]. Message is: " + msg + ". Error message is: " + e.getMessage()); + } + + /** + * Note that an exception occurred during creating or writing this report + * @param msg the message to print + */ + @Ensures("exceptionOccurredDuringPost()") + private void exceptDuringRunReport(final String msg) { + this.errorMessage = msg; + logger.debug("A problem occurred during GATK run reporting [*** everything is fine, but no report could be generated; please do not post this to the support forum ***]. Message is " + msg); + } + + /** + * Did an error occur during the posting of this run report? + * @return true if so, false if not + */ + public boolean exceptionOccurredDuringPost() { + return getErrorMessage() != null; + } + + /** + * If an error occurred during posting of this report, retrieve the message of the error that occurred, or null if + * no error occurred + * @return a string describing the error that occurred, or null if none did + */ + public String getErrorMessage() { + return errorMessage; + } + + /** + * Get the throwable that caused the exception during posting of this message, or null if none was available + * + * Note that getting a null valuable from this function doesn't not imply that no error occurred. Some + * errors that occurred many not have generated a throwable. 
+ * + * @return the Throwable that caused the error, or null if no error occurred or was not caused by a throwable + */ + public Throwable getErrorThrown() { + return errorThrown; + } + + /** + * Helper method to format the exception that occurred during posting, or a string saying none occurred + * @return a non-null string + */ + @Ensures("result != null") + protected String formatError() { + return exceptionOccurredDuringPost() + ? String.format("Exception message=%s with cause=%s", getErrorMessage(), getErrorThrown()) + : "No exception occurred"; + } + + // --------------------------------------------------------------------------- + // + // Equals and hashcode -- purely for comparing reports for testing + // + // --------------------------------------------------------------------------- + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + GATKRunReport that = (GATKRunReport) o; + + if (maxMemory != that.maxMemory) return false; + if (nIterations != that.nIterations) return false; + if (numThreads != that.numThreads) return false; + if (runTime != that.runTime) return false; + if (totalMemory != that.totalMemory) return false; + if (endTime != null ? !endTime.equals(that.endTime) : that.endTime != null) return false; + if (hostName != null ? !hostName.equals(that.hostName) : that.hostName != null) return false; + if (id != null ? !id.equals(that.id) : that.id != null) return false; + if (javaVersion != null ? !javaVersion.equals(that.javaVersion) : that.javaVersion != null) return false; + if (mException != null ? !mException.equals(that.mException) : that.mException != null) return false; + if (machine != null ? !machine.equals(that.machine) : that.machine != null) return false; + if (percentTimeBlocking != null ? !percentTimeBlocking.equals(that.percentTimeBlocking) : that.percentTimeBlocking != null) + return false; + if (percentTimeRunning != null ? 
!percentTimeRunning.equals(that.percentTimeRunning) : that.percentTimeRunning != null) + return false; + if (percentTimeWaiting != null ? !percentTimeWaiting.equals(that.percentTimeWaiting) : that.percentTimeWaiting != null) + return false; + if (percentTimeWaitingForIO != null ? !percentTimeWaitingForIO.equals(that.percentTimeWaitingForIO) : that.percentTimeWaitingForIO != null) + return false; + if (startTime != null ? !startTime.equals(that.startTime) : that.startTime != null) return false; + if (svnVersion != null ? !svnVersion.equals(that.svnVersion) : that.svnVersion != null) return false; + if (tag != null ? !tag.equals(that.tag) : that.tag != null) return false; + if (userName != null ? !userName.equals(that.userName) : that.userName != null) return false; + if (walkerName != null ? !walkerName.equals(that.walkerName) : that.walkerName != null) return false; + + return true; + } + + @Override + public int hashCode() { + int result = id != null ? id.hashCode() : 0; + result = 31 * result + (mException != null ? mException.hashCode() : 0); + result = 31 * result + (startTime != null ? startTime.hashCode() : 0); + result = 31 * result + (endTime != null ? endTime.hashCode() : 0); + result = 31 * result + (int) (runTime ^ (runTime >>> 32)); + result = 31 * result + (walkerName != null ? walkerName.hashCode() : 0); + result = 31 * result + (svnVersion != null ? svnVersion.hashCode() : 0); + result = 31 * result + (int) (totalMemory ^ (totalMemory >>> 32)); + result = 31 * result + (int) (maxMemory ^ (maxMemory >>> 32)); + result = 31 * result + (userName != null ? userName.hashCode() : 0); + result = 31 * result + (hostName != null ? hostName.hashCode() : 0); + result = 31 * result + (javaVersion != null ? javaVersion.hashCode() : 0); + result = 31 * result + (machine != null ? machine.hashCode() : 0); + result = 31 * result + (int) (nIterations ^ (nIterations >>> 32)); + result = 31 * result + (tag != null ? 
tag.hashCode() : 0); + result = 31 * result + numThreads; + result = 31 * result + (percentTimeRunning != null ? percentTimeRunning.hashCode() : 0); + result = 31 * result + (percentTimeWaiting != null ? percentTimeWaiting.hashCode() : 0); + result = 31 * result + (percentTimeBlocking != null ? percentTimeBlocking.hashCode() : 0); + result = 31 * result + (percentTimeWaitingForIO != null ? percentTimeWaitingForIO.hashCode() : 0); + return result; + } + + // --------------------------------------------------------------------------- + // + // Code specifically for testing the GATKRunReport + // + // --------------------------------------------------------------------------- + + /** + * Enum specifying how the S3 uploader should behave. Must be normal by default. Purely for testing purposes + */ + protected enum AWSMode { + NORMAL, // write normally to AWS + FAIL_WITH_EXCEPTION, // artificially fail during writing + TIMEOUT // sleep, so we time out + } + /** Our AWS mode */ + private AWSMode awsMode = AWSMode.NORMAL; + /** The bucket were we send the GATK report on AWS/s3 */ + private String s3ReportBucket = REPORT_BUCKET_NAME; + /** Did we send the report to AWS? */ + private boolean wentToAWS = false; + + /** + * Send the report to the AWS test bucket -- for testing only + */ + protected void sendAWSToTestBucket() { + s3ReportBucket = TEST_REPORT_BUCKET_NAME; + } + + /** + * Has the report been written to AWS? + * + * Does not imply anything about the success of the send, just that it was attempted + * + * @return true if the report has been sent to AWS, false otherwise + */ + protected boolean wentToAWS() { + return wentToAWS; + } + + /** + * Purely for testing purposes. 
Tells the AWS uploader whether to actually upload or simulate errors + * @param mode what we want to do + */ + @Requires("mode != null") + protected void setAwsMode(final AWSMode mode) { + this.awsMode = mode; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportException.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportException.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportException.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportException.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRArgumentSet.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRArgumentSet.java new file mode 100644 index 000000000..497eafe68 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRArgumentSet.java @@ -0,0 +1,85 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; + +import java.io.File; + +public class BQSRArgumentSet { + // declare public, STL-style for easier and more efficient access: + private File BQSR_RECAL_FILE; + private int quantizationLevels; + private boolean disableIndelQuals; + private boolean emitOriginalQuals; + private int PRESERVE_QSCORES_LESS_THAN; + private double globalQScorePrior; + + public BQSRArgumentSet(final GATKArgumentCollection args) { + this.BQSR_RECAL_FILE = args.BQSR_RECAL_FILE; + this.quantizationLevels = args.quantizationLevels; + this.disableIndelQuals = args.disableIndelQuals; + this.emitOriginalQuals = args.emitOriginalQuals; + this.PRESERVE_QSCORES_LESS_THAN = args.PRESERVE_QSCORES_LESS_THAN; + this.globalQScorePrior = args.globalQScorePrior; + } + + public File getRecalFile() { return BQSR_RECAL_FILE; } + + public int getQuantizationLevels() { return quantizationLevels; } + + public boolean shouldDisableIndelQuals() { return disableIndelQuals; } + + public boolean shouldEmitOriginalQuals() { return emitOriginalQuals; } + + public int getPreserveQscoresLessThan() { return PRESERVE_QSCORES_LESS_THAN; } + + public double getGlobalQScorePrior() { return globalQScorePrior; } + + public void setRecalFile(final File BQSR_RECAL_FILE) { + this.BQSR_RECAL_FILE = BQSR_RECAL_FILE; + } + + public void setQuantizationLevels(final int quantizationLevels) { + this.quantizationLevels = quantizationLevels; + } + + public void setDisableIndelQuals(final boolean disableIndelQuals) { + this.disableIndelQuals = disableIndelQuals; + } + + public void setEmitOriginalQuals(final boolean emitOriginalQuals) { + 
this.emitOriginalQuals = emitOriginalQuals; + } + + public void setPreserveQscoresLessThan(final int PRESERVE_QSCORES_LESS_THAN) { + this.PRESERVE_QSCORES_LESS_THAN = PRESERVE_QSCORES_LESS_THAN; + } + + public void setGlobalQScorePrior(final double globalQScorePrior) { + this.globalQScorePrior = globalQScorePrior; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRMode.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRMode.java new file mode 100644 index 000000000..de6500e19 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/recalibration/BQSRMode.java @@ -0,0 +1,55 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.recalibration; + +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; + +import java.lang.annotation.*; + +/** + * User: hanna + * Date: May 14, 2009 + * Time: 1:51:22 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Allows the walker to indicate what type of data it wants to consume. + */ + +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface BQSRMode { + public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/resourcemanagement/ThreadAllocation.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/resourcemanagement/ThreadAllocation.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/resourcemanagement/ThreadAllocation.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/resourcemanagement/ThreadAllocation.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Affection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Affection.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Affection.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Affection.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Gender.java 
b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Gender.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Gender.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Gender.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/MendelianViolation.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/MendelianViolation.java new file mode 100644 index 000000000..a37eb8d88 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/MendelianViolation.java @@ -0,0 +1,461 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.samples; + +import org.broadinstitute.gatk.engine.samples.Sample; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeType; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.gatk.utils.MathUtils; + +import java.util.*; + +/** + * User: carneiro / lfran + * Date: 3/9/11 + * Time: 12:38 PM + * + * Class for the identification and tracking of mendelian violation. It can be used in 2 distinct ways: + * - Either using an instance of the MendelianViolation class to track mendelian violations for each of the families while + * walking over the variants + * - Or using the static methods to directly get information about mendelian violation in a family at a given locus + * + */ +public class MendelianViolation { + //List of families with violations + private List violationFamilies; + + //Call information + private int nocall = 0; + private int familyCalled = 0; + private int varFamilyCalled = 0; + private int lowQual = 0; + + private boolean allCalledOnly = true; + + //Stores occurrences of inheritance + private EnumMap>> inheritance; + + private int violations_total=0; + + private double minGenotypeQuality; + + private boolean abortOnSampleNotFound; + + //Number of families with genotype information for all members + public int getFamilyCalledCount(){ + return familyCalled; + } + + //Number of families with genotype information for all members + public int getVarFamilyCalledCount(){ + return varFamilyCalled; + } + + //Number of families missing genotypes for one or more of their members + public int getFamilyNoCallCount(){ + return nocall; + } + + //Number of families with genotypes below the set quality threshold + public int getFamilyLowQualsCount(){ + return lowQual; + } + + public int getViolationsCount(){ + return violations_total; + } + + //Count of alt alleles inherited from het parents (no violation) + public int getParentHetInheritedVar(){ + 
return getParentsHetHetInheritedVar() + getParentsRefHetInheritedVar() + getParentsVarHetInheritedVar(); + } + + //Count of ref alleles inherited from het parents (no violation) + public int getParentHetInheritedRef(){ + return getParentsHetHetInheritedRef() + getParentsRefHetInheritedRef() + getParentsVarHetInheritedRef(); + } + + //Count of HomRef/HomRef/HomRef trios + public int getRefRefRef(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF); + } + + //Count of HomVar/HomVar/HomVar trios + public int getVarVarVar(){ + return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR); + } + + //Count of HomRef/HomVar/Het trios + public int getRefVarHet(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HET) + + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HET); + } + + //Count of Het/Het/Het trios + public int getHetHetHet(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET); + } + + //Count of Het/Het/HomRef trios + public int getHetHetHomRef(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF); + } + + //Count of Het/Het/HomVar trios + public int getHetHetHomVar(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR); + } + + //Count of ref alleles inherited from Het/Het parents (no violation) + public int getParentsHetHetInheritedRef(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET) + + 2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF); + //return parentsHetHet_childRef; + } + + //Count of var alleles inherited from Het/Het parents (no violation) + public int getParentsHetHetInheritedVar(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET) + + 
2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR); + //return parentsHetHet_childVar; + } + + //Count of ref alleles inherited from HomRef/Het parents (no violation) + public int getParentsRefHetInheritedRef(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_REF) + + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF); + //return parentsHomRefHet_childRef; + } + + //Count of var alleles inherited from HomRef/Het parents (no violation) + public int getParentsRefHetInheritedVar(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HET) + + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HET); + //return parentsHomRefHet_childVar; + } + + //Count of ref alleles inherited from HomVar/Het parents (no violation) + public int getParentsVarHetInheritedRef(){ + return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HET) + + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HET); + //return parentsHomVarHet_childRef; + } + + //Count of var alleles inherited from HomVar/Het parents (no violation) + public int getParentsVarHetInheritedVar(){ + return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_VAR) + + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR); + //return parentsHomVarHet_childVar; + } + + //Count of violations of the type HOM_REF/HOM_REF -> HOM_VAR + public int getParentsRefRefChildVar(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_REF -> HET + public int getParentsRefRefChildHet(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HET); + } + + //Count of violations of the type HOM_REF/HET -> HOM_VAR + public int 
getParentsRefHetChildVar(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_VAR) + + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_VAR -> HOM_VAR + public int getParentsRefVarChildVar(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR) + + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR); + } + + //Count of violations of the type HOM_REF/HOM_VAR -> HOM_REF + public int getParentsRefVarChildRef(){ + return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF) + + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HET -> HOM_REF + public int getParentsVarHetChildRef(){ + return inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF) + + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HOM_VAR -> HOM_REF + public int getParentsVarVarChildRef(){ + return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF); + } + + //Count of violations of the type HOM_VAR/HOM_VAR -> HET + public int getParentsVarVarChildHet(){ + return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HET); + } + + + //Count of violations of the type HOM_VAR/? -> HOM_REF + public int getParentVarChildRef(){ + return getParentsRefVarChildRef() + getParentsVarHetChildRef() +getParentsVarVarChildRef(); + } + + //Count of violations of the type HOM_REF/? 
-> HOM_VAR + public int getParentRefChildVar(){ + return getParentsRefVarChildVar() + getParentsRefHetChildVar() +getParentsRefRefChildVar(); + } + + //Returns a String containing all trios where a Mendelian violation was observed. + //The String is formatted "mom1+dad1=child1,mom2+dad2=child2,..." + public String getViolationFamiliesString(){ + if(violationFamilies.isEmpty()) + return ""; + + Iterator it = violationFamilies.iterator(); + String violationFams = it.next(); + while(it.hasNext()){ + violationFams += ","+it.next(); + } + return violationFams; + } + + public List getViolationFamilies(){ + return violationFamilies; + } + + static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 }; + static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 }; + + public double getMinGenotypeQuality() { + return minGenotypeQuality; + } + + /** + * Constructor + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + * + */ + public MendelianViolation(double minGenotypeQualityP) { + this(minGenotypeQualityP,true); + } + + /** + * Constructor + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. + */ + public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound) { + minGenotypeQuality = minGenotypeQualityP; + this.abortOnSampleNotFound = abortOnSampleNotFound; + violationFamilies = new ArrayList(); + createInheritanceMap(); + } + + /** + * Constructor + * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation + * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. 
+ * @param completeTriosOnly - whether only complete trios are considered or parent/child pairs are too. + */ + public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound, boolean completeTriosOnly) { + minGenotypeQuality = minGenotypeQualityP; + this.abortOnSampleNotFound = abortOnSampleNotFound; + violationFamilies = new ArrayList(); + createInheritanceMap(); + allCalledOnly = completeTriosOnly; + } + + /** + * @param families the families to be checked for Mendelian violations + * @param vc the variant context to extract the genotypes and alleles for mom, dad and child. + * @return whether or not there is a mendelian violation at the site. + */ + public int countViolations(Map> families, VariantContext vc){ + + //Reset counts + nocall = 0; + lowQual = 0; + familyCalled = 0; + varFamilyCalled = 0; + violations_total=0; + violationFamilies.clear(); + clearInheritanceMap(); + + for(Set family : families.values()){ + Iterator sampleIterator = family.iterator(); + Sample sample; + while(sampleIterator.hasNext()){ + sample = sampleIterator.next(); + if(sample.getParents().size() > 0) + updateViolations(sample.getFamilyID(),sample.getMaternalID(), sample.getPaternalID(), sample.getID() ,vc); + } + } + return violations_total; + } + + public boolean isViolation(Sample mother, Sample father, Sample child, VariantContext vc){ + + //Reset counts + nocall = 0; + lowQual = 0; + familyCalled = 0; + varFamilyCalled = 0; + violations_total=0; + violationFamilies.clear(); + clearInheritanceMap(); + updateViolations(mother.getFamilyID(),mother.getID(),father.getID(),child.getID(),vc); + return violations_total>0; + } + + + private void updateViolations(String familyId, String motherId, String fatherId, String childId, VariantContext vc){ + + int count; + Genotype gMom = vc.getGenotype(motherId); + Genotype gDad = vc.getGenotype(fatherId); + Genotype gChild = vc.getGenotype(childId); + + if (gMom == null || gDad == null || gChild == null){ + 
if(abortOnSampleNotFound) + throw new IllegalArgumentException(String.format("Variant %s:%d: Missing genotypes for family %s: mom=%s dad=%s family=%s", vc.getChr(), vc.getStart(), familyId, motherId, fatherId, childId)); + else + return; + } + //Count No calls + if(allCalledOnly && (!gMom.isCalled() || !gDad.isCalled() || !gChild.isCalled())){ + nocall++; + } + else if (!gMom.isCalled() && !gDad.isCalled() || !gChild.isCalled()){ + nocall++; + } + //Count lowQual. Note that if min quality is set to 0, even values with no quality associated are returned + else if (minGenotypeQuality>0 && (gMom.getPhredScaledQual() < minGenotypeQuality || + gDad.getPhredScaledQual() < minGenotypeQuality || + gChild.getPhredScaledQual() < minGenotypeQuality )) { + lowQual++; + } + else{ + //Count all families per loci called + familyCalled++; + //If the family is all homref, not too interesting + if(!(gMom.isHomRef() && gDad.isHomRef() && gChild.isHomRef())) + { + varFamilyCalled++; + if(isViolation(gMom, gDad, gChild)){ + violationFamilies.add(familyId); + violations_total++; + } + } + count = inheritance.get(gMom.getType()).get(gDad.getType()).get(gChild.getType()); + inheritance.get(gMom.getType()).get(gDad.getType()).put(gChild.getType(),count+1); + + } + } + + /** + * Evaluate the genotypes of mom, dad, and child to detect Mendelian violations + * + * @param gMom + * @param gDad + * @param gChild + * @return true if the three genotypes represent a Mendelian violation; false otherwise + */ + public static boolean isViolation(final Genotype gMom, final Genotype gDad, final Genotype gChild) { + //1 parent is no "call + if(!gMom.isCalled()){ + return (gDad.isHomRef() && gChild.isHomVar()) || (gDad.isHomVar() && gChild.isHomRef()); + } + else if(!gDad.isCalled()){ + return (gMom.isHomRef() && gChild.isHomVar()) || (gMom.isHomVar() && gChild.isHomRef()); + } + //Both parents have genotype information + return !(gMom.getAlleles().contains(gChild.getAlleles().get(0)) && 
gDad.getAlleles().contains(gChild.getAlleles().get(1)) || + gMom.getAlleles().contains(gChild.getAlleles().get(1)) && gDad.getAlleles().contains(gChild.getAlleles().get(0))); + } + + private void createInheritanceMap(){ + + inheritance = new EnumMap>>(GenotypeType.class); + for(GenotypeType mType : GenotypeType.values()){ + inheritance.put(mType, new EnumMap>(GenotypeType.class)); + for(GenotypeType dType : GenotypeType.values()){ + inheritance.get(mType).put(dType, new EnumMap(GenotypeType.class)); + for(GenotypeType cType : GenotypeType.values()){ + inheritance.get(mType).get(dType).put(cType, 0); + } + } + } + + } + + private void clearInheritanceMap(){ + for(GenotypeType mType : GenotypeType.values()){ + for(GenotypeType dType : GenotypeType.values()){ + for(GenotypeType cType : GenotypeType.values()){ + inheritance.get(mType).get(dType).put(cType, 0); + } + } + } + } + + /** + * @return the likelihood ratio for a mendelian violation + */ + public double violationLikelihoodRatio(VariantContext vc, String motherId, String fatherId, String childId) { + double[] logLikAssignments = new double[27]; + // the matrix to set up is + // MOM DAD CHILD + // |- AA + // AA AA | AB + // |- BB + // |- AA + // AA AB | AB + // |- BB + // etc. 
The leaves are counted as 0-11 for MVs and 0-14 for non-MVs + double[] momGL = vc.getGenotype(motherId).getLikelihoods().getAsVector(); + double[] dadGL = vc.getGenotype(fatherId).getLikelihoods().getAsVector(); + double[] childGL = vc.getGenotype(childId).getLikelihoods().getAsVector(); + int offset = 0; + for ( int oMom = 0; oMom < 3; oMom++ ) { + for ( int oDad = 0; oDad < 3; oDad++ ) { + for ( int oChild = 0; oChild < 3; oChild ++ ) { + logLikAssignments[offset++] = momGL[oMom] + dadGL[oDad] + childGL[oChild]; + } + } + } + double[] mvLiks = new double[12]; + double[] nonMVLiks = new double[15]; + for ( int i = 0; i < 12; i ++ ) { + mvLiks[i] = logLikAssignments[mvOffsets[i]]; + } + + for ( int i = 0; i < 15; i++) { + nonMVLiks[i] = logLikAssignments[nonMVOffsets[i]]; + } + + return MathUtils.log10sumLog10(mvLiks) - MathUtils.log10sumLog10(nonMVLiks); + } + +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/PedReader.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/PedReader.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/PedReader.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/PedReader.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/PedigreeValidationType.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/PedigreeValidationType.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/PedigreeValidationType.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/PedigreeValidationType.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Sample.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Sample.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Sample.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Sample.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDB.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDB.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDB.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDB.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java new file mode 100644 index 000000000..2744bec61 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java @@ -0,0 +1,161 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.samples; + +import htsjdk.samtools.SAMFileHeader; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.ReadUtils; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * + */ +public class SampleDBBuilder { + PedigreeValidationType validationStrictness; + final SampleDB sampleDB = new SampleDB(); + final GenomeAnalysisEngine engine; + + Set samplesFromDataSources = new HashSet(); + Set samplesFromPedigrees = new HashSet(); + + /** for testing only */ + protected SampleDBBuilder(PedigreeValidationType validationStrictness) { + engine = null; + this.validationStrictness = validationStrictness; + } + + /** + * Constructor takes both a SAM header and sample files because the two must be integrated. 
+ */ + public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) { + this.engine = engine; + this.validationStrictness = validationStrictness; + } + + /** + * Hallucinates sample objects for all the samples in the SAM file and stores them + */ + public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) { + addSamplesFromSampleNames(ReadUtils.getSAMFileSamples(header)); + return this; + } + + public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) { + for (final String sampleName : sampleNames) { + if (sampleDB.getSample(sampleName) == null) { + final Sample newSample = new Sample(sampleName, sampleDB); + sampleDB.addSample(newSample); + samplesFromDataSources.add(newSample); // keep track of data source samples + } + } + return this; + } + + public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) { + for (final File pedFile : pedigreeFiles) { + Collection samples = addSamplesFromPedigreeArgument(pedFile); + samplesFromPedigrees.addAll(samples); + } + + return this; + } + + public SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) { + for (final String pedString : pedigreeStrings) { + Collection samples = addSamplesFromPedigreeArgument(pedString); + samplesFromPedigrees.addAll(samples); + } + + return this; + } + + /** + * Parse one sample file and integrate it with samples that are already there + * Fail quickly if we find any errors in the file + */ + private Collection addSamplesFromPedigreeArgument(File sampleFile) { + final PedReader reader = new PedReader(); + + try { + return reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(sampleFile, e); + } + } + + private Collection addSamplesFromPedigreeArgument(final String string) { + final PedReader reader = new PedReader(); + return reader.parse(string, getMissingFields(string), sampleDB); + } 
+ + public SampleDB getFinalSampleDB() { + validate(); + return sampleDB; + } + + public EnumSet getMissingFields(final Object engineArg) { + if ( engine == null ) + return EnumSet.noneOf(PedReader.MissingPedField.class); + else { + final List posTags = engine.getTags(engineArg).getPositionalTags(); + return PedReader.parseMissingFieldTags(engineArg, posTags); + } + } + + // -------------------------------------------------------------------------------- + // + // Validation + // + // -------------------------------------------------------------------------------- + + protected final void validate() { + validatePedigreeIDUniqueness(); + if ( validationStrictness != PedigreeValidationType.SILENT ) { + // check that samples in data sources are all annotated, if anything is annotated + if ( ! samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty() ) { + final Set sampleNamesFromPedigrees = new HashSet(); + for ( final Sample pSample : samplesFromPedigrees ) + sampleNamesFromPedigrees.add(pSample.getID()); + + for ( final Sample dsSample : samplesFromDataSources ) + if ( ! sampleNamesFromPedigrees.contains(dsSample.getID()) ) + throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files with STRICT pedigree validation"); + } + } + } + + private void validatePedigreeIDUniqueness() { + Set pedigreeIDs = new HashSet(); + for ( Sample sample : samplesFromPedigrees ) { + pedigreeIDs.add(sample.getID()); + } + assert pedigreeIDs.size() == samplesFromPedigrees.size() : "The number of sample IDs extracted from the pedigree does not equal the number of samples in the pedigree. 
Is a sample associated with multiple families?"; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Trio.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Trio.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/Trio.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/samples/Trio.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/ArtificialReadsTraversal.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/ArtificialReadsTraversal.java new file mode 100644 index 000000000..ac34b7594 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/ArtificialReadsTraversal.java @@ -0,0 +1,142 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.datasources.providers.ShardDataProvider; +import org.broadinstitute.gatk.engine.walkers.ReadWalker; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.sam.ArtificialPatternedSAMIterator; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + * + * this class acts as a fake reads traversal engine for testing out reads based traversals. + */ +public class ArtificialReadsTraversal extends TraversalEngine,ShardDataProvider> { + + public int startingChr = 1; + public int endingChr = 5; + public int readsPerChr = 100; + public int unMappedReads = 1000; + private int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; + private ArtificialPatternedSAMIterator iter; + /** our log, which we want to capture anything from this class */ + protected static Logger logger = Logger.getLogger(ArtificialReadsTraversal.class); + + /** Creates a new, uninitialized ArtificialReadsTraversal */ + public ArtificialReadsTraversal() { + } + + // what read ordering are we using + private ArtificialPatternedSAMIterator.PATTERN readOrder = ArtificialPatternedSAMIterator.PATTERN.IN_ORDER_READS; + + + /** + * set the read ordering of the reads given to the walker + * + * @param readOrdering + */ + public void setReadOrder( ArtificialPatternedSAMIterator.PATTERN readOrdering ) { + readOrder = readOrdering; + } + + @Override + public String getTraversalUnits() { + return "reads"; + } + + /** + * Traverse by reads, given the data and the walker + * + * @param walker the walker to traverse with + * @param dataProvider the provider of the reads data + * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function + * + * @return the reduce variable of the read walker + */ + public T traverse( Walker walker, + ShardDataProvider dataProvider, + T sum ) { + + if (!( walker instanceof ReadWalker )) + throw new IllegalArgumentException("Walker isn't a read walker!"); + + ReadWalker readWalker = 
(ReadWalker) walker; + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readsPerChr + DEFAULT_READ_LENGTH); + iter = new ArtificialPatternedSAMIterator(this.startingChr, + this.endingChr, + this.readsPerChr, + this.unMappedReads, + header, + this.readOrder); + + // while we still have more reads + for (SAMRecord read : iter) { + + // an array of characters that represent the reference + ReferenceContext refSeq = null; + + final boolean keepMeP = readWalker.filter(refSeq, (GATKSAMRecord) read); + if (keepMeP) { + M x = readWalker.map(refSeq, (GATKSAMRecord) read, null); // TODO: fix me at some point, it would be nice to fake out ROD data too + sum = readWalker.reduce(x, sum); + } + } + return sum; + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java new file mode 100644 index 000000000..f84824a59 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java @@ -0,0 +1,168 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import org.broadinstitute.gatk.utils.downsampling.Downsampler; +import org.broadinstitute.gatk.utils.downsampling.ReservoirDownsampler; +import org.broadinstitute.gatk.utils.sam.AlignmentStartComparator; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Subsystem to track a list of all reads currently live in the TraverseActiveRegions system, + * while limiting the total number of reads to a maximum capacity. + * + * User: depristo + * Date: 4/7/13 + * Time: 11:23 AM + */ +public class TAROrderedReadCache { + private final int maxCapacity; + private ArrayList undownsampledCache; + private Downsampler downsampler; + + private static final int UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE = 10000; + + /** + * Create a new empty ReadCache + * @param maxCapacity the max capacity of the read cache. + */ + public TAROrderedReadCache( final int maxCapacity ) { + if ( maxCapacity < 0 ) throw new IllegalArgumentException("maxCapacity must be >= 0 but got " + maxCapacity); + this.maxCapacity = maxCapacity; + + // The one we're not currently using will always be null: + initializeUndownsampledCache(); + this.downsampler = null; + } + + /** + * Moves all reads over to the downsampler, causing it to be used from this point on. Should be called + * when the undownsampledCache fills up and we need to start discarding reads. 
Since the + * ReservoirDownsampler doesn't preserve relative ordering, pop operations become expensive + * after this point, as they require a O(n log n) sort. + */ + private void activateDownsampler() { + downsampler = new ReservoirDownsampler<>(maxCapacity, false); + downsampler.submit(undownsampledCache); + undownsampledCache = null; // preferable to the O(n) clear() method + } + + /** + * Allocate the undownsampled cache used when we have fewer than maxCapacity items + */ + private void initializeUndownsampledCache() { + undownsampledCache = new ArrayList<>(Math.min(maxCapacity + 1, UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE)); + } + + /** + * What's the maximum number of reads we'll store in the cache? + * @return a positive integer + */ + public int getMaxCapacity() { + return maxCapacity; + } + + /** + * Add a single read to this cache. Assumed to be in sorted order w.r.t. the previously added reads + * @param read a read to add + */ + public void add( final GATKSAMRecord read ) { + if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); + + if ( downsampler != null ) { + downsampler.submit(read); + } + else { + undownsampledCache.add(read); + + // No more room in the undownsampledCache? Time to start downsampling + if ( undownsampledCache.size() > maxCapacity ) { + activateDownsampler(); + } + } + } + + /** + * Add a collection of reads to this cache. Assumed to be in sorted order w.r.t. the previously added reads and each other + * @param reads a collection of reads to add + */ + public void addAll( final List reads ) { + if ( reads == null ) throw new IllegalArgumentException("Reads cannot be null"); + for ( final GATKSAMRecord read : reads ) { + add(read); + } + } + + /** + * How many reads are currently in the cache? + * @return a positive integer + */ + public int size() { + return downsampler != null ? 
downsampler.size() : undownsampledCache.size(); + } + + /** + * How many reads were discarded since the last call to popCurrentReads + * + * @return number of items discarded during downsampling since last pop operation + */ + public int getNumDiscarded() { + return downsampler != null ? downsampler.getNumberOfDiscardedItems() : 0; + } + + /** + * Removes all reads currently in the cache, and returns them in sorted order (w.r.t. alignmentStart) + * + * Flushes this cache, so after this call the cache will contain no reads, and we'll be in the same + * initial state as the constructor would put us in, with a non-null undownsampledCache and a null + * downsampler. + * + * @return a list of GATKSAMRecords in this cache + */ + public List popCurrentReads() { + final List poppedReads; + + if ( downsampler == null ) { + poppedReads = undownsampledCache; // avoid making a copy here, since we're going to allocate a new cache + } + else { + // If we triggered the downsampler, we need to sort the reads before returning them, + // since the ReservoirDownsampler is not guaranteed to preserve relative ordering of items. + // After consuming the downsampled items in this call to popCurrentReads(), we switch back + // to using the undownsampledCache until we fill up again. 
+ poppedReads = downsampler.consumeFinalizedItems(); // avoid making a copy here + Collections.sort(poppedReads, new AlignmentStartComparator()); + downsampler = null; + } + + initializeUndownsampledCache(); + return poppedReads; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraversalEngine.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraversalEngine.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraversalEngine.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraversalEngine.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java new file mode 100644 index 000000000..b3a0603f4 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java @@ -0,0 +1,719 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.datasources.providers.*; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.ActiveRegionTraversalParameters; +import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfile; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; +import org.broadinstitute.gatk.utils.activeregion.BandPassActivityProfile; +import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; + +import java.io.PrintStream; +import java.util.Collection; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * Implement active region traversal + * + * 
User: depristo + * Date: 1/9/13 + * Time: 4:45 PM + * + * Live region: + * + * The ART tracks a thing called the live region. The live region is a position on a specific contig + * of the alignment start of the last read we processed during this traversal. Because the + * read stream is sorted, future reads must occurs in the the live region. Therefore the the dead region + * (everything to the left of the live boundary) cannot have any more read data. The live / dead + * regions are used to decide when we can safely call map on active regions, as only active regions + * contained completely within the dead region (including extensions) have a complete set of read data + * in the collected read list. All of the data related to the live region is captured by the local + * variable spanOfLastReadSeen + * + */ +public final class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + private final static boolean DEBUG = false; + protected final static Logger logger = Logger.getLogger(TraversalEngine.class); + protected final static boolean LOG_READ_CARRYING = false; + + // set by the traversal + private boolean walkerHasPresetRegions = false; + private int activeRegionExtension = -1; + private int maxRegionSize = -1; + private int minRegionSize = -1; + + private final LinkedList workQueue = new LinkedList<>(); + + private TAROrderedReadCache myReads = null; + + private GenomeLoc lastRegionProcessed = null; + private GenomeLoc spanOfLastReadSeen = null; + private ActivityProfile activityProfile = null; + int maxReadsInMemory = 0; + ActiveRegionWalker walker; + + final NanoScheduler nanoScheduler; + + /** + * Data to use in the ActiveRegionWalker.map function produced by the NanoScheduler input iterator + */ + private static class MapData { + public ActiveRegion activeRegion; + public RefMetaDataTracker tracker; + + private MapData(ActiveRegion activeRegion, RefMetaDataTracker tracker) { + this.activeRegion = activeRegion; + this.tracker = tracker; + 
} + } + + /** + * Create a single threaded active region traverser + */ + public TraverseActiveRegions() { + this(1); + } + + /** + * Create an active region traverser that uses nThreads for getting its work done + * @param nThreads number of threads + */ + public TraverseActiveRegions(final int nThreads) { + nanoScheduler = new NanoScheduler<>(nThreads); + nanoScheduler.setProgressFunction(new NSProgressFunction() { + @Override + public void progress(MapData lastActiveRegion) { + if ( lastActiveRegion != null ) + // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon + printProgress(lastActiveRegion.activeRegion.getLocation().getStopLocation()); + } + }); + } + + /** + * Have the debugging output streams been initialized already? + * + * We have to do lazy initialization because when the initialize() function is called + * the streams aren't yet initialized in the GATK walker. + */ + private boolean streamsInitialized = false; + + @Override + public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) { + super.initialize(engine, walker, progressMeter); + + this.walker = (ActiveRegionWalker)walker; + if ( this.walker.wantsExtendedReads() && ! this.walker.wantsNonPrimaryReads() ) { + throw new IllegalArgumentException("Active region walker " + this.walker + " requested extended events but not " + + "non-primary reads, an inconsistent state. Please modify the walker"); + } + + ActiveRegionTraversalParameters annotation = walker.getClass().getAnnotation(ActiveRegionTraversalParameters.class); + this.activeRegionExtension = this.walker.activeRegionExtension == null ? annotation.extension() : this.walker.activeRegionExtension; + this.maxRegionSize = this.walker.activeRegionMaxSize == null ? annotation.maxRegion() : this.walker.activeRegionMaxSize; + this.minRegionSize = annotation.minRegion(); + final double bandPassSigma = this.walker.bandPassSigma == null ? 
annotation.bandPassSigma() : this.walker.bandPassSigma; + walkerHasPresetRegions = this.walker.hasPresetActiveRegions(); + + activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser(), engine.getIntervals(), this.walker.maxProbPropagationDistance, this.walker.activeProbThreshold, + BandPassActivityProfile.MAX_FILTER_SIZE, bandPassSigma); + + final int maxReadsAcrossSamples = annotation.maxReadsToHoldInMemoryPerSample() * ReadUtils.getSAMFileSamples(engine.getSAMFileHeader()).size(); + final int maxReadsToHoldInMemory = Math.min(maxReadsAcrossSamples, annotation.maxReadsToHoldTotal()); + myReads = new TAROrderedReadCache(maxReadsToHoldInMemory); + } + + // ------------------------------------------------------------------------------------- + // + // Utility functions + // + // ------------------------------------------------------------------------------------- + + /** + * Load in the preset regions for contig into workQueue + * + * Should be called before starting to process work on contig + * + * Can only be called when walkerHasPresetRegions is true or an IllegalStateException will be thrown + * + * @param contig the contig we are about to process + */ + protected void loadPresetRegionsForContigToWorkQueue(final String contig) { + if ( ! 
walkerHasPresetRegions ) throw new IllegalStateException("only appropriate to call when walker has preset regions"); + + final GenomeLoc contigSpan = engine.getGenomeLocParser().createOverEntireContig(contig); + for ( final GenomeLoc loc : this.walker.getPresetActiveRegions().getOverlapping(contigSpan) ) { + workQueue.add(new ActiveRegion(loc, null, true, engine.getGenomeLocParser(), getActiveRegionExtension())); + } + } + + protected int getActiveRegionExtension() { + return activeRegionExtension; + } + + protected int getMaxRegionSize() { + return maxRegionSize; + } + + protected int getMinRegionSize() { + return minRegionSize; + } + + @Override + public String getTraversalUnits() { + return "active regions"; + } + + @Override + public String toString() { + return "TraverseActiveRegions"; + } + + /** + * Is the loc outside of the intervals being requested for processing by the GATK? + * @param loc + * @return + */ + protected boolean outsideEngineIntervals(final GenomeLoc loc) { + return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); + } + + // ------------------------------------------------------------------------------------- + // + // Actual traverse function + // + // ------------------------------------------------------------------------------------- + + /** + * Did read appear in the last shard? + * + * When we transition across shard boundaries we see duplicate reads because + * each shard contains the reads that *overlap* the shard. So if we just finished + * shard 1-1000 and are now in 1001-2000 we'll see duplicate reads from 1001 + * that overlapped 1-1000. This function tests read to determine if we would have + * seen it before by asking if read.getAlignmentStart() is less than the + * stop position of the last seen read at the start of the traversal. 
The reason + * we need to use the location of the last read at the start of the traversal + * is that we update the lastRead during the traversal, and we only want to filter + * out reads whose start is before the last read of the previous shard, not the + * current shard. + * + * @param locOfLastReadAtTraversalStart the location of the last read seen at the start of the traversal + * @param read the read we want to test if it's already been seen in the last shard + * @return true if read would have appeared in the last shard, false otherwise + */ + @Requires({"read != null"}) + private boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) { + if ( locOfLastReadAtTraversalStart == null ) + // we're in the first shard, so obviously the answer is no + return false; + else { + // otherwise check to see if the alignment occurred in the previous shard + return read.getAlignmentStart() <= locOfLastReadAtTraversalStart.getStart() + // we're on the same contig + && read.getReferenceIndex() == locOfLastReadAtTraversalStart.getContigIndex(); + } + + } + + @Override + public T traverse( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + T sum) { + if ( LOG_READ_CARRYING || logger.isDebugEnabled() ) + logger.info(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); + + nanoScheduler.setDebug(false); + final Iterator activeRegionIterator = new ActiveRegionIterator(dataProvider); + final TraverseActiveRegionMap myMap = new TraverseActiveRegionMap(); + final TraverseActiveRegionReduce myReduce = new TraverseActiveRegionReduce(); + final T result = nanoScheduler.execute(activeRegionIterator, myMap, sum, myReduce); + + return result; + } + + private class ActiveRegionIterator implements Iterator { + private final LocusShardDataProvider dataProvider; + private LinkedList readyActiveRegions = new LinkedList<>(); + private boolean done = false; + private final LocusView locusView; + 
private final LocusReferenceView referenceView; + private final GenomeLoc locOfLastReadAtTraversalStart; + private final IntervalReferenceOrderedView referenceOrderedDataView; + private final GenomeLoc currentWindow; + private final boolean processRemainingActiveRegions; + + public ActiveRegionIterator( final LocusShardDataProvider dataProvider ) { + this.dataProvider = dataProvider; + locusView = new AllLocusView(dataProvider); + referenceView = new LocusReferenceView( walker, dataProvider ); + + // The data shard may carry a number of locations to process (due to being indexed together). + // This value is just the interval we are processing within the entire provider + currentWindow = dataProvider.getLocus(); + final int currentWindowPos = dataProvider.getShard().getGenomeLocs().indexOf(currentWindow); + if ( currentWindowPos == -1 ) throw new IllegalStateException("Data provider " + dataProvider + " didn't have our current window in it " + currentWindow); + processRemainingActiveRegions = currentWindowPos == dataProvider.getShard().getGenomeLocs().size() - 1; + + // the rodSpan covers all of the bases in the activity profile, including all of the bases + // through the current window interval. This is because we may issue a query to get data for an + // active region spanning before the current interval as far back as the start of the current profile, + // if we have pending work to do that finalizes in this interval. + final GenomeLoc rodSpan = activityProfile.getSpan() == null ? currentWindow : activityProfile.getSpan().endpointSpan(currentWindow); + if ( ! 
dataProvider.getShard().getLocation().containsP(rodSpan) ) throw new IllegalStateException("Rod span " + rodSpan + " isn't contained within the data shard " + dataProvider.getShard().getLocation() + ", meaning we wouldn't get all of the data we need"); + referenceOrderedDataView = new IntervalReferenceOrderedView( dataProvider, rodSpan ); + + // We keep processing while the next reference location is within the interval + locOfLastReadAtTraversalStart = spanOfLastSeenRead(); + + // load in the workQueue the present regions that span the current contig, if it's different from the last one + if ( walkerHasPresetRegions && ( lastRegionProcessed == null || ! currentWindow.onSameContig(lastRegionProcessed)) ) { + loadPresetRegionsForContigToWorkQueue(currentWindow.getContig()); + } + + // remember the last region we processed for sanity checking later + lastRegionProcessed = currentWindow; + } + + @Override public void remove() { throw new UnsupportedOperationException("Cannot remove from ActiveRegionIterator"); } + + @Override + public MapData next() { + return readyActiveRegions.pop(); + } + @Override + public boolean hasNext() { + if ( engine.exceedsRuntimeLimit() ) // too much time has been dedicated to doing work, just stop + return false; + if ( ! 
readyActiveRegions.isEmpty() ) + return true; + if ( done ) + return false; + else { + + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + rememberLastLocusLocation(location); + + // get all of the new reads that appear in the current pileup, and them to our list of reads + // provided we haven't seen them before + final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); + for( final GATKSAMRecord read : reads ) { + // note that ActiveRegionShards span entire contigs, so this check is in some + // sense no longer necessary, as any read that appeared in the last shard would now + // by definition be on a different contig. However, the logic here doesn't hurt anything + // and makes us robust should we decided to provide shards that don't fully span + // contigs at some point in the future + if ( ! appearedInLastShard(locOfLastReadAtTraversalStart, read) ) { + rememberLastReadLocation(read); + myReads.add(read); + } + } + + // skip this location -- it's not part of our engine intervals + if ( outsideEngineIntervals(location) ) + continue; + + // we've move across some interval boundary, restart profile + final boolean flushProfile = ! activityProfile.isEmpty() + && ( activityProfile.getContigIndex() != location.getContigIndex() + || location.getStart() != activityProfile.getStop() + 1); + final List newActiveRegions = prepActiveRegionsForProcessing(walker, flushProfile, false, referenceOrderedDataView); + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). 
+ final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation()); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + addIsActiveResult(walker, tracker, refContext, locus); + + maxReadsInMemory = Math.max(myReads.size(), maxReadsInMemory); + printProgress(location); + + if ( ! newActiveRegions.isEmpty() ) { + readyActiveRegions.addAll(newActiveRegions); + if ( DEBUG ) + for ( final MapData region : newActiveRegions ) + logger.info("Adding region to queue for processing " + region.activeRegion); + return true; + } + } + + if ( processRemainingActiveRegions ) { + // we've run out of stuff to process, and since shards now span entire contig boundaries + // we should finalized our regions. This allows us to continue to use our referenceOrderedDataView + // which would otherwise be shutdown. Only followed when the microschedule says that we're + // inside of the last window in the current shard + readyActiveRegions.addAll(prepActiveRegionsForProcessing(walker, true, true, referenceOrderedDataView)); + } + + return ! 
readyActiveRegions.isEmpty(); + } + } + } + + // ------------------------------------------------------------------------------------- + // + // Functions to manage and interact with the live / dead zone + // + // ------------------------------------------------------------------------------------- + + /** + * Update the live region to reflect that the last read we've seen in the traversal is read + * + * Requires that sequential calls always be provided reads in coordinate sorted order + * + * @param read the last read we've seen during the traversal + */ + @Requires({"read != null"}) + protected void rememberLastReadLocation(final GATKSAMRecord read) { + final GenomeLoc currentLocation = engine.getGenomeLocParser().createGenomeLoc(read); + if ( spanOfLastReadSeen == null ) + spanOfLastReadSeen = currentLocation; + else { + if ( currentLocation.isBefore(spanOfLastReadSeen) ) + throw new IllegalStateException("Updating last read seen in the traversal with read " + read + " with span " + currentLocation + " but this occurs before the previously seen read " + spanOfLastReadSeen); + spanOfLastReadSeen = currentLocation; + } + } + + /** + * Update the live region to reflect that we've reached locus + * + * This function is complementary to #rememberLastReadLocation, but if we don't have any reads for a long + * time (e.g., there's no coverage) we will keep active regions around far longer than necessary. + * + * Only updates the span if it's beyond the last seen + * + * @param currentLocation the current location we've processed on the genome + */ + protected void rememberLastLocusLocation(final GenomeLoc currentLocation) { + if ( spanOfLastReadSeen == null ) + spanOfLastReadSeen = currentLocation; + else { + if ( currentLocation.isPast(spanOfLastReadSeen) ) + spanOfLastReadSeen = currentLocation; + } + } + + + /** + * Get a GenomeLoc indicating the start (heading to the right) of the live ART region. 
+ * @return the left-most position of the live region on the genome + */ + protected GenomeLoc spanOfLastSeenRead() { + return spanOfLastReadSeen; + } + + /** + * Is the active region completely within the traversal's dead zone? + * + * @param region the region we want to test + * @return true if the extended location of region is completely within the current dead zone, false otherwise + */ + protected boolean regionCompletelyWithinDeadZone(final ActiveRegion region) { + if ( spanOfLastSeenRead() == null ) + return false; + + final int contigCmp = region.getExtendedLoc().compareContigs(spanOfLastSeenRead()); + if ( contigCmp > 0 ) + throw new IllegalStateException("Active region " + region + " on a contig after last seen read " + spanOfLastSeenRead()); + else { + return contigCmp < 0 || region.getExtendedLoc().getStop() < spanOfLastSeenRead().getStart(); + } + } + + /** + * Is the read dead? That is, can it no longer be in any future active region, and therefore can be discarded? + * + * read: start |--------> stop ------ stop + extension + * region: start |-----------------| end + * + * Since the regions are coming in order, read could potentially be contained in a future interval if + * stop + activeRegionExtension >= end. If, on the other hand, stop + extension is < the end + * of this region, then we can discard it, since any future region could only include reads + * up to end + 1 - extension. + * + * Note that this function doesn't care about the dead zone. We're assuming that by + * actually calling this function with an active region that region is already in the dead zone, + * so checking that the read is in the dead zone doesn't make sense. 
+ * + * @param read the read we're testing + * @param activeRegion the current active region + * @return true if the read is dead, false other + */ + @Requires({"read != null", "activeRegion != null"}) + private boolean readCannotOccurInAnyMoreActiveRegions(final GATKSAMRecord read, final ActiveRegion activeRegion) { + return read.getReferenceIndex() < activeRegion.getLocation().getContigIndex() || + ( read.getReferenceIndex() == activeRegion.getLocation().getContigIndex() + && read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop() ); + } + + // ------------------------------------------------------------------------------------- + // + // Functions to write out activity profiles and active regions + // + // ------------------------------------------------------------------------------------- + + /** + * Initialize the debugging output streams (activity profile and active regions), if not done so already + */ + @Ensures("streamsInitialized == true") + private void initializeOutputStreamsIfNecessary() { + if ( ! streamsInitialized ) { + streamsInitialized = true; + if ( walker.activityProfileOutStream != null ) { + printIGVFormatHeader(walker.activityProfileOutStream, "line", "ActivityProfile"); + } + + if ( walker.activeRegionOutStream != null ) { + printIGVFormatHeader(walker.activeRegionOutStream, "line", "ActiveRegions"); + } + } + } + + /** + * Helper function to write out a IGV formatted line to out, at loc, with values + * + * http://www.broadinstitute.org/software/igv/IGV + * + * @param out a non-null PrintStream where we'll write our line + * @param graphType the type of graph to show in IGV for this track + * @param columns the column names for this IGV track + */ + @Requires({ + "out != null", + "graphType != null", + "columns.length > 0" + }) + private void printIGVFormatHeader(final PrintStream out, final String graphType, final String ... 
columns ) { + out.printf("#track graphType=%s%n", graphType); + out.printf("Chromosome\tStart\tEnd\tFeature\t%s%n", Utils.join("\t", columns)); + + } + + /** + * Helper function to write out a IGV formatted line to out, at loc, with values + * + * http://www.broadinstitute.org/software/igv/IGV + * + * @param out a non-null PrintStream where we'll write our line + * @param loc the location of values + * @param featureName string name of this feature (see IGV format) + * @param values the floating point values to associate with loc and feature name in out + */ + @Requires({ + "out != null", + "loc != null", + "values.length > 0" + }) + private void printIGVFormatRow(final PrintStream out, final GenomeLoc loc, final String featureName, final double ... values) { + // note that start and stop are 0 based, but the stop is exclusive so we don't subtract 1 + out.printf("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart() - 1, loc.getStop(), featureName); + for ( final double value : values ) + out.print(String.format("\t%.5f", value)); + out.println(); + } + + /** + * Write out activity profile information, if requested by the walker + * + * @param states the states in the current activity profile + */ + @Requires("states != null") + private void writeActivityProfile(final List states) { + if ( walker.activityProfileOutStream != null ) { + initializeOutputStreamsIfNecessary(); + for ( final ActivityProfileState state : states ) { + printIGVFormatRow(walker.activityProfileOutStream, state.getLoc(), "state", Math.min(state.isActiveProb, 1.0)); + } + } + } + + /** + * Write out each active region to the walker activeRegionOutStream + * + * @param region the region we're currently operating on + */ + @Requires("region != null") + private void writeActiveRegion(final ActiveRegion region) { + if( walker.activeRegionOutStream != null ) { + initializeOutputStreamsIfNecessary(); + printIGVFormatRow(walker.activeRegionOutStream, region.getLocation().getStartLocation(), + "end-marker", 
0.0); + printIGVFormatRow(walker.activeRegionOutStream, region.getLocation(), + "size=" + region.getLocation().size(), region.isActive() ? 1.0 : -1.0); + } + } + + + // ------------------------------------------------------------------------------------- + // + // Functions to process active regions that are ready for map / reduce calls + // + // ------------------------------------------------------------------------------------- + + /** + * Invoke the walker isActive function, and incorporate its result into the activity profile + * + * @param walker the walker we're running + * @param tracker the ref meta data tracker to pass on to the isActive function of walker + * @param refContext the refContext to pass on to the isActive function of walker + * @param locus the AlignmentContext to pass on to the isActive function of walker + */ + private void addIsActiveResult(final ActiveRegionWalker walker, + final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext locus) { + // must be called, even if we won't use the result, to satisfy walker contract + final ActivityProfileState state = walker.isActive( tracker, refContext, locus ); + if ( walker.forceActive) state.isActiveProb = 1.0; + if ( ! walkerHasPresetRegions ) { + activityProfile.add(state); + } + } + + /** + * Take the individual isActive calls and integrate them into contiguous active regions and + * add these blocks of work to the work queue + * band-pass filter the list of isActive probabilities and turn into active regions + */ + private List prepActiveRegionsForProcessing(final ActiveRegionWalker walker, + final boolean flushActivityProfile, + final boolean forceAllRegionsToBeActive, + final IntervalReferenceOrderedView referenceOrderedDataView) { + if ( ! 
walkerHasPresetRegions ) { + // We don't have preset regions, so we get our regions from the activity profile + final Collection activeRegions = activityProfile.popReadyActiveRegions(getActiveRegionExtension(), getMinRegionSize(), getMaxRegionSize(), flushActivityProfile); + workQueue.addAll(activeRegions); + if ( ! activeRegions.isEmpty() && logger.isDebugEnabled() ) logger.debug("Integrated " + activityProfile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + } + + // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + final LinkedList readyRegions = new LinkedList<>(); + while( workQueue.peek() != null ) { + final ActiveRegion activeRegion = workQueue.peek(); + if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) { + writeActivityProfile(activeRegion.getSupportingStates()); + writeActiveRegion(activeRegion); + readyRegions.add(prepActiveRegionForProcessing(workQueue.remove(), walker, referenceOrderedDataView)); + } else { + break; + } + } + + return readyRegions; + + } + + private MapData prepActiveRegionForProcessing(final ActiveRegion activeRegion, + final ActiveRegionWalker walker, + final IntervalReferenceOrderedView referenceOrderedDataView) { + final List stillLive = new LinkedList<>(); + for ( final GATKSAMRecord read : myReads.popCurrentReads() ) { + boolean killed = false; + final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); + + if( activeRegion.getLocation().overlapsP( readLoc ) ) { + activeRegion.add(read); + + if ( ! walker.wantsNonPrimaryReads() ) { + killed = true; + } + } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { + activeRegion.add( read ); + } + + // if the read hasn't already been killed, check if it cannot occur in any more active regions, and maybe kill it + if ( ! 
killed && readCannotOccurInAnyMoreActiveRegions(read, activeRegion) ) { + killed = true; + } + + // keep track of all of the still live active regions + if ( ! killed ) stillLive.add(read); + } + myReads.addAll(stillLive); + + if ( logger.isDebugEnabled() ) { + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive() ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReadSpanLoc()); + } + + if ( LOG_READ_CARRYING ) + logger.info(String.format("Processing region %20s span=%3d active?=%5b with %4d reads. Overall max reads carried is %s", + activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive(), activeRegion.size(), maxReadsInMemory)); + + // prepare the RefMetaDataTracker information + final GenomeLoc loc = activeRegion.getLocation(); + // get all of the RODs that cover the active region (without extension) + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataForInterval(loc); + // trim away all of the features that occurred before this location, as we will not need them in the future + referenceOrderedDataView.trimCurrentFeaturesToLoc(loc); + + return new MapData(activeRegion, tracker); + } + + private class TraverseActiveRegionMap implements NSMapFunction { + @Override + public M apply(final MapData mapData) { + if ( DEBUG ) logger.info("Executing walker.map for " + mapData.activeRegion + " in thread " + Thread.currentThread().getName()); + return walker.map(mapData.activeRegion, mapData.tracker); + } + } + + private class TraverseActiveRegionReduce implements NSReduceFunction { + @Override + public T apply(M one, T sum) { + return walker.reduce(one, sum); + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java new file mode 100644 index 
000000000..a8c88aace --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java @@ -0,0 +1,205 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.traversals; + +import htsjdk.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.providers.ReadView; +import org.broadinstitute.gatk.utils.iterators.PushbackIterator; +import org.broadinstitute.gatk.engine.walkers.DuplicateWalker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.*; + +/** + * @author Mark DePristo + * @version 0.1 + *

+ * Class TraverseDuplicates + *

+ * This class handles traversing lists of duplicate reads in the new shardable style + */ +public class TraverseDuplicates extends TraversalEngine,ReadShardDataProvider> { + /** our log, which we want to capture anything from this class */ + protected static Logger logger = Logger.getLogger(TraverseDuplicates.class); + + /** Turn this to true to enable logger.debug output */ + private final boolean DEBUG = false; + + @Override + public String getTraversalUnits() { + return "dups"; + } + + private List readsAtLoc(final GATKSAMRecord read, PushbackIterator iter) { + GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read); + ArrayList l = new ArrayList(); + + l.add(read); + for (SAMRecord read2 : iter) { + GenomeLoc site2 = engine.getGenomeLocParser().createGenomeLoc(read2); + + // the next read starts too late + if (site2.getStart() != site.getStart()) { + iter.pushback(read2); + break; + } else { + l.add((GATKSAMRecord) read2); + } + } + + return l; + } + + /** + * Creates a set of lists of reads, where each list contains reads from the same underlying molecule according + * to their duplicate flag and their (and mate, if applicable) start/end positions. + * + * @param reads the list of reads to split into unique molecular samples + * @return + */ + protected Set> uniqueReadSets(List reads) { + Set> readSets = new LinkedHashSet>(); + + // for each read, find duplicates, and either add the read to its duplicate list or start a new one + for ( GATKSAMRecord read : reads ) { + List readSet = findDuplicateReads(read, readSets); + + if ( readSet == null ) { + readSets.add(new ArrayList(Arrays.asList(read))); // copy so I can add to the list + } else { + readSet.add(read); + } + } + + return readSets; + } + + /** + * Find duplicate reads for read in the set of unique reads. This is effective a duplicate marking algorithm, + * but it relies for safety's sake on the file itself being marked by a true duplicate marking algorithm. 
Pair + * and single-end read aware. + * + * @param read + * @param readSets + * @return The list of duplicate reads that read is a member of, or null if it's the only one of its kind + */ + protected List findDuplicateReads(GATKSAMRecord read, Set> readSets ) { + if ( read.getReadPairedFlag() ) { + // paired + final GenomeLoc readMateLoc = engine.getGenomeLocParser().createGenomeLoc(read.getMateReferenceName(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); + + for (List reads : readSets) { + GATKSAMRecord key = reads.get(0); + + // read and key start at the same place, and either the this read and the key + // share a mate location or the read is flagged as a duplicate + if ( read.getAlignmentStart() == key.getAlignmentStart() && key.getReadPairedFlag() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) ) { + // at least one has to be marked as a duplicate + final GenomeLoc keyMateLoc = engine.getGenomeLocParser().createGenomeLoc(key.getMateReferenceName(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); + if ( readMateLoc.compareTo(keyMateLoc) == 0 ) { + // we are at the same position as the dup and have the same mat pos, it's a dup + if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s %d %s vs. %s", read, reads.size(), readMateLoc, keyMateLoc)); + return reads; + } + } + } + } else { + for (List reads : readSets) { + GATKSAMRecord key = reads.get(0); + boolean v = (! 
key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength(); + //System.out.printf("%s %s %b %b %d %d %d %d => %b%n", + // read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(), + // read.getAlignmentStart(), key.getAlignmentStart(), read.getReadLength(), key.getReadLength(), v); + if ( v ) { + //System.out.printf("Returning reads...%n"); + return reads; + } + } + } + + return null; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // new style interface to the system + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Traverse by reads, given the data and the walker + * + * @param walker the walker to execute over + * @param sum of type T, the return from the walker + * + * @return the result type T, the product of all the reduce calls + */ + public T traverse(DuplicateWalker walker, + ReadShardDataProvider dataProvider, + T sum) { + PushbackIterator iter = new PushbackIterator(new ReadView(dataProvider).iterator()); + + /** + * while we still have more reads: + * ok, here's the idea. 
We get all the reads that start at the same position in the genome + * We then split the list of reads into sublists of reads: + * -> those with the same mate pair position, for paired reads + * -> those flagged as unpaired and duplicated but having the same start and end + */ + boolean done = walker.isDone(); + for (SAMRecord read : iter) { + if ( done ) break; + // get the genome loc from the read + GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read); + + Set> readSets = uniqueReadSets(readsAtLoc((GATKSAMRecord) read, iter)); + if ( DEBUG ) logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size())); + + // Jump forward in the reference to this locus location + AlignmentContext locus = new AlignmentContext(site, new ReadBackedPileupImpl(site)); + + // update the number of duplicate sets we've seen + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // actually call filter and map, accumulating sum + final boolean keepMeP = walker.filter(site, locus, readSets); + if (keepMeP) { + M x = walker.map(site, locus, readSets); + sum = walker.reduce(x, sum); + } + + printProgress(site.getStopLocation()); + done = walker.isDone(); + } + + return sum; + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java new file mode 100644 index 000000000..1c16c0e19 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java @@ -0,0 +1,304 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, 
publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import org.broadinstitute.gatk.engine.WalkerManager; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.datasources.providers.*; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.DataSource; +import org.broadinstitute.gatk.engine.walkers.LocusWalker; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; + +import java.util.Iterator; + +/** + * A simple solution to iterating over all reference positions over a series of genomic locations. 
+ */ +public class TraverseLociNano extends TraversalEngine,LocusShardDataProvider> { + /** our log, which we want to capture anything from this class */ + private static final boolean DEBUG = false; + + final NanoScheduler nanoScheduler; + + public TraverseLociNano(int nThreads) { + nanoScheduler = new NanoScheduler(nThreads); + nanoScheduler.setProgressFunction(new TraverseLociProgress()); + } + + @Override + public final String getTraversalUnits() { + return "sites"; + } + + protected static class TraverseResults { + final int numIterations; + final T reduceResult; + + public TraverseResults(int numIterations, T reduceResult) { + this.numIterations = numIterations; + this.reduceResult = reduceResult; + } + } + + @Override + public T traverse( LocusWalker walker, + LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = getLocusView( walker, dataProvider ); + + if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all + //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView = null; + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); + else + referenceOrderedDataView = (RodLocusView)locusView; + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + + final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); + sum = result.reduceResult; + dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); + } + + // We have a final map call to execute here to clean up the skipped based from the + // last position in the ROD to that in the interval + if ( WalkerManager.getWalkerDataSource(walker) == 
DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { + // only do this if the walker isn't done! + final RodLocusView rodLocusView = (RodLocusView)locusView; + final long nSkipped = rodLocusView.getLastSkippedBases(); + if ( nSkipped > 0 ) { + final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); + final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); + final M x = walker.map(null, null, ac); + sum = walker.reduce(x, sum); + } + } + + return sum; + } + + /** + * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' + * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype + * that comes along. + * @param walker walker to interrogate. + * @param dataProvider Data which which to drive the locus view. + * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. + */ + private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + if( dataSource == DataSource.READS ) + return new CoveredLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE ) //|| ! 
GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) + return new AllLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) + return new RodLocusView(dataProvider); + else + throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); + } + + protected TraverseResults traverse(final LocusWalker walker, + final LocusView locusView, + final LocusReferenceView referenceView, + final ReferenceOrderedView referenceOrderedDataView, + final T sum) { + nanoScheduler.setDebug(DEBUG); + final TraverseLociMap myMap = new TraverseLociMap(walker); + final TraverseLociReduce myReduce = new TraverseLociReduce(walker); + + final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); + final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); + + return new TraverseResults(inputIterator.numIterations, result); + } + + /** + * Create iterator that provides inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + */ + private class MapDataIterator implements Iterator { + final LocusView locusView; + final LocusReferenceView referenceView; + final ReferenceOrderedView referenceOrderedDataView; + int numIterations = 0; + + private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) { + this.locusView = locusView; + this.referenceView = referenceView; + this.referenceOrderedDataView = referenceOrderedDataView; + } + + @Override + public boolean hasNext() { + return locusView.hasNext() && ! engine.exceedsRuntimeLimit(); + } + + @Override + public MapData next() { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + //logger.info("Pulling data from MapDataIterator at " + location); + + // create reference context. 
Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location); + + numIterations++; + return new MapData(locus, refContext, tracker); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator"); + } + } + + @Override + public void shutdown() { + nanoScheduler.shutdown(); + } + + /** + * The input data needed for each map call. The read, the reference, and the RODs + */ + private class MapData { + final AlignmentContext alignmentContext; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.alignmentContext = alignmentContext; + this.refContext = refContext; + this.tracker = tracker; + } + + @Override + public String toString() { + return "MapData " + alignmentContext.getLocation(); + } + } + + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * 
MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseLociMap implements NSMapFunction { + final LocusWalker walker; + + private TraverseLociMap(LocusWalker walker) { + this.walker = walker; + } + + @Override + public MapResult apply(final MapData data) { + if ( ! walker.isDone() ) { + final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext); + if (keepMeP) { + final M x = walker.map(data.tracker, data.refContext, data.alignmentContext); + return new MapResult(x); + } + } + return SKIP_REDUCE; + } + } + + /** + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseLociReduce implements NSReduceFunction { + final LocusWalker walker; + + private TraverseLociReduce(LocusWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; + } + } + + private class TraverseLociProgress implements NSProgressFunction { + @Override + public void progress(MapData lastProcessedMap) { + if (lastProcessedMap.alignmentContext != null) + printProgress(lastProcessedMap.alignmentContext.getLocation()); + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadPairs.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadPairs.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadPairs.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadPairs.java diff --git 
a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java new file mode 100644 index 000000000..e392041f0 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java @@ -0,0 +1,256 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.traversals; + +import htsjdk.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.datasources.providers.ReadBasedReferenceOrderedView; +import org.broadinstitute.gatk.engine.datasources.providers.ReadReferenceView; +import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.providers.ReadView; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.ReadWalker; +import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; +import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.Iterator; +import java.util.LinkedList; + +/** + * A nano-scheduling version of TraverseReads. + * + * Implements the traversal of a walker that accepts individual reads, the reference, and + * RODs per map call. 
Directly supports shared memory parallelism via NanoScheduler + * + * @author depristo + * @version 1.0 + * @date 9/2/2012 + */ +public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { + /** our log, which we want to capture anything from this class */ + private final static boolean PRE_READ_ALL_MAP_DATA = true; + protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); + private static final boolean DEBUG = false; + final NanoScheduler nanoScheduler; + + public TraverseReadsNano(int nThreads) { + nanoScheduler = new NanoScheduler(nThreads); + nanoScheduler.setProgressFunction(new NSProgressFunction() { + @Override + public void progress(MapData lastProcessedMap) { + if ( lastProcessedMap.refContext != null ) + // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon + printProgress(lastProcessedMap.refContext.getLocus().getStopLocation()); + } + }); + } + + @Override + public String getTraversalUnits() { + return "reads"; + } + + /** + * Traverse by reads, given the data and the walker + * + * @param walker the walker to traverse with + * @param dataProvider the provider of the reads data + * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function + * @return the reduce variable of the read walker + */ + public T traverse(ReadWalker walker, + ReadShardDataProvider dataProvider, + T sum) { + if ( logger.isDebugEnabled() ) + logger.debug(String.format("TraverseReadsNano.traverse Covered dataset is %s", dataProvider)); + + if( !dataProvider.hasReads() ) + throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); + + nanoScheduler.setDebug(DEBUG); + final TraverseReadsMap myMap = new TraverseReadsMap(walker); + final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); + + final Iterator aggregatedInputs = aggregateMapData(dataProvider); + final T result = nanoScheduler.execute(aggregatedInputs, myMap, 
sum, myReduce); + + return result; + } + + /** + * Aggregate all of the inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + * + * @param dataProvider the source of our data + * @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce + * should execute + */ + private Iterator aggregateMapData(final ReadShardDataProvider dataProvider) { + final Iterator it = makeDataIterator(dataProvider); + if ( PRE_READ_ALL_MAP_DATA ) { + final LinkedList l = new LinkedList(); + while ( it.hasNext() ) l.add(it.next()); + return l.iterator(); + } else { + return it; + } + } + + + private Iterator makeDataIterator(final ReadShardDataProvider dataProvider) { + return new Iterator () { + final ReadView reads = new ReadView(dataProvider); + final ReadReferenceView reference = new ReadReferenceView(dataProvider); + final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); + final Iterator readIterator = reads.iterator(); + + @Override public boolean hasNext() { return ! engine.exceedsRuntimeLimit() && readIterator.hasNext(); } + + @Override + public MapData next() { + final SAMRecord read = readIterator.next(); + final ReferenceContext refContext = ! read.getReadUnmappedFlag() + ? reference.getReferenceContext(read) + : null; + + // if the read is mapped, create a metadata tracker + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 + ? rodView.getReferenceOrderedDataForRead(read) + : null; + + // update the number of reads we've seen + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + return new MapData((GATKSAMRecord)read, refContext, tracker); + } + + @Override public void remove() { + throw new UnsupportedOperationException("Remove not supported"); + } + }; + } + + @Override + public void shutdown() { + nanoScheduler.shutdown(); + } + + /** + * The input data needed for each map call. 
The read, the reference, and the RODs + */ + private class MapData { + final GATKSAMRecord read; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.read = read; + this.refContext = refContext; + this.tracker = tracker; + } + } + + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseReadsMap implements NSMapFunction { + final ReadWalker walker; + + private TraverseReadsMap(ReadWalker walker) { + this.walker = walker; + } + + @Override + public MapResult apply(final MapData data) { + if ( ! 
walker.isDone() ) { + final boolean keepMeP = walker.filter(data.refContext, data.read); + if (keepMeP) + return new MapResult(walker.map(data.refContext, data.read, data.tracker)); + } + + return SKIP_REDUCE; + } + } + + /** + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseReadsReduce implements NSReduceFunction { + final ReadWalker walker; + + private TraverseReadsReduce(ReadWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/package-info.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/package-info.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/traversals/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionTraversalParameters.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionTraversalParameters.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionTraversalParameters.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionTraversalParameters.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java new file mode 100644 index 
000000000..eb964c826 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java @@ -0,0 +1,196 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.walkers; + +import com.google.java.contract.Ensures; +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.engine.filters.*; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.interval.IntervalSetRule; +import org.broadinstitute.gatk.utils.interval.IntervalUtils; + +import java.io.PrintStream; +import java.util.*; + +/** + * Base class for all the Active Region Walkers. 
+ * User: rpoplin + * Date: 12/7/11 + */ + +@By(DataSource.READS) +@Requires({DataSource.READS, DataSource.REFERENCE}) +@PartitionBy(PartitionType.READ) +@ActiveRegionTraversalParameters(extension=50,maxRegion=1500) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) +@Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 1000) +@RemoveProgramRecords +public abstract class ActiveRegionWalker extends Walker { + /** + * If provided, this walker will write out its activity profile (per bp probabilities of being active) + * to this file in the IGV formatted TAB deliminated output: + * + * http://www.broadinstitute.org/software/igv/IGV + * + * Intended to make debugging the activity profile calculations easier + */ + @Output(fullName="activityProfileOut", shortName="APO", doc="Output the raw activity profile results in IGV format", required = false, defaultToStdout = false) + public PrintStream activityProfileOutStream = null; + + /** + * If provided, this walker will write out its active and inactive regions + * to this file in the IGV formatted TAB deliminated output: + * + * http://www.broadinstitute.org/software/igv/IGV + * + * Intended to make debugging the active region calculations easier + */ + @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this IGV formatted file", required = false, defaultToStdout = false) + public PrintStream activeRegionOutStream = null; + + @Advanced + @Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false) + protected List> activeRegionBindings = null; + + @Advanced + @Argument(fullName="activeRegionExtension", shortName="activeRegionExtension", doc="The active region extension; if not provided defaults to Walker annotated default", required = false) + public Integer activeRegionExtension = null; + 
+ /** + * For the active region walker to treat all bases as active. Useful for debugging when you want to force something like + * the HaplotypeCaller to process a specific interval you provide the GATK + */ + @Advanced + @Argument(fullName="forceActive", shortName="forceActive", doc="If provided, all bases will be tagged as active", required = false) + public boolean forceActive = false; + + @Advanced + @Argument(fullName="activeRegionMaxSize", shortName="activeRegionMaxSize", doc="The active region maximum size; if not provided defaults to Walker annotated default", required = false) + public Integer activeRegionMaxSize = null; + + @Advanced + @Argument(fullName="bandPassSigma", shortName="bandPassSigma", doc="The sigma of the band pass filter Gaussian kernel; if not provided defaults to Walker annotated default", required = false) + public Double bandPassSigma = null; + + /* + * For active region limits in ActivityProfile +* */ + @Hidden + @Argument(fullName = "maxProbPropagationDistance", shortName = "maxProbPropDist", minValue = 0, doc="Region probability propagation distance beyond it's maximum size.", required = false) + public Integer maxProbPropagationDistance = 50; + + @Advanced + @Argument(fullName = "activeProbabilityThreshold", shortName = "ActProbThresh", minValue = 0.0, maxValue = 1.0, doc="Threshold for the probability of a profile state being active.", required = false) + public Double activeProbThreshold = 0.002; + + private GenomeLocSortedSet presetActiveRegions = null; + + @Override + public void initialize() { + if( activeRegionBindings == null ) { return; } + List allIntervals = new ArrayList(0); + for ( IntervalBinding intervalBinding : activeRegionBindings ) { + List intervals = intervalBinding.getIntervals(this.getToolkit().getGenomeLocParser()); + + if ( intervals.isEmpty() ) { + logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); + } + + allIntervals = 
IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, IntervalSetRule.UNION); + } + + presetActiveRegions = IntervalUtils.sortAndMergeIntervals(this.getToolkit().getGenomeLocParser(), allIntervals, IntervalMergingRule.ALL); + } + + /** + * Does this walker want us to use a set of preset action regions instead of dynamically using the result of isActive? + * @return true if yes, false if no + */ + public boolean hasPresetActiveRegions() { + return presetActiveRegions != null; + } + + /** + * Get the set of preset active regions, or null if none were provided + * @return a set of genome locs specifying fixed active regions requested by the walker, or null if none exist + */ + public GenomeLocSortedSet getPresetActiveRegions() { + return presetActiveRegions; + } + + // Do we actually want to operate on the context? + public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + return true; // We are keeping all the reads + } + + public EnumSet desiredReadStates() { + return EnumSet.of(ActiveRegionReadState.PRIMARY); + } + + public final boolean wantsNonPrimaryReads() { + return desiredReadStates().contains(ActiveRegionReadState.NONPRIMARY); + } + + public boolean wantsExtendedReads() { + return desiredReadStates().contains(ActiveRegionReadState.EXTENDED); + } + + public boolean wantsUnmappedReads() { + return desiredReadStates().contains(ActiveRegionReadState.UNMAPPED); + } + + // Determine probability of active status over the AlignmentContext + @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) + public abstract ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); + + // Map over the ActiveRegion + public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker); + + public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser 
genomeLocParser, IndexedFastaSequenceFile reference ) { + final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionTraversalParameters.class).extension(); + final List allIntervals = new ArrayList(); + for( final GenomeLoc interval : intervals.toList() ) { + final int start = Math.max( 1, interval.getStart() - activeRegionExtension ); + final int stop = Math.min( reference.getSequenceDictionary().getSequence(interval.getContig()).getSequenceLength(), interval.getStop() + activeRegionExtension ); + allIntervals.add( genomeLocParser.createGenomeLoc(interval.getContig(), start, stop) ); + } + return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL); + } + + +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Allows.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Allows.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Allows.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Allows.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Attribution.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Attribution.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Attribution.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Attribution.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/BAQMode.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/BAQMode.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/BAQMode.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/BAQMode.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/By.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/By.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/By.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/By.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/DataSource.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DataSource.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/DataSource.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DataSource.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java new file mode 100644 index 000000000..f85123ab6 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java @@ -0,0 +1,47 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; + +import java.lang.annotation.*; + +/** + * Specifies a method for downsampling the reads passed to a given + * walker based on the input from that walker. + * + * @author hanna + * @version 0.1 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface Downsample { + DownsampleType by(); + int toCoverage() default -1; + double toFraction() default -1.0F; +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java new file mode 100644 index 000000000..42398ec33 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java @@ -0,0 +1,57 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission 
notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.engine.filters.NotPrimaryAlignmentFilter; +import org.broadinstitute.gatk.engine.filters.UnmappedReadFilter; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.List; +import java.util.Set; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +@Requires({DataSource.READS,DataSource.REFERENCE}) +@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class}) +public abstract class DuplicateWalker extends Walker { + // Do we actually want to operate on the context? 
+ public boolean filter(GenomeLoc loc, AlignmentContext context, Set> readSets ) { + return true; // We are keeping all the reads + } + + public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set> readSets ); + + // Given result of map function + public abstract ReduceType reduceInit(); + public abstract ReduceType reduce(MapType value, ReduceType sum); +} \ No newline at end of file diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/FailMethod.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/FailMethod.java new file mode 100644 index 000000000..3f8862975 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/FailMethod.java @@ -0,0 +1,63 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.walkers; + +import htsjdk.samtools.SAMException; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +public enum FailMethod { + MAP, + REDUCE, + TREE_REDUCE; + + /** + * Used by QC walkers to test that engine throws appropriate errors. + * Split from the walker in ErrorThrowing.java. + * @param exceptionToThrow Exception type to throw. + */ + public static void fail(final String exceptionToThrow) { + switch (exceptionToThrow) { + case "UserException": + throw new UserException("UserException"); + case "NullPointerException": + throw new NullPointerException(); + case "ReviewedGATKException": + throw new ReviewedGATKException("ReviewedGATKException"); + case "SamError1": + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); + case "SamError2": + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); + case "NoSpace1": + throw new htsjdk.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + case "NoSpace2": + throw new SAMException("Exception writing BAM index file", new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + default: + throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); + } + } +} diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java new file mode 100644 index 000000000..3c6268de3 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java @@ -0,0 +1,58 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of 
charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.engine.filters.DuplicateReadFilter; +import org.broadinstitute.gatk.engine.filters.FailsVendorQualityCheckFilter; +import org.broadinstitute.gatk.engine.filters.NotPrimaryAlignmentFilter; +import org.broadinstitute.gatk.engine.filters.UnmappedReadFilter; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. 
+ */ +@By(DataSource.READS) +@Requires({DataSource.READS,DataSource.REFERENCE}) +@PartitionBy(PartitionType.LOCUS) +@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) +@Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 1000) +@RemoveProgramRecords +public abstract class LocusWalker extends Walker { + // Do we actually want to operate on the context? + public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return true; // We are keeping all the reads + } + + // Map over the org.broadinstitute.gatk.engine.contexts.AlignmentContext + public abstract MapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context); +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/NanoSchedulable.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/NanoSchedulable.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/NanoSchedulable.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/NanoSchedulable.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionBy.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionBy.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionBy.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionBy.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionType.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionType.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionType.java rename to 
public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/PartitionType.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RMD.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RMD.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RMD.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RMD.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadFilters.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadFilters.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadFilters.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadFilters.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadPairWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadPairWalker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadPairWalker.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadPairWalker.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java new file mode 100644 index 000000000..4ace98d61 --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java @@ -0,0 +1,55 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation 
the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 2:52:28 PM + * To change this template use File | Settings | File Templates. + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +@PartitionBy(PartitionType.READ) +public abstract class ReadWalker extends Walker { + public boolean requiresOrderedReads() { return false; } + + // Do we actually want to operate on the context? + /** Must return true for reads that need to be processed. Reads, for which this method return false will + * be skipped by the engine and never passed to the walker. 
+ */ + public boolean filter(ReferenceContext ref, GATKSAMRecord read) { + // We are keeping all the reads + return true; + } + + // Map over the org.broadinstitute.gatk.engine.contexts.AlignmentContext + public abstract MapType map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker); +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RefWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RefWalker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RefWalker.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RefWalker.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Reference.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Reference.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Reference.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Reference.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RemoveProgramRecords.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RemoveProgramRecords.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RemoveProgramRecords.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RemoveProgramRecords.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Requires.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Requires.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Requires.java rename to 
public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Requires.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RodWalker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RodWalker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/RodWalker.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/RodWalker.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/TreeReducible.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/TreeReducible.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/TreeReducible.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/TreeReducible.java diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java new file mode 100644 index 000000000..a295bb20f --- /dev/null +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java @@ -0,0 +1,177 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import htsjdk.samtools.SAMSequenceDictionary; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.engine.filters.MalformedReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.samples.Sample; +import org.broadinstitute.gatk.engine.samples.SampleDB; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.baq.BAQ; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; +import org.broadinstitute.gatk.engine.recalibration.BQSRMode; + +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: hanna + * Date: Mar 17, 2009 + * Time: 1:53:31 PM + * To change this template use File | Settings | File Templates. 
+ */ +@ReadFilters(MalformedReadFilter.class) +@PartitionBy(PartitionType.NONE) +@Downsample(by = DownsampleType.NONE) +@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) +@BQSRMode(ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) +@DocumentedGATKFeature(groupName = "Uncategorized", extraDocs = {CommandLineGATK.class}) +public abstract class Walker { + final protected static Logger logger = Logger.getLogger(Walker.class); + private GenomeAnalysisEngine toolkit; + + protected Walker() { + } + + /** + * Set the toolkit, for peering into internal structures that can't + * otherwise be read. + * @param toolkit The genome analysis toolkit. + */ + public void setToolkit(GenomeAnalysisEngine toolkit) { + this.toolkit = toolkit; + } + + /** + * Retrieve the toolkit, for peering into internal structures that can't + * otherwise be read. Use sparingly, and discuss uses with software engineering + * team. + * @return The genome analysis toolkit. + */ + protected GenomeAnalysisEngine getToolkit() { + return toolkit; + } + + /** + * Gets the master sequence dictionary for this walker + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return + */ + protected SAMSequenceDictionary getMasterSequenceDictionary() { + return getToolkit().getMasterSequenceDictionary(); + } + + public SampleDB getSampleDB() { + return getToolkit().getSampleDB(); + } + + protected Sample getSample(final String id) { + return getToolkit().getSampleDB().getSample(id); + } + + /** + * (conceptual static) method that states whether you want to see reads piling up at a locus + * that contain a deletion at the locus. + * + * ref: ATCTGA + * read1: ATCTGA + * read2: AT--GA + * + * Normally, the locus iterator only returns a list of read1 at this locus at position 3, but + * if this function returns true, then the system will return (read1, read2) with offsets + * of (3, -1). The -1 offset indicates a deletion in the read. 
+ * + * @return false if you don't want to see deletions, or true if you do + */ + public boolean includeReadsWithDeletionAtLoci() { + return false; + } + + public void initialize() { } + + /** + * A function for overloading in subclasses providing a mechanism to abort early from a walker. + * + * If this ever returns true, then the Traversal engine will stop executing map calls + * and start the process of shutting down the walker in an orderly fashion. + * @return + */ + public boolean isDone() { + return false; + } + + /** + * Provide an initial value for reduce computations. + * @return Initial value of reduce. + */ + public abstract ReduceType reduceInit(); + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public abstract ReduceType reduce(MapType value, ReduceType sum); + + public void onTraversalDone(ReduceType result) { + logger.info("[REDUCE RESULT] Traversal result is: " + result); + } + + /** + * General interval reduce routine called after all of the traversals are done + * @param results interval reduce results + */ + public void onTraversalDone(List> results) { + for ( Pair result : results ) { + logger.info(String.format("[INTERVAL REDUCE RESULT] at %s ", result.getFirst())); + this.onTraversalDone(result.getSecond()); + } + } + + /** + * Return true if your walker wants to reduce each interval separately. Default is false. + * + * If you set this flag, several things will happen. + * + * The system will invoke reduceInit() once for each interval being processed, starting a fresh reduce + * Reduce will accumulate normally at each map unit in the interval + * However, onTraversalDone(reduce) will be called after each interval is processed. 
+ * The system will call onTraversalDone( GenomeLoc -> reduce ), after all reductions are done, + * which is overloaded here to call onTraversalDone(reduce) for each location + * + * @return true if your walker wants to reduce each interval separately. + */ + public boolean isReduceByInterval() { + return false; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/WalkerName.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/WalkerName.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/WalkerName.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/WalkerName.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Window.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Window.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Window.java rename to public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/walkers/Window.java diff --git a/public/gatk-tools-public/src/main/resources/GATK_public.key b/public/gatk-engine/src/main/resources/GATK_public.key similarity index 100% rename from public/gatk-tools-public/src/main/resources/GATK_public.key rename to public/gatk-engine/src/main/resources/GATK_public.key diff --git a/public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_access.key b/public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_access.key similarity index 100% rename from public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_access.key rename to public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_access.key diff --git 
a/public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_secret.key b/public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_secret.key similarity index 100% rename from public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_secret.key rename to public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/phonehome/resources/GATK_AWS_secret.key diff --git a/public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/utils/recalibration/BQSR.R b/public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/recalibration/BQSR.R similarity index 100% rename from public/gatk-tools-public/src/main/resources/org/broadinstitute/gatk/utils/recalibration/BQSR.R rename to public/gatk-engine/src/main/resources/org/broadinstitute/gatk/engine/recalibration/BQSR.R diff --git a/public/gatk-tools-public/src/test/java/htsjdk/samtools/GATKBAMFileSpanUnitTest.java b/public/gatk-engine/src/test/java/htsjdk/samtools/GATKBAMFileSpanUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/htsjdk/samtools/GATKBAMFileSpanUnitTest.java rename to public/gatk-engine/src/test/java/htsjdk/samtools/GATKBAMFileSpanUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/htsjdk/samtools/GATKChunkUnitTest.java b/public/gatk-engine/src/test/java/htsjdk/samtools/GATKChunkUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/htsjdk/samtools/GATKChunkUnitTest.java rename to public/gatk-engine/src/test/java/htsjdk/samtools/GATKChunkUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/CommandLineGATKUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/CommandLineGATKUnitTest.java similarity index 100% rename from 
public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/CommandLineGATKUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/CommandLineGATKUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/EngineFeaturesIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/EngineFeaturesIntegrationTest.java new file mode 100644 index 000000000..65605fec3 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/EngineFeaturesIntegrationTest.java @@ -0,0 +1,734 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.samtools.*; +import htsjdk.tribble.readers.LineIterator; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.filters.MappingQualityUnavailableFilter; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.GATKSamRecordFactory; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCodec; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import org.broadinstitute.gatk.utils.variant.VCIterable; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.*; +import java.util.*; + +/** + * + */ +public class EngineFeaturesIntegrationTest extends WalkerTest { + private void testBadRODBindingInput(String type, String name, Class c) { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -L 1:1 --variant:variant," + type + " " + + b37dbSNP132 + " -R " + b37KGReference + " -o %s", + 1, c); + executeTest(name, spec); + } + + @Test() private void testBadRODBindingInputType1() { + testBadRODBindingInput("beagle", "BEAGLE input to VCF expecting walker", UserException.BadArgumentValue.class); + } + + @Test() private void testBadRODBindingInputType3() { + testBadRODBindingInput("bed", "Bed input to VCF expecting walker", UserException.BadArgumentValue.class); + } + + @Test() private void testBadRODBindingInputTypeUnknownType() { + testBadRODBindingInput("bedXXX", 
"Unknown input to VCF expecting walker", UserException.UnknownTribbleType.class); + } + + private void testMissingFile(String name, String missingBinding) { + WalkerTestSpec spec = new WalkerTestSpec(missingBinding + " -R " + b37KGReference + " -o %s", + 1, UserException.CouldNotReadInputFile.class); + executeTest(name, spec); + } + + @Test() private void testMissingBAMnt1() { + testMissingFile("missing BAM", "-T TestPrintReadsWalker -I missing.bam -nt 1"); + } + @Test() private void testMissingBAMnt4() { + testMissingFile("missing BAM", "-T TestPrintReadsWalker -I missing.bam -nt 4"); + } + @Test() private void testMissingVCF() { + testMissingFile("missing VCF", "-T TestPrintVariantsWalker -V missing.vcf"); + } + @Test() private void testMissingInterval() { + testMissingFile("missing interval", "-T TestPrintReadsWalker -L missing.interval_list -I " + b37GoodBAM); + } + + + // -------------------------------------------------------------------------------- + // + // Test that our exceptions are coming back as we expect + // + // -------------------------------------------------------------------------------- + + private class EngineErrorHandlingTestProvider extends TestDataProvider { + final Class expectedException; + final String args; + final int iterationsToTest; + + public EngineErrorHandlingTestProvider(Class exceptedException, final String args) { + super(EngineErrorHandlingTestProvider.class); + this.expectedException = exceptedException; + this.args = args; + this.iterationsToTest = args.equals("") ? 
1 : 10; + setName(String.format("Engine error handling: expected %s with args %s", exceptedException, args)); + } + } + + @DataProvider(name = "EngineErrorHandlingTestProvider") + public Object[][] makeEngineErrorHandlingTestProvider() { + for ( final FailMethod failMethod : FailMethod.values() ) { + if ( failMethod == FailMethod.TREE_REDUCE ) + continue; // cannot reliably throw errors in TREE_REDUCE + + final String failArg = " -fail " + failMethod.name(); + for ( final String args : Arrays.asList("", " -nt 2", " -nct 2") ) { + new EngineErrorHandlingTestProvider(NullPointerException.class, failArg + args); + new EngineErrorHandlingTestProvider(UserException.class, failArg + args); + new EngineErrorHandlingTestProvider(ReviewedGATKException.class, failArg + args); + } + } + + return EngineErrorHandlingTestProvider.getTests(EngineErrorHandlingTestProvider.class); + } + + // + // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type + // + @Test(enabled = true, dataProvider = "EngineErrorHandlingTestProvider", timeOut = 60 * 1000 ) + public void testEngineErrorHandlingTestProvider(final EngineErrorHandlingTestProvider cfg) { + for ( int i = 0; i < cfg.iterationsToTest; i++ ) { + final String root = "-T TestErrorThrowingWalker -R " + exampleFASTA; + final String args = root + cfg.args + " -E " + cfg.expectedException.getSimpleName(); + WalkerTestSpec spec = new WalkerTestSpec(args, 0, cfg.expectedException); + + executeTest(cfg.toString(), spec); + } + } + + // -------------------------------------------------------------------------------- + // + // Test that read filters are being applied in the order we expect + // + // -------------------------------------------------------------------------------- + + @ReadFilters({MappingQualityUnavailableFilter.class}) + public static class DummyReadWalkerWithMapqUnavailableFilter extends ReadWalker { + @Output + PrintStream out; + + @Override + public Integer 
map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + return 1; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + @Override + public void onTraversalDone(Integer result) { + out.println(result); + } + } + + @Test(enabled = true) + public void testUserReadFilterAppliedBeforeWalker() { + WalkerTestSpec spec = new WalkerTestSpec("-R " + b37KGReference + " -I " + privateTestDir + "allMAPQ255.bam" + + " -T DummyReadWalkerWithMapqUnavailableFilter -o %s -L MT -rf ReassignMappingQuality", + 1, Arrays.asList("ecf27a776cdfc771defab1c5d19de9ab")); + executeTest("testUserReadFilterAppliedBeforeWalker", spec); + } + + @Test + public void testNegativeCompress() { + testBadCompressArgument(-1); + } + + @Test + public void testTooBigCompress() { + testBadCompressArgument(100); + } + + private void testBadCompressArgument(final int compress) { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintReadsWalker -R " + b37KGReference + " -I " + privateTestDir + "NA12878.1_10mb_2_10mb.bam -o %s -compress " + compress, + 1, UserException.class); + executeTest("badCompress " + compress, spec); + } + + // -------------------------------------------------------------------------------- + // + // Test that the VCF version key is what we expect + // + // -------------------------------------------------------------------------------- + @Test(enabled = true) + public void testGATKVersionInVCF() throws Exception { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference + + " -V " + privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf" + + " -o %s -L 20:61098", + 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File vcf = executeTest("testGATKVersionInVCF", spec).first.get(0); + final VCFCodec codec = new VCFCodec(); + final VCFHeader header = (VCFHeader) 
codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf))); + final VCFHeaderLine versionLine = header.getMetaDataLine(GATKVCFUtils.GATK_COMMAND_LINE_KEY); + Assert.assertNotNull(versionLine); + Assert.assertTrue(versionLine.toString().contains("TestPrintVariantsWalker")); + } + + @Test(enabled = true) + public void testMultipleGATKVersionsInVCF() throws Exception { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference + + " -V " + privateTestDir + "gatkCommandLineInHeader.vcf" + + " -o %s", + 1, Arrays.asList("")); + spec.disableShadowBCF(); + final File vcf = executeTest("testMultipleGATKVersionsInVCF", spec).first.get(0); + final VCFCodec codec = new VCFCodec(); + final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf))); + + boolean foundHC = false; + boolean foundPV = false; + for ( final VCFHeaderLine line : header.getMetaDataInInputOrder() ) { + if ( line.getKey().equals(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) { + if ( line.toString().contains("HaplotypeCaller") ) { + Assert.assertFalse(foundHC); + foundHC = true; + } + if ( line.toString().contains("TestPrintVariantsWalker") ) { + Assert.assertFalse(foundPV); + foundPV = true; + } + } + } + + Assert.assertTrue(foundHC, "Didn't find HaplotypeCaller command line header field"); + Assert.assertTrue(foundPV, "Didn't find TestPrintVariantsWalker command line header field"); + } + + // -------------------------------------------------------------------------------- + // + // Test that defaultBaseQualities actually works + // + // -------------------------------------------------------------------------------- + + public WalkerTestSpec testDefaultBaseQualities(final Integer value, final String md5) { + return new WalkerTestSpec("-T TestPrintReadsWalker -R " + b37KGReference + " -I " + privateTestDir + "/baseQualitiesToFix.bam -o %s" + + (value != null ? 
" --defaultBaseQualities " + value : ""), + 1, Arrays.asList(md5)); + } + + @Test() + public void testDefaultBaseQualities20() { + executeTest("testDefaultBaseQualities20", testDefaultBaseQualities(20, "7d254a9d0ec59c66ee3e137f56f4c78f")); + } + + @Test() + public void testDefaultBaseQualities30() { + executeTest("testDefaultBaseQualities30", testDefaultBaseQualities(30, "0f50def6cbbbd8ccd4739e2b3998e503")); + } + + @Test(expectedExceptions = Exception.class) + public void testDefaultBaseQualitiesNoneProvided() { + executeTest("testDefaultBaseQualitiesNoneProvided", testDefaultBaseQualities(null, "")); + } + + // -------------------------------------------------------------------------------- + // + // Test engine-level cigar consolidation + // + // -------------------------------------------------------------------------------- + + @Test + public void testGATKEngineConsolidatesCigars() { + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "zero_length_cigar_elements.bam" + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the cigar + + final File outputBam = executeTest("testGATKEngineConsolidatesCigars", spec).first.get(0); + final SAMFileReader reader = new SAMFileReader(outputBam); + reader.setValidationStringency(ValidationStringency.SILENT); + reader.setSAMRecordFactory(new GATKSamRecordFactory()); + + final SAMRecord read = reader.iterator().next(); + reader.close(); + + // Original cigar was 0M3M0M8M. 
Check that it's been consolidated after running through the GATK engine: + Assert.assertEquals(read.getCigarString(), "11M", "Cigar 0M3M0M8M not consolidated correctly by the engine"); + } + + // -------------------------------------------------------------------------------- + // + // Test on-the-fly sample renaming + // + // -------------------------------------------------------------------------------- + + // On-the-fly sample renaming test case: one single-sample bam with multiple read groups + @Test + public void testOnTheFlySampleRenamingWithSingleBamFile() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithSingleBamFile", spec).first.get(0); + final SAMFileReader reader = new SAMFileReader(outputBam); + + for ( final SAMReadGroupRecord readGroup : reader.getFileHeader().getReadGroups() ) { + Assert.assertEquals(readGroup.getSample(), "myNewSampleName", String.format("Sample for read group %s not renamed correctly", readGroup.getId())); + } + + reader.close(); + } + + // On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam + @Test + public void testOnTheFlySampleRenamingWithMultipleBamFiles() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam newSampleFor12891", + 
privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892")); + + final Map readGroupToNewSampleMap = new HashMap<>(); + for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { + final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); + final SAMFileReader inputBamReader = new SAMFileReader(inputBam); + final String newSampleName = String.format("newSampleFor%s", inputBamID); + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); + } + inputBamReader.close(); + } + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFiles", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()), + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam, + // 
performing renaming in only SOME of the bams + @Test + public void testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename() throws IOException { + // Rename samples for NA12878 and NA12892, but not for NA12891 + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892")); + + final Map readGroupToNewSampleMap = new HashMap<>(); + for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { + final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); + final SAMFileReader inputBamReader = new SAMFileReader(inputBam); + + // Special-case NA12891, which we're not renaming: + final String newSampleName = inputBamID.equals("12891") ? "NA12891" : String.format("newSampleFor%s", inputBamID); + + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); + } + inputBamReader.close(); + } + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + 
Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()), + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: two single-sample bams with read group collisions + @Test + public void testOnTheFlySampleRenamingWithReadGroupCollisions() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", + privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam newSampleForNot12878")); + + final Set na12878ReadGroups = new HashSet<>(); + final SAMFileReader inputBamReader = new SAMFileReader(new File(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam")); + for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { + na12878ReadGroups.add(readGroup.getId()); + } + inputBamReader.close(); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, Arrays.asList("")); // No MD5s; we only want to check the read groups + + final File outputBam = executeTest("testOnTheFlySampleRenamingWithReadGroupCollisions", spec).first.get(0); + final SAMFileReader outputBamReader = new SAMFileReader(outputBam); + + int totalReadGroupsSeen = 0; + for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { + String 
expectedSampleName = ""; + if ( na12878ReadGroups.contains(readGroup.getId()) ) { + expectedSampleName = "newSampleFor12878"; + } + else { + expectedSampleName = "newSampleForNot12878"; + } + + Assert.assertEquals(readGroup.getSample(), expectedSampleName, + String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); + totalReadGroupsSeen++; + } + + Assert.assertEquals(totalReadGroupsSeen, na12878ReadGroups.size() * 2, "Wrong number of read groups encountered in output bam file"); + + outputBamReader.close(); + } + + // On-the-fly sample renaming test case: a multi-sample bam (this should generate a UserException) + @Test + public void testOnTheFlySampleRenamingWithMultiSampleBam() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintReadsWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, + UserException.class); // expecting a UserException here + + executeTest("testOnTheFlySampleRenamingWithMultiSampleBam", spec); + } + + // On-the-fly sample renaming test case: ensure that walkers can see the remapped sample names in individual reads + @Test + public void testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam myNewSampleName")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T OnTheFlySampleRenamingVerifyingTestWalker" + + " -R " + b37KGReference + + " -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " --newSampleName 
myNewSampleName" + + " -L 20:10000000-10001000", + 1, Arrays.asList("")); + + // Test is a success if our custom walker doesn't throw an exception + executeTest("testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads", spec); + } + + @Test + public void testOnTheFlySampleRenamingSingleSampleVCF() throws IOException { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf newSampleForNA12878")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintVariantsWalker" + + " -R " + b37KGReference + + " -V " + privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, + Arrays.asList("")); // No MD5s -- we will inspect the output file manually + + final File outputVCF = executeTest("testOnTheFlySampleRenamingSingleSampleVCF", spec).first.get(0); + verifySampleRenaming(outputVCF, "newSampleForNA12878"); + } + + private void verifySampleRenaming( final File outputVCF, final String newSampleName ) throws IOException { + final Pair> headerAndVCIter = VCIterable.readAllVCs(outputVCF, new VCFCodec()); + final VCFHeader header = headerAndVCIter.getFirst(); + final VCIterable iter = headerAndVCIter.getSecond(); + + // Verify that sample renaming occurred at both the header and record levels (checking only the first 10 records): + + Assert.assertEquals(header.getGenotypeSamples().size(), 1, "Wrong number of samples in output vcf header"); + Assert.assertEquals(header.getGenotypeSamples().get(0), newSampleName, "Wrong sample name in output vcf header"); + + int recordCount = 0; + while ( iter.hasNext() && recordCount < 10 ) { + final VariantContext vcfRecord = iter.next(); + Assert.assertEquals(vcfRecord.getSampleNames().size(), 1, "Wrong number of samples in output vcf record"); + Assert.assertEquals(vcfRecord.getSampleNames().iterator().next(), newSampleName, "Wrong sample name in output vcf 
record"); + recordCount++; + } + } + + @Test + public void testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInVCFRecords() throws Exception { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "samplerenametest_single_sample_gvcf.vcf FOOSAMPLE")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T OnTheFlySampleRenamingVerifyingRodWalker" + + " -R " + hg19Reference + + " -V " + privateTestDir + "samplerenametest_single_sample_gvcf.vcf" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " --expectedSampleName FOOSAMPLE" + + " -o %s", + 1, + Arrays.asList("")); // No MD5s -- custom walker will throw an exception if there's a problem + + executeTest("testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInVCFRecords", spec); + } + + @Test + public void testOnTheFlySampleRenamingMultiSampleVCF() throws Exception { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "vcf/vcfWithGenotypes.vcf badSample")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintVariantsWalker" + + " -R " + b37KGReference + + " -V " + privateTestDir + "vcf/vcfWithGenotypes.vcf" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, + UserException.class); // expecting a UserException here + + executeTest("testOnTheFlySampleRenamingMultiSampleVCF", spec); + } + + @Test + public void testOnTheFlySampleRenamingSitesOnlyVCF() throws Exception { + final File sampleRenameMapFile = createTestSampleRenameMapFile( + Arrays.asList(privateTestDir + "vcf/vcfWithoutGenotypes.vcf badSample")); + + final WalkerTestSpec spec = new WalkerTestSpec(" -T TestPrintVariantsWalker" + + " -R " + b37KGReference + + " -V " + privateTestDir + "vcf/vcfWithoutGenotypes.vcf" + + " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + + " -o %s", + 1, + UserException.class); // expecting a UserException here + + 
executeTest("testOnTheFlySampleRenamingSitesOnlyVCF", spec); + } + + private File createTestSampleRenameMapFile( final List contents ) throws IOException { + final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); + final PrintWriter writer = new PrintWriter(mapFile); + + for ( final String line : contents ) { + writer.println(line); + } + writer.close(); + + return mapFile; + } + + public static class OnTheFlySampleRenamingVerifyingTestWalker extends ReadWalker { + @Argument(fullName = "newSampleName", shortName = "newSampleName", doc = "", required = true) + String newSampleName = null; + + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + if ( ! newSampleName.equals(read.getReadGroup().getSample()) ) { + throw new IllegalStateException(String.format("Encountered read with the wrong sample name. Expected %s found %s", + newSampleName, read.getReadGroup().getSample())); + } + + return 1; + } + + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { return value + sum; } + } + + public static class OnTheFlySampleRenamingVerifyingRodWalker extends RodWalker { + @Argument(fullName = "expectedSampleName", shortName = "expectedSampleName", doc = "", required = true) + String expectedSampleName = null; + + @Output + PrintStream out; + + @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) + public RodBinding variants; + + public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { + if ( tracker == null ) { + return 0; + } + + for ( final VariantContext vc : tracker.getValues(variants, context.getLocation()) ) { + if ( vc.getSampleNames().size() != 1 ) { + throw new IllegalStateException("Encountered a vcf record with num samples != 1"); + } + + final String actualSampleName = vc.getSampleNames().iterator().next(); + if ( ! 
expectedSampleName.equals(actualSampleName)) { + throw new IllegalStateException(String.format("Encountered vcf record with wrong sample name. Expected %s found %s", + expectedSampleName, actualSampleName)); + } + } + + return 1; + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer counter, Integer sum) { + return counter + sum; + } + } + + // -------------------------------------------------------------------------------- + // + // Test output file-specific options + // + // -------------------------------------------------------------------------------- + + //Returns the output file + private File testBAMFeatures(final String args, final String md5) { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintReadsWalker -R " + b37KGReference + + " -I " + privateTestDir + "NA20313.highCoverageRegion.bam" + + " --no_pg_tag -o %s " + args, + 1, Arrays.asList(".bam"), Arrays.asList(md5)); + return executeTest("testBAMFeatures: "+args, spec).first.get(0); + } + + @Test + public void testSAMWriterFeatures() { + testBAMFeatures("-compress 0", "bb4b55b1f80423970bb9384cbf0d8793"); + testBAMFeatures("-compress 9", "b85ee1636d62e1bb8ed65a245c307167"); + testBAMFeatures("-simplifyBAM", "38f9c30a27dfbc085a2ff52a1617d579"); + + //Validate MD5 + final String expectedMD5 = "6627b9ea33293a0083983feb94948c1d"; + final File md5Target = testBAMFeatures("--generate_md5", expectedMD5); + final File md5File = new File(md5Target.getAbsoluteFile() + ".md5"); + md5File.deleteOnExit(); + Assert.assertTrue(md5File.exists(), "MD5 wasn't created"); + try { + String md5 = new BufferedReader(new FileReader(md5File)).readLine(); + Assert.assertEquals(md5, expectedMD5, "Generated MD5 doesn't match expected"); + } catch (IOException e) { + Assert.fail("Can't parse MD5 file", e); + } + + //Validate that index isn't created + final String unindexedBAM = testBAMFeatures("--disable_bam_indexing", expectedMD5).getAbsolutePath(); + Assert.assertTrue(!(new 
File(unindexedBAM+".bai").exists()) && + !(new File(unindexedBAM.replace(".bam", ".bai")).exists()), + "BAM index was created even though it was disabled"); + } + + @DataProvider(name = "vcfFeaturesData") + public Object[][] getVCFFeaturesData() { + return new Object[][]{ + {"--sites_only", "94bf1f2c0946e933515e4322323a5716"}, + {"--bcf", "03f2d6988f54a332da48803c78f9c4b3"} + }; + } + + @Test(dataProvider = "vcfFeaturesData") + public void testVCFFeatures(final String args, final String md5) { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference + + " -V " + privateTestDir + "CEUtrioTest.vcf" + + " --no_cmdline_in_header -o %s " + args, + 1, Arrays.asList(md5)); + executeTest("testVCFFeatures: "+args, spec); + } + + @DataProvider(name = "vcfFormatHandlingData") + public Object[][] getVCFFormatHandlingData() { + return new Object[][]{ + {true, "95b6262efbd40b6b72f44f808f3e4c45"}, + {false, "333232e08b8cdd3303309e438c44277f"} + }; + } + + @Test(dataProvider = "vcfFormatHandlingData") + public void testVCFFormatHandling(final boolean writeFullFormat, final String md5) { + WalkerTestSpec spec = new WalkerTestSpec("-T TestPrintVariantsWalker -R " + b37KGReference + + " -V " + privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf" + + " --no_cmdline_in_header -o %s " + + " --fullyDecode " //Without this parameter, the FORMAT fields will be emitted unchanged. Oops + + (writeFullFormat ? "-writeFullFormat" : "") , + 1, Arrays.asList(md5)); + executeTest("testVCFFormatHandling: "+(writeFullFormat ? 
"Untrimmed" : "Trimmed"), spec); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GATKVCFUtilsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GATKVCFUtilsUnitTest.java new file mode 100644 index 000000000..cd8b19bd0 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GATKVCFUtilsUnitTest.java @@ -0,0 +1,138 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.tribble.index.DynamicIndexCreator; +import htsjdk.tribble.index.IndexCreator; +import htsjdk.tribble.index.interval.IntervalIndexCreator; +import htsjdk.tribble.index.linear.LinearIndexCreator; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.RodWalker; +import org.broadinstitute.gatk.engine.walkers.Walker; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.lang.reflect.Method; +import java.util.Arrays; +import java.util.Collections; +import java.util.Set; + +public class GATKVCFUtilsUnitTest extends BaseTest { + public static class VCFHeaderTestWalker extends RodWalker { + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return null; } + public Integer reduceInit() { return 0; } + public Integer reduce(Integer value, Integer sum) { return value + sum; } + } + + public static class VCFHeaderTest2Walker extends VCFHeaderTestWalker {} + + @Test + public void testAddingVCFHeaderInfo() { + final VCFHeader header = new VCFHeader(); + + final Walker walker1 = new VCFHeaderTestWalker(); + final Walker walker2 = new VCFHeaderTest2Walker(); + + final GenomeAnalysisEngine testEngine1 = new GenomeAnalysisEngine(); + testEngine1.setWalker(walker1); + + final GenomeAnalysisEngine testEngine2 = new GenomeAnalysisEngine(); + testEngine2.setWalker(walker2); + + final VCFHeaderLine line1 = GATKVCFUtils.getCommandLineArgumentHeaderLine(testEngine1, Collections.EMPTY_LIST); + logger.warn(line1); + 
Assert.assertNotNull(line1); + Assert.assertEquals(line1.getKey(), GATKVCFUtils.GATK_COMMAND_LINE_KEY); + for ( final String field : Arrays.asList("Version", "ID", "Date", "CommandLineOptions")) + Assert.assertTrue(line1.toString().contains(field), "Couldn't find field " + field + " in " + line1.getValue()); + Assert.assertTrue(line1.toString().contains("ID=" + testEngine1.getWalkerName())); + + final VCFHeaderLine line2 = GATKVCFUtils.getCommandLineArgumentHeaderLine(testEngine2, Collections.EMPTY_LIST); + logger.warn(line2); + + header.addMetaDataLine(line1); + final Set lines1 = header.getMetaDataInInputOrder(); + Assert.assertTrue(lines1.contains(line1)); + + header.addMetaDataLine(line2); + final Set lines2 = header.getMetaDataInInputOrder(); + Assert.assertTrue(lines2.contains(line1)); + Assert.assertTrue(lines2.contains(line2)); + } + + private class IndexCreatorTest extends TestDataProvider { + private final GATKVCFIndexType type; + private final int parameter; + private final Class expectedClass; + private final Integer expectedDimension; + private final Method dimensionGetter; + + private IndexCreatorTest(GATKVCFIndexType type, int parameter, Class expectedClass, Integer expectedDimension, + String dimensionGetterName) { + super(IndexCreatorTest.class); + + this.type = type; + this.parameter = parameter; + this.expectedClass = expectedClass; + this.expectedDimension = expectedDimension; + try { + // Conditional matches testGetIndexCreator's if-statement + this.dimensionGetter = this.expectedDimension == null ? 
null : expectedClass.getDeclaredMethod(dimensionGetterName); + } catch (NoSuchMethodException e) { + throw new RuntimeException(e); + } + } + } + + @DataProvider(name = "indexCreator") + public Object[][] indexCreatorData() { + new IndexCreatorTest(GATKVCFIndexType.DYNAMIC_SEEK, 0, DynamicIndexCreator.class, null, null); + new IndexCreatorTest(GATKVCFIndexType.DYNAMIC_SIZE, 0, DynamicIndexCreator.class, null, null); + new IndexCreatorTest(GATKVCFIndexType.LINEAR, 100, LinearIndexCreator.class, 100, "getBinSize"); + new IndexCreatorTest(GATKVCFIndexType.INTERVAL, 200, IntervalIndexCreator.class, 200, "getFeaturesPerInterval"); + + return IndexCreatorTest.getTests(IndexCreatorTest.class); + } + + @Test(dataProvider = "indexCreator") + public void testGetIndexCreator(IndexCreatorTest spec) throws Exception{ + File dummy = new File(""); + IndexCreator ic = GATKVCFUtils.getIndexCreator(spec.type, spec.parameter, dummy); + Assert.assertEquals(ic.getClass(), spec.expectedClass, "Wrong IndexCreator type"); + if (spec.expectedDimension != null) { + Integer dimension = (int)spec.dimensionGetter.invoke(ic); + Assert.assertEquals(dimension, spec.expectedDimension, "Wrong dimension"); + } + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngineUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngineUnitTest.java new file mode 100644 index 000000000..424083a11 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngineUnitTest.java @@ -0,0 +1,272 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* 
copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.walkers.TestCountReadsWalker; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.*; + +/** + * Tests selected functionality in the GenomeAnalysisEngine class + */ +public class GenomeAnalysisEngineUnitTest extends BaseTest { + + @Test(expectedExceptions=UserException.class) + public void testEmptySamFileListHandling() throws Exception { + 
GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + testEngine.setWalker(new TestCountReadsWalker()); //generalizable to any walker requiring reads + + //supply command line args so validateSuppliedReads() knows whether reads were passed in + GATKArgumentCollection testArgs = new GATKArgumentCollection(); + testArgs.samFiles.add("empty.list"); + testEngine.setArguments(testArgs); + + //represents the empty list of samFiles read in from empty.list by CommandLineExecutable + Collection samFiles = new ArrayList(); + + testEngine.setSAMFileIDs(samFiles); + testEngine.validateSuppliedReads(); + } + + @Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingSingleDuplicate() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingMultipleDuplicates() throws Exception { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleNORG.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleNORG.bam"), new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test(expectedExceptions=UserException.class) + public void testDuplicateSamFileHandlingAbsoluteVsRelativePath() { + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + final File 
relativePathToBAMFile = new File(publicTestDir + "exampleBAM.bam"); + final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); + Collection samFiles = new ArrayList(); + samFiles.add(new SAMReaderID(relativePathToBAMFile, new Tags())); + samFiles.add(new SAMReaderID(absolutePathToBAMFile, new Tags())); + + testEngine.setSAMFileIDs(samFiles); + testEngine.checkForDuplicateSamFiles(); + } + + @Test + public void testEmptyIntervalSetHandling() throws Exception { + GenomeLocParser genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000).getSequenceDictionary()); + + GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + + testEngine.setWalker(new TestCountReadsWalker()); + testEngine.setIntervals(new GenomeLocSortedSet(genomeLocParser)); + + testEngine.validateSuppliedIntervals(); + } + + @Test + public void testLoadWellFormedSampleRenameMapFile() throws IOException { + final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1", + "/foo/bar/second.bam newSample2", + "/foo/bar2/third.bam newSample3", + "/foo/bar2/fourth.bam new sample 4", + "/foo/bar2/fifth.bam new sample 5 ")); + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + + Assert.assertEquals(renameMap.size(), 5, "Sample rename map was wrong size after loading from file"); + + final Iterator expectedResultsIterator = Arrays.asList( + "/foo/bar/first.bam", "newSample1", + "/foo/bar/second.bam", "newSample2", + "/foo/bar2/third.bam", "newSample3", + "/foo/bar2/fourth.bam", "new sample 4", + "/foo/bar2/fifth.bam", "new sample 5" + ).iterator(); + while ( expectedResultsIterator.hasNext() ) { + final String expectedKey = expectedResultsIterator.next(); + final String expectedValue = expectedResultsIterator.next(); + + Assert.assertNotNull(renameMap.get(expectedKey), String.format("Entry for %s not found in sample rename map", 
expectedKey)); + Assert.assertEquals(renameMap.get(expectedKey), expectedValue, "Wrong value in sample rename map for " + expectedKey); + } + } + + @DataProvider(name = "MalformedSampleRenameMapFileDataProvider") + public Object[][] generateMalformedSampleRenameMapFiles() throws IOException { + final List tests = new ArrayList(); + + tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile", + new File("/foo/bar/nonexistent")}); + tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine", + createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath", + createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath", + createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1", + "/path/to/dupe.bam newSample2"))}); + tests.add(new Object[]{"testLoadSampleRenameMapFileTabInSampleName", + createTestSampleRenameMapFile(Arrays.asList("/path/to/stuff.bam some wonky\tsample "))}); + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MalformedSampleRenameMapFileDataProvider", expectedExceptions = UserException.class) + public void testLoadMalformedSampleRenameMapFile( final String testName, final File mapFile ) { + logger.info("Executing test " + testName); + + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + final Map renameMap = engine.loadSampleRenameMap(mapFile); + } + + private File createTestSampleRenameMapFile( final List contents ) throws IOException { + final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); + final PrintWriter writer = new PrintWriter(mapFile); + + for ( final String line : contents ) { + writer.println(line); + } + writer.close(); + + return mapFile; + } + + /////////////////////////////////////////////////// + // Test the ReadTransformer ordering enforcement // + 
/////////////////////////////////////////////////// + + public static class TestReadTransformer extends ReadTransformer { + + private OrderingConstraint orderingConstraint = OrderingConstraint.DO_NOT_CARE; + private boolean enabled; + + protected TestReadTransformer(final OrderingConstraint orderingConstraint) { + this.orderingConstraint = orderingConstraint; + enabled = true; + } + + // need this because PackageUtils will pick up this class as a possible ReadTransformer + protected TestReadTransformer() { + enabled = false; + } + + @Override + public OrderingConstraint getOrderingConstraint() { return orderingConstraint; } + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { return ApplicationTime.HANDLED_IN_WALKER; } + + @Override + public boolean enabled() { return enabled; } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { return read; } + + } + + @DataProvider(name = "ReadTransformerData") + public Object[][] makeReadTransformerData() { + List tests = new ArrayList(); + + for ( final ReadTransformer.OrderingConstraint orderingConstraint1 : ReadTransformer.OrderingConstraint.values() ) { + for ( final ReadTransformer.OrderingConstraint orderingConstraint2 : ReadTransformer.OrderingConstraint.values() ) { + for ( final ReadTransformer.OrderingConstraint orderingConstraint3 : ReadTransformer.OrderingConstraint.values() ) { + tests.add(new Object[]{orderingConstraint1, orderingConstraint2, orderingConstraint3}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "ReadTransformerData") + public void testReadTransformer(final ReadTransformer.OrderingConstraint oc1, final ReadTransformer.OrderingConstraint oc2, final ReadTransformer.OrderingConstraint oc3) { + + final GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); + final List readTransformers = new ArrayList(3); + readTransformers.add(new TestReadTransformer(oc1)); + 
readTransformers.add(new TestReadTransformer(oc2)); + readTransformers.add(new TestReadTransformer(oc3)); + + final boolean shouldThrowException = numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_FIRST, oc1, oc2, oc3) > 1 || + numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_LAST, oc1, oc2, oc3) > 1; + + try { + testEngine.setReadTransformers(readTransformers); + + Assert.assertFalse(shouldThrowException); + Assert.assertEquals(testEngine.getReadTransformers().size(), 3); + + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(2).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); + Assert.assertTrue(testEngine.getReadTransformers().get(0).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); + } catch (UserException.IncompatibleReadFiltersException e) { + Assert.assertTrue(shouldThrowException); + } + } + + private int numWithConstraint(final ReadTransformer.OrderingConstraint target, final ReadTransformer.OrderingConstraint... 
constraints ) { + int count = 0; + for ( final ReadTransformer.OrderingConstraint constraint : constraints ) { + if ( constraint == target ) + count++; + } + return count; + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/InstantiableWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/InstantiableWalker.java new file mode 100644 index 000000000..4c6e35d0c --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/InstantiableWalker.java @@ -0,0 +1,37 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.commandline.Hidden; + +@Hidden +public class InstantiableWalker extends Walker { + // Public constructor will generate instantiable message + public InstantiableWalker() {} + public Long reduceInit() { return 0L; } + public Long reduce(Integer value, Long accum) { return 0L; } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/MaxRuntimeIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/MaxRuntimeIntegrationTest.java new file mode 100644 index 000000000..2d48487e4 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/MaxRuntimeIntegrationTest.java @@ -0,0 +1,151 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.LocusWalker; +import org.broadinstitute.gatk.utils.SimpleTimer; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** + * + */ +public class MaxRuntimeIntegrationTest extends WalkerTest { + public static class SleepingWalker extends LocusWalker { + @Output PrintStream out; + + @Argument(fullName="sleepTime",shortName="sleepTime",doc="x", required=false) + public int sleepTime = 100; + + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + try {Thread.sleep(sleepTime);} catch (InterruptedException e) {}; + return 1; + } + + @Override public Integer reduceInit() { return 0; } + @Override public Integer reduce(Integer value, Integer sum) { return sum + value; } + + @Override + public void onTraversalDone(Integer result) { + out.println(result); + } + } + + private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS); + + private class MaxRuntimeTestProvider extends TestDataProvider { + final long maxRuntime; + final TimeUnit unit; + + public MaxRuntimeTestProvider(final long maxRuntime, final TimeUnit unit) { + super(MaxRuntimeTestProvider.class); + this.maxRuntime = maxRuntime; + this.unit = unit; + 
setName(String.format("Max runtime test : %d of %s", maxRuntime, unit)); + } + + public long expectedMaxRuntimeNano() { + return TimeUnit.NANOSECONDS.convert(maxRuntime, unit) + STARTUP_TIME; + } + } + + @DataProvider(name = "MaxRuntimeProvider") + public Object[][] makeMaxRuntimeProvider() { + for ( final TimeUnit requestedUnits : Arrays.asList(TimeUnit.NANOSECONDS, TimeUnit.MILLISECONDS, TimeUnit.SECONDS, TimeUnit.MINUTES) ) + new MaxRuntimeTestProvider(requestedUnits.convert(30, TimeUnit.SECONDS), requestedUnits); + + return MaxRuntimeTestProvider.getTests(MaxRuntimeTestProvider.class); + } + + // + // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type + // + @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 120 * 1000) + public void testMaxRuntime(final MaxRuntimeTestProvider cfg) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker -R " + hg18Reference + + " -I " + validationDataLocation + "NA12878.WEx.downsampled20x.bam -o /dev/null" + + " -maxRuntime " + cfg.maxRuntime + " -maxRuntimeUnits " + cfg.unit, 0, + Collections.emptyList()); + final SimpleTimer timer = new SimpleTimer().start(); + executeTest("Max runtime " + cfg, spec); + final long actualRuntimeNano = timer.getElapsedTimeNano(); + + Assert.assertTrue(actualRuntimeNano < cfg.expectedMaxRuntimeNano(), + "Actual runtime " + TimeUnit.SECONDS.convert(actualRuntimeNano, TimeUnit.NANOSECONDS) + + " exceeded max. 
tolerated runtime " + TimeUnit.SECONDS.convert(cfg.expectedMaxRuntimeNano(), TimeUnit.NANOSECONDS) + + " given requested runtime " + cfg.maxRuntime + " " + cfg.unit); + } + + @DataProvider(name = "SubshardProvider") + public Object[][] makeSubshardProvider() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{10}); + tests.add(new Object[]{100}); + tests.add(new Object[]{500}); + tests.add(new Object[]{1000}); + tests.add(new Object[]{2000}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "SubshardProvider", timeOut = 120 * 1000) + public void testSubshardTimeout(final int sleepTime) throws Exception { + final int maxRuntime = 5000; + + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SleepingWalker -R " + b37KGReference + + " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam -o %s" + + " -maxRuntime " + maxRuntime + " -maxRuntimeUnits MILLISECONDS -sleepTime " + sleepTime, 1, + Collections.singletonList("")); + final File result = executeTest("Subshard max runtime ", spec).getFirst().get(0); + final int cycle = Integer.valueOf(new BufferedReader(new FileReader(result)).readLine()); + + final int maxCycles = (int)Math.ceil((maxRuntime * 5) / sleepTime); + logger.warn(String.format("Max cycles %d saw %d in file %s with sleepTime %d and maxRuntime %d", maxCycles, cycle, result, sleepTime, maxRuntime)); + Assert.assertTrue(cycle < maxCycles, "Too many cycles seen -- saw " + cycle + " in file " + result + " but max should have been " + maxCycles); + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java new file mode 100644 index 000000000..dcc58ea36 --- /dev/null +++ 
b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java @@ -0,0 +1,370 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.reads.*; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.engine.executive.WindowMaker; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.traversals.*; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.sam.*; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +public class ReadMetricsUnitTest extends BaseTest { + + @Test + public void testReadsSeenDoNotOverflowInt() { + + final ReadMetrics metrics = new 
ReadMetrics(); + + final long moreThanMaxInt = ((long)Integer.MAX_VALUE) + 1L; + + for ( long i = 0L; i < moreThanMaxInt; i++ ) { + metrics.incrementNumReadsSeen(); + } + + Assert.assertEquals(metrics.getNumReadsSeen(), moreThanMaxInt); + Assert.assertTrue(metrics.getNumReadsSeen() > (long) Integer.MAX_VALUE); + + logger.warn(String.format("%d %d %d", Integer.MAX_VALUE, moreThanMaxInt, Long.MAX_VALUE)); + } + + + // Test the accuracy of the read metrics + + private IndexedFastaSequenceFile reference; + private SAMSequenceDictionary dictionary; + private SAMFileHeader header; + private GATKSAMReadGroupRecord readGroup; + private GenomeLocParser genomeLocParser; + private File testBAM; + + private static final int numReadsPerContig = 250000; + private static final List contigs = Arrays.asList("1", "2", "3"); + + @BeforeClass + private void init() throws IOException { + reference = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + dictionary = reference.getSequenceDictionary(); + genomeLocParser = new GenomeLocParser(dictionary); + header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); + header.setSequenceDictionary(dictionary); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + readGroup = new GATKSAMReadGroupRecord(header.getReadGroup("test")); + + final List reads = new ArrayList<>(); + for ( final String contig : contigs ) { + for ( int i = 1; i <= numReadsPerContig; i++ ) { + reads.add(buildSAMRecord("read" + contig + "_" + i, contig, i)); + } + } + + createBAM(reads); + } + + private void createBAM(final List reads) throws IOException { + testBAM = createTempFile("TraverseActiveRegionsUnitTest", ".bam"); + + SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, testBAM); + for (GATKSAMRecord read : reads ) { + out.addAlignment(read); + } + out.close(); + + new File(testBAM.getAbsolutePath().replace(".bam", ".bai")).deleteOnExit(); + new 
File(testBAM.getAbsolutePath() + ".bai").deleteOnExit(); + } + + // copied from LocusViewTemplate + protected GATKSAMRecord buildSAMRecord(final String readName, final String contig, final int alignmentStart) { + GATKSAMRecord record = new GATKSAMRecord(header); + + record.setReadName(readName); + record.setReferenceIndex(dictionary.getSequenceIndex(contig)); + record.setAlignmentStart(alignmentStart); + + record.setCigarString("1M"); + record.setReadString("A"); + record.setBaseQualityString("A"); + record.setReadGroup(readGroup); + + return record; + } + + @Test + public void testCountsFromReadTraversal() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + + final Collection samFiles = new ArrayList<>(); + final SAMReaderID readerID = new SAMReaderID(testBAM, new Tags()); + samFiles.add(readerID); + + final SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + new ArrayList(), + false, (byte)30, false, true, null, IntervalMergingRule.ALL); + + engine.setReadsDataSource(dataSource); + + final TraverseReadsNano traverseReadsNano = new TraverseReadsNano(1); + final DummyReadWalker walker = new DummyReadWalker(); + traverseReadsNano.initialize(engine, walker, null); + + for ( final Shard shard : dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()) ) { + final ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard, engine.getGenomeLocParser(), dataSource.seek(shard), reference, new ArrayList()); + traverseReadsNano.traverse(walker, dataProvider, 0); + dataProvider.close(); + } + + Assert.assertEquals(engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig); + Assert.assertEquals(engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig); + } + + @Test + public void 
testCountsFromLocusTraversal() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + + final Collection samFiles = new ArrayList<>(); + final SAMReaderID readerID = new SAMReaderID(testBAM, new Tags()); + samFiles.add(readerID); + + final SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + new ArrayList(), + false, (byte)30, false, true, null, IntervalMergingRule.ALL); + + engine.setReadsDataSource(dataSource); + final Set samples = ReadUtils.getSAMFileSamples(dataSource.getHeader()); + + final TraverseLociNano traverseLociNano = new TraverseLociNano(1); + final DummyLocusWalker walker = new DummyLocusWalker(); + traverseLociNano.initialize(engine, walker, null); + + for ( final Shard shard : dataSource.createShardIteratorOverAllReads(new LocusShardBalancer()) ) { + final WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples); + for ( WindowMaker.WindowMakerIterator window : windowMaker ) { + final LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList()); + traverseLociNano.traverse(walker, dataProvider, 0); + dataProvider.close(); + } + windowMaker.close(); + } + + //dataSource.close(); + Assert.assertEquals(engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig); + Assert.assertEquals(engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig); + } + + @Test + public void testCountsFromActiveRegionTraversal() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + + final Collection samFiles = new ArrayList<>(); + final SAMReaderID readerID = new 
SAMReaderID(testBAM, new Tags()); + samFiles.add(readerID); + + final SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + new ArrayList(), + false, (byte)30, false, true, null, IntervalMergingRule.ALL); + + engine.setReadsDataSource(dataSource); + final Set samples = ReadUtils.getSAMFileSamples(dataSource.getHeader()); + + final List intervals = new ArrayList<>(contigs.size()); + for ( final String contig : contigs ) + intervals.add(genomeLocParser.createGenomeLoc(contig, 1, numReadsPerContig)); + + final TraverseActiveRegions traverseActiveRegions = new TraverseActiveRegions(); + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + traverseActiveRegions.initialize(engine, walker, null); + + for ( final Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ActiveRegionShardBalancer()) ) { + final WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples); + for ( WindowMaker.WindowMakerIterator window : windowMaker ) { + final LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList()); + traverseActiveRegions.traverse(walker, dataProvider, 0); + dataProvider.close(); + } + windowMaker.close(); + } + + Assert.assertEquals(engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig); + Assert.assertEquals(engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig); + } + + @Test + public void testFilteredCounts() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + + final Collection samFiles = new ArrayList<>(); + final SAMReaderID readerID = new 
SAMReaderID(testBAM, new Tags()); + samFiles.add(readerID); + + final List filters = new ArrayList<>(); + filters.add(new EveryTenthReadFilter()); + + final SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + filters, + new ArrayList(), + false, (byte)30, false, true, null, IntervalMergingRule.ALL); + + engine.setReadsDataSource(dataSource); + + final TraverseReadsNano traverseReadsNano = new TraverseReadsNano(1); + final DummyReadWalker walker = new DummyReadWalker(); + traverseReadsNano.initialize(engine, walker, null); + + for ( final Shard shard : dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()) ) { + final ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard, engine.getGenomeLocParser(), dataSource.seek(shard), reference, new ArrayList()); + traverseReadsNano.traverse(walker, dataProvider, 0); + dataProvider.close(); + } + + Assert.assertEquals((long)engine.getCumulativeMetrics().getCountsByFilter().get(EveryTenthReadFilter.class.getSimpleName()), contigs.size() * numReadsPerContig / 10); + } + + class DummyLocusWalker extends LocusWalker { + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + class DummyReadWalker extends ReadWalker { + @Override + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + class DummyActiveRegionWalker extends ActiveRegionWalker { + @Override + public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext 
ref, AlignmentContext context) { + return new ActivityProfileState(ref.getLocus(), 0.0); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + private final class EveryTenthReadFilter extends ReadFilter { + + private int myCounter = 0; + + @Override + public boolean filterOut(final SAMRecord record) { + if ( ++myCounter == 10 ) { + myCounter = 0; + return true; + } + + return false; + } + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/SampleUtilsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/SampleUtilsUnitTest.java new file mode 100644 index 000000000..7de5f0dbf --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/SampleUtilsUnitTest.java @@ -0,0 +1,49 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.SampleUtils; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +/** + * Testing framework for sample utilities class. + * + * @author gauthier + */ + +public class SampleUtilsUnitTest extends BaseTest { + @Test(expectedExceptions=UserException.class) + public void testBadSampleFiles() throws Exception { + Set sampleFiles = new HashSet(0); + sampleFiles.add(new File("fileNotHere.samples")); + Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/UninstantiableWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/UninstantiableWalker.java new file mode 100644 index 000000000..11a3c3d6d --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/UninstantiableWalker.java @@ -0,0 +1,37 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.commandline.Hidden; + +@Hidden +public class UninstantiableWalker extends Walker { + // Private constructor will generate uninstantiable message + private UninstantiableWalker() {} + public Long reduceInit() { return 0L; } + public Long reduce(Integer value, Long accum) { return 0L; } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/WalkerManagerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/WalkerManagerUnitTest.java new file mode 100644 index 000000000..0a940ef22 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/WalkerManagerUnitTest.java @@ -0,0 +1,61 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine; + +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +/** + * Tests basic functionality of the walker manager. + */ +public class WalkerManagerUnitTest { + private static WalkerManager walkerManager; + + @BeforeClass + public void setUp() { + walkerManager = new WalkerManager(); + } + + @Test + public void testPresentWalker() { + Walker instantiableWalker = walkerManager.createByName("InstantiableWalker"); + Assert.assertEquals(InstantiableWalker.class, instantiableWalker.getClass()); + } + + @Test(expectedExceptions=UserException.class) + public void testAbsentWalker() { + walkerManager.createByName("Missing"); + } + + @Test(expectedExceptions=DynamicClassResolutionException.class) + public void testUninstantiableWalker() { + walkerManager.createByName("UninstantiableWalker"); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/IntervalIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/IntervalIntegrationTest.java new file mode 100644 index 000000000..1229ecfff --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/IntervalIntegrationTest.java @@ -0,0 +1,304 
@@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.arguments; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; + +/** + * Test the GATK core interval parsing mechanism. 
+ */ +public class IntervalIntegrationTest extends WalkerTest { + @Test(enabled = true) + public void testAllImplicitIntervalParsing() { + String md5 = "7821db9e14d4f8e07029ff1959cd5a99"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testAllIntervalsImplicit",spec); + } + +// '-L all' is no longer supported +// @Test(enabled = true) +// public void testAllExplicitIntervalParsing() { +// String md5 = "7821db9e14d4f8e07029ff1959cd5a99"; +// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( +// "-T TestCountLociWalker" + +// " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + +// " -R " + hg18Reference + +// " -L all" + +// " -o %s", +// 1, // just one output file +// Arrays.asList(md5)); +// executeTest("testAllIntervalsExplicit",spec); +// } + + @Test + public void testUnmappedReadInclusion() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker" + + " -I " + validationDataLocation + "MV1994.bam" + + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + + " -L unmapped" + + " -U", + 0, // two output files + Collections.emptyList()); + + // our base file + File baseOutputFile = createTempFile("testUnmappedReadInclusion",".bam"); + spec.setOutputFileLocation(baseOutputFile); + spec.addAuxFile("95e98192e5b90cf80eaa87a4ace263da",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("fadcdf88597b9609c5f2a17f4c6eb455", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); + + executeTest("testUnmappedReadInclusion",spec); + } + + @Test + public void testMixedMappedAndUnmapped() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker" + + " 
-I " + validationDataLocation + "MV1994.bam" + + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + + " -L Escherichia_coli_K12:4630000-4639675" + + " -L unmapped" + + " -U", + 0, // two output files + Collections.emptyList()); + + // our base file + File baseOutputFile = createTempFile("testUnmappedReadInclusion",".bam"); + spec.setOutputFileLocation(baseOutputFile); + spec.addAuxFile("3944b5a6bfc06277ed3afb928a20d588",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("fa90ff91ac0cc689c71a3460a3530b8b", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); + + executeTest("testUnmappedReadInclusion",spec); + } + + + @Test(enabled = false) + public void testUnmappedReadExclusion() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker" + + " -I " + validationDataLocation + "MV1994.bam" + + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + + " -XL unmapped" + + " -U", + 0, // two output files + Collections.emptyList()); + + // our base file + File baseOutputFile = createTempFile("testUnmappedReadExclusion",".bam"); + spec.setOutputFileLocation(baseOutputFile); + spec.addAuxFile("80887ba488e53dabd9596ff93070ae75",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("b341d808ecc33217f37c0c0cde2a3e2f", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); + + executeTest("testUnmappedReadExclusion",spec); + } + + @Test(enabled = true) + public void testIntervalParsingFromFile() { + String md5 = "48a24b70a0b376535542b996af517398"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf", + 1, // 
just one output file + Arrays.asList(md5)); + executeTest("testIntervalParsingFromFile", spec); + } + + @Test(enabled = true) + public void testIntervalMergingFromFiles() { + String md5 = "9ae0ea9e3c9c6e1b9b6252c8395efdc1"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -L " + validationDataLocation + "intervalTest.2.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntervalMergingFromFiles", spec); + } + + @Test(enabled = true) + public void testIntervalExclusionsFromFiles() { + String md5 = "26ab0db90d72e28ad0ba1e22ee510510"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -XL " + validationDataLocation + "intervalTest.2.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntervalExclusionsFromFiles", spec); + } + + @Test(enabled = true) + public void testMixedIntervalMerging() { + String md5 = "7c5aba41f53293b712fd86d08ed5b36e"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -L chr1:1677524-1677528", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testMixedIntervalMerging", spec); + } + + @Test(enabled = true) + public void testBed() { + String md5 = "cf4278314ef8e4b996e1b798d8eb92cf"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + 
hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.bed", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testBed", spec); + } + + @Test(enabled = true) + public void testComplexVCF() { + String md5 = "166d77ac1b46a1ec38aa35ab7e628ab5"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.3.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testComplexVCF", spec); + } + + @Test(enabled = true) + public void testComplexVCFWithPadding() { + String md5 = "649ee93d50739c656e94ec88a32c7ffe"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " --interval_padding 2" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.3.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testComplexVCFWithPadding", spec); + } + + @Test(enabled = true) + public void testMergingWithComplexVCF() { + String md5 = "6d7fce9fee471194aa8b5b6e47267f03"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -XL " + validationDataLocation + "intervalTest.3.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testMergingWithComplexVCF", spec); + } + + @Test(enabled = true) + public void testEmptyVCF() { + String md5 = "897316929176464ebc9ad085f31e7284"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" 
+ + " -L " + validationDataLocation + "intervalTest.empty.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testEmptyVCFWarning", spec); + } + + @Test(enabled = true) + public void testIncludeExcludeIsTheSame() { + String md5 = "897316929176464ebc9ad085f31e7284"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -XL " + validationDataLocation + "intervalTest.1.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIncludeExcludeIsTheSame", spec); + } + + @Test(enabled = true) + public void testSymbolicAlleles() { + String md5 = "52745056d2fd5904857bbd4984c08098"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker" + + " -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam" + + " -R " + b36KGReference + + " -o %s" + + " -L " + privateTestDir + "symbolic_alleles_1.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testSymbolicAlleles", spec); + } + + @Test + public void testIntersectionOfLexicographicallySortedIntervals() { + final String md5 = "18be9375e5a753f766616a51eb6131f0"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + " -T TestCountLociWalker" + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -R " + b37KGReference + + " -L " + privateTestDir + "lexicographicallySortedIntervals.bed" + + " -L 4" + + " -isr INTERSECTION" + + " -o %s", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntersectionOfLexicographicallySortedIntervals", spec); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/InvalidArgumentIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/InvalidArgumentIntegrationTest.java 
new file mode 100644 index 000000000..ca9682747 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/InvalidArgumentIntegrationTest.java @@ -0,0 +1,55 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.arguments; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.Test; + +public class InvalidArgumentIntegrationTest extends WalkerTest { + @Test + public void testUnknownReadFilter() { + executeTest("UnknownReadFilter", + new WalkerTest.WalkerTestSpec( + " -T TestPrintReadsWalker" + + " -R " + exampleFASTA + + " -I " + publicTestDir + "exampleBAM.bam" + + " -o %s" + + " -rf TestUnknownReadFilter", + 1, UserException.MalformedReadFilterException.class)); + } + + @Test + public void testMalformedWalkerArgs() { + executeTest("MalformedWalkerArgs", + new WalkerTest.WalkerTestSpec( + " -T UnknownWalkerName" + + " -R " + exampleFASTA + + " -I " + publicTestDir + "exampleBAM.bam" + + " -o %s", + 1, UserException.MalformedWalkerArgumentsException.class)); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/LoggingIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/LoggingIntegrationTest.java new file mode 100644 index 000000000..73c177688 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/arguments/LoggingIntegrationTest.java @@ -0,0 +1,117 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.arguments; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; + +import org.apache.commons.lang.StringUtils; +import org.apache.log4j.Level; + +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.MD5DB; +import org.broadinstitute.gatk.utils.MD5Mismatch; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.utils.runtime.*; + +public class LoggingIntegrationTest { + private final MD5DB md5db = new MD5DB(); + + private class LoggingTestProvider extends BaseTest.TestDataProvider { + + private final String baseCmdLine; + + private final Level logLevel; + private final String logFileStr; + public final File argumentOutputFile; + public final File pipedOutputFile; + + private LoggingTestProvider(final Level logLevel, final boolean explicitLogfile) throws IOException { + super(LoggingTestProvider.class); + + // TODO: a better command line that exercises log levels besides INFO + this.baseCmdLine = String.format("java -cp %s %s -T TestPrintVariantsWalker -R %s -V %s -L 1:1000000-2000000 --no_cmdline_in_header", + StringUtils.join(RuntimeUtils.getAbsoluteClassPaths(), File.pathSeparatorChar), + CommandLineGATK.class.getCanonicalName(), BaseTest.b37KGReference, BaseTest.b37_NA12878_OMNI); + + this.logLevel = logLevel; + this.logFileStr = 
explicitLogfile ? " -log " + BaseTest.createTempFile(logLevel.toString(), "log") : ""; + this.argumentOutputFile = BaseTest.createTempFile(logLevel.toString(), "vcf"); + this.pipedOutputFile = BaseTest.createTempFile(logLevel.toString(), "vcf"); + } + + public final String getCmdLine(boolean redirectStdout) { + String command = String.format("%s -l %s %s", baseCmdLine, logLevel, logFileStr); + return redirectStdout ? command : command + " -o " + argumentOutputFile; + } + + public String toString() { + return String.format("LoggingTestProvider logLevel=%s", logLevel); + } + } + + @DataProvider(name = "LoggingTest") + public Object[][] makeLoggingTestProvider() throws IOException { + for (Boolean explicitLogFile : Arrays.asList(true, false)) { + // TODO: enable other logging levels when tests for those exist + new LoggingTestProvider(Level.DEBUG, explicitLogFile); + } + + return LoggingTestProvider.getTests(LoggingTestProvider.class); + } + + /** + * test that using an output argument produces the same output as stdout + */ + @Test(dataProvider = "LoggingTest") + public void testStdoutEquivalence(final LoggingTestProvider cfg) throws IOException { + + ProcessController pc = ProcessController.getThreadLocal(); + + // output argument + + ProcessSettings ps = new ProcessSettings(cfg.getCmdLine(false).split("\\s+")); + pc.execAndCheck(ps); + String output_argument_md5 = md5db.calculateFileMD5(cfg.argumentOutputFile); + + // pipe to stdout + + ps = new ProcessSettings(cfg.getCmdLine(true).split("\\s+")); + ps.setStdoutSettings(new OutputStreamSettings(cfg.pipedOutputFile)); + pc.execAndCheck(ps); + + MD5DB.MD5Match result = md5db.testFileMD5("LoggingIntegrationTest", "LoggingIntegrationTest", cfg.pipedOutputFile, output_argument_md5, false); + if(result.failed) { + final MD5Mismatch failure = new MD5Mismatch(result.actualMD5, result.expectedMD5, result.diffEngineOutput); + Assert.fail(failure.toString()); + } + } +} \ No newline at end of file diff --git 
a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/CryptUtilsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/CryptUtilsUnitTest.java new file mode 100644 index 000000000..beac3ace8 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/CryptUtilsUnitTest.java @@ -0,0 +1,200 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.crypt; + +import org.broadinstitute.gatk.engine.crypt.CryptUtils; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.SkipException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.security.Key; +import java.security.KeyPair; +import java.security.PrivateKey; +import java.security.PublicKey; +import java.util.Arrays; + +public class CryptUtilsUnitTest extends BaseTest { + + @Test + public void testGenerateValidKeyPairWithDefaultSettings() { + KeyPair keyPair = CryptUtils.generateKeyPair(); + Assert.assertTrue(CryptUtils.keysDecryptEachOther(keyPair.getPrivate(), keyPair.getPublic())); + } + + @DataProvider( name = "InvalidKeyPairSettings" ) + public Object[][] invalidKeyPairSettingsDataProvider() { + return new Object[][] { + { -1, CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM, CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM}, + { CryptUtils.DEFAULT_KEY_LENGTH, "Made-up algorithm", CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM}, + { CryptUtils.DEFAULT_KEY_LENGTH, CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM, "Made-up algorithm"} + }; + } + + @Test( dataProvider = "InvalidKeyPairSettings", expectedExceptions = ReviewedGATKException.class ) + public void testGenerateKeyPairWithInvalidSettings( int keyLength, String encryptionAlgorithm, String randomNumberGenerationAlgorithm ) { + KeyPair keyPair = CryptUtils.generateKeyPair(keyLength, encryptionAlgorithm, randomNumberGenerationAlgorithm); + } + + @Test + public void testGATKMasterKeyPairMutualDecryption() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read 
the GATK private key", + "testGATKMasterKeyPairMutualDecryption")); + } + + Assert.assertTrue(CryptUtils.keysDecryptEachOther(CryptUtils.loadGATKMasterPrivateKey(), CryptUtils.loadGATKMasterPublicKey())); + } + + @Test + public void testGATKMasterPrivateKeyWithDistributedPublicKeyMutualDecryption() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + "testGATKMasterPrivateKeyWithDistributedPublicKeyMutualDecryption")); + } + + Assert.assertTrue(CryptUtils.keysDecryptEachOther(CryptUtils.loadGATKMasterPrivateKey(), CryptUtils.loadGATKDistributedPublicKey())); + } + + @Test + public void testKeyPairWriteThenRead() { + KeyPair keyPair = CryptUtils.generateKeyPair(); + File privateKeyFile = createTempFile("testKeyPairWriteThenRead_private", "key"); + File publicKeyFile = createTempFile("testKeyPairWriteThenRead_public", "key"); + + CryptUtils.writeKeyPair(keyPair, privateKeyFile, publicKeyFile); + + assertKeysAreEqual(keyPair.getPrivate(), CryptUtils.readPrivateKey(privateKeyFile)); + assertKeysAreEqual(keyPair.getPublic(), CryptUtils.readPublicKey(publicKeyFile)); + } + + @Test + public void testPublicKeyWriteThenReadFromFile() { + File keyFile = createTempFile("testPublicKeyWriteThenReadFromFile", "key"); + PublicKey publicKey = CryptUtils.generateKeyPair().getPublic(); + + CryptUtils.writeKey(publicKey, keyFile); + + assertKeysAreEqual(publicKey, CryptUtils.readPublicKey(keyFile)); + } + + @Test + public void testPublicKeyWriteThenReadFromStream() throws IOException { + File keyFile = createTempFile("testPublicKeyWriteThenReadFromStream", "key"); + PublicKey publicKey = CryptUtils.generateKeyPair().getPublic(); + + CryptUtils.writeKey(publicKey, keyFile); + + assertKeysAreEqual(publicKey, CryptUtils.readPublicKey(new FileInputStream(keyFile))); + } + + @Test + public void testPrivateKeyWriteThenReadFromFile() { + File keyFile = 
createTempFile("testPrivateKeyWriteThenReadFromFile", "key"); + PrivateKey privateKey = CryptUtils.generateKeyPair().getPrivate(); + + CryptUtils.writeKey(privateKey, keyFile); + + assertKeysAreEqual(privateKey, CryptUtils.readPrivateKey(keyFile)); + } + + @Test + public void testPrivateKeyWriteThenReadFromStream() throws IOException { + File keyFile = createTempFile("testPrivateKeyWriteThenReadFromStream", "key"); + PrivateKey privateKey = CryptUtils.generateKeyPair().getPrivate(); + + CryptUtils.writeKey(privateKey, keyFile); + + assertKeysAreEqual(privateKey, CryptUtils.readPrivateKey(new FileInputStream(keyFile))); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentPublicKey() { + File nonExistentFile = new File("jdshgkdfhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + CryptUtils.readPublicKey(nonExistentFile); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentPrivateKey() { + File nonExistentFile = new File("jdshgkdfhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + CryptUtils.readPrivateKey(nonExistentFile); + } + + @Test + public void testDecodePublicKey() { + PublicKey originalKey = CryptUtils.generateKeyPair().getPublic(); + PublicKey decodedKey = CryptUtils.decodePublicKey(originalKey.getEncoded(), CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM); + assertKeysAreEqual(originalKey, decodedKey); + } + + @Test + public void testDecodePrivateKey() { + PrivateKey originalKey = CryptUtils.generateKeyPair().getPrivate(); + PrivateKey decodedKey = CryptUtils.decodePrivateKey(originalKey.getEncoded(), CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM); + assertKeysAreEqual(originalKey, decodedKey); + } + + @Test + public void testLoadGATKMasterPrivateKey() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", 
+ "testLoadGATKMasterPrivateKey")); + } + + PrivateKey gatkMasterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + } + + @Test + public void testLoadGATKMasterPublicKey() { + PublicKey gatkMasterPublicKey = CryptUtils.loadGATKMasterPublicKey(); + } + + @Test + public void testLoadGATKDistributedPublicKey() { + PublicKey gatkDistributedPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + } + + private void assertKeysAreEqual( Key originalKey, Key keyFromDisk ) { + Assert.assertTrue(Arrays.equals(originalKey.getEncoded(), keyFromDisk.getEncoded())); + Assert.assertEquals(originalKey.getAlgorithm(), keyFromDisk.getAlgorithm()); + Assert.assertEquals(originalKey.getFormat(), keyFromDisk.getFormat()); + } + + private boolean gatkPrivateKeyExistsButReadPermissionDenied() { + File gatkPrivateKey = new File(CryptUtils.GATK_MASTER_PRIVATE_KEY_FILE); + return gatkPrivateKey.exists() && ! gatkPrivateKey.canRead(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java new file mode 100644 index 000000000..350ba7b75 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyIntegrationTest.java @@ -0,0 +1,157 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.crypt; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class GATKKeyIntegrationTest extends WalkerTest { + + public static final String BASE_COMMAND = String.format("-T TestPrintReadsWalker -R %s -I %s -o %%s", + publicTestDir + "exampleFASTA.fasta", + publicTestDir + "exampleBAM.bam"); + public static final String MD5_UPON_SUCCESSFUL_RUN = "e7b4a5b62f9d4badef1cd07040011b2b"; + + + private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) { + String command = BASE_COMMAND + String.format(" %s %s", etArg, keyArg); + + WalkerTestSpec spec = expectedException != null ? 
+ new WalkerTestSpec(command, 1, expectedException) : + new WalkerTestSpec(command, 1, Arrays.asList(md5)); + + spec.disableImplicitArgs(); // Turn off automatic inclusion of -et/-K args by WalkerTest + executeTest(testName, spec); + } + + @Test + public void testValidKeyNoET() { + runGATKKeyTest("testValidKeyNoET", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testValidKeyETStdout() { + runGATKKeyTest("testValidKeyETStdout", + "-et " + GATKRunReport.PhoneHomeOption.STDOUT, + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testValidKeyETStandard() { + runGATKKeyTest("testValidKeyETStandard", + "", + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testNoKeyNoET() { + runGATKKeyTest("testNoKeyNoET", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "", + UserException.class, + null); + } + + @Test + public void testNoKeyETStdout() { + runGATKKeyTest("testNoKeyETStdout", + "-et " + GATKRunReport.PhoneHomeOption.STDOUT, + "", + UserException.class, + null); + } + + @Test + public void testNoKeyETStandard() { + runGATKKeyTest("testNoKeyETStandard", + "", + "", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testRevokedKey() { + runGATKKeyTest("testRevokedKey", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + "revoked.key", + UserException.KeySignatureVerificationException.class, + null); + } + + @DataProvider(name = "CorruptKeyTestData") + public Object[][] corruptKeyDataProvider() { + return new Object[][] { + { "corrupt_empty.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_file.key", UserException.UnreadableKeyException.class }, + { "corrupt_random_contents.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_deletion.key", 
UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_insertion.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_change.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_deletion.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_insertion.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_change.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_isize_field.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_crc.key", UserException.UnreadableKeyException.class }, + { "corrupt_no_email_address.key", UserException.UnreadableKeyException.class }, + { "corrupt_no_sectional_delimiter.key", UserException.UnreadableKeyException.class }, + { "corrupt_no_signature.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_signature.key", UserException.KeySignatureVerificationException.class }, + { "corrupt_non_gzipped_valid_key.key", UserException.UnreadableKeyException.class } + }; + } + + @Test(dataProvider = "CorruptKeyTestData") + public void testCorruptKey ( String corruptKeyName, Class expectedException ) { + runGATKKeyTest(String.format("testCorruptKey (%s)", corruptKeyName), + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + corruptKeyName, + expectedException, + null); + } + + @Test + public void testCorruptButNonRequiredKey() { + runGATKKeyTest("testCorruptButNonRequiredKey", + "", + "-K " + keysDataLocation + "corrupt_random_contents.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyUnitTest.java new file mode 100644 index 000000000..89ef0b26e --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/crypt/GATKKeyUnitTest.java @@ -0,0 +1,129 @@ +/* +* Copyright (c) 2012 
The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.crypt; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.SkipException; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.security.KeyPair; +import java.security.PrivateKey; +import java.security.PublicKey; + +public class GATKKeyUnitTest extends BaseTest { + + @Test + public void testCreateGATKKeyUsingMasterKeyPair() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + "testCreateGATKKeyUsingMasterKeyPair")); + } + + PrivateKey masterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + PublicKey masterPublicKey = CryptUtils.loadGATKMasterPublicKey(); + + // We should be able to create a valid GATKKey using our master key pair: + GATKKey key = new GATKKey(masterPrivateKey, masterPublicKey, "foo@bar.com"); + Assert.assertTrue(key.isValid()); + } + + @Test + public void testCreateGATKKeyUsingMasterPrivateKeyAndDistributedPublicKey() { + if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { + throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", + "testCreateGATKKeyUsingMasterPrivateKeyAndDistributedPublicKey")); + } + + PrivateKey masterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + PublicKey distributedPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + + // We should also be able to create a valid GATKKey using our master private + // key and the public key we distribute with the GATK: + GATKKey key = new GATKKey(masterPrivateKey, distributedPublicKey, "foo@bar.com"); + Assert.assertTrue(key.isValid()); + } + + @Test( expectedExceptions = ReviewedGATKException.class ) + public void testKeyPairMismatch() { + 
KeyPair firstKeyPair = CryptUtils.generateKeyPair(); + KeyPair secondKeyPair = CryptUtils.generateKeyPair(); + + // Attempting to create a GATK Key with private and public keys that aren't part of the + // same key pair should immediately trigger a validation failure: + GATKKey key = new GATKKey(firstKeyPair.getPrivate(), secondKeyPair.getPublic(), "foo@bar.com"); + } + + @Test( expectedExceptions = ReviewedGATKException.class ) + public void testEncryptionAlgorithmMismatch() { + KeyPair keyPair = CryptUtils.generateKeyPair(CryptUtils.DEFAULT_KEY_LENGTH, "DSA", CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + + // Attempting to use a DSA private key to create an RSA signature should throw an error: + GATKKey key = new GATKKey(keyPair.getPrivate(), keyPair.getPublic(), "foo@bar.com", "SHA1withRSA"); + } + + @Test( expectedExceptions = UserException.class ) + public void testInvalidEmailAddress() { + String emailAddressWithNulByte = new String(new byte[] { 0 }); + KeyPair keyPair = CryptUtils.generateKeyPair(); + + // Email addresses cannot contain the NUL byte, since it's used as a sectional delimiter in the key file: + GATKKey key = new GATKKey(keyPair.getPrivate(), keyPair.getPublic(), emailAddressWithNulByte); + } + + @Test + public void testCreateGATKKeyFromValidKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "valid.key")); + Assert.assertTrue(key.isValid()); + } + + @Test( expectedExceptions = UserException.UnreadableKeyException.class ) + public void testCreateGATKKeyFromCorruptKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "corrupt_random_contents.key")); + } + + @Test + public void testCreateGATKKeyFromRevokedKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "revoked.key")); + Assert.assertFalse(key.isValid()); + } + + @Test( expectedExceptions = 
UserException.CouldNotReadInputFile.class ) + public void testCreateGATKKeyFromNonExistentFile() { + File nonExistentFile = new File("ghfdkgsdhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), nonExistentFile); + } + + private boolean gatkPrivateKeyExistsButReadPermissionDenied() { + File gatkPrivateKey = new File(CryptUtils.GATK_MASTER_PRIVATE_KEY_FILE); + return gatkPrivateKey.exists() && ! gatkPrivateKey.canRead(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusViewUnitTest.java new file mode 100644 index 000000000..99d7559c4 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusViewUnitTest.java @@ -0,0 +1,90 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + + +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; + +import java.util.List; +/** + * User: hanna + * Date: May 12, 2009 + * Time: 2:34:46 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Test the view of all loci. + */ +public class AllLocusViewUnitTest extends LocusViewTemplate { + + @Override + protected LocusView createView(LocusShardDataProvider provider) { + return new AllLocusView(provider); + } + + /** + * Test the reads according to an independently derived context. + * @param view + * @param range + * @param reads + */ + @Override + protected void testReadsInContext( LocusView view, List range, List reads ) { + AllLocusView allLocusView = (AllLocusView)view; + + // TODO: Should skip over loci not in the given range. 
+ GenomeLoc firstLoc = range.get(0); + GenomeLoc lastLoc = range.get(range.size()-1); + GenomeLoc bounds = genomeLocParser.createGenomeLoc(firstLoc.getContig(),firstLoc.getStart(),lastLoc.getStop()); + + for( int i = bounds.getStart(); i <= bounds.getStop(); i++ ) { + GenomeLoc site = genomeLocParser.createGenomeLoc("chr1",i); + AlignmentContext locusContext = allLocusView.next(); + Assert.assertEquals(locusContext.getLocation(), site, "Locus context location is incorrect"); + int expectedReadsAtSite = 0; + + for( GATKSAMRecord read: reads ) { + if(genomeLocParser.createGenomeLoc(read).containsP(locusContext.getLocation())) { + Assert.assertTrue(locusContext.getReads().contains(read),"Target locus context does not contain reads"); + expectedReadsAtSite++; + } + } + + Assert.assertEquals(locusContext.getReads().size(), expectedReadsAtSite, "Found wrong number of reads at site"); + } + + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusViewUnitTest.java new file mode 100644 index 000000000..6665b7481 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusViewUnitTest.java @@ -0,0 +1,102 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + + +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; + +import java.util.List; +/** + * User: hanna + * Date: May 12, 2009 + * Time: 2:34:46 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Test the CoveredLocusView. + */ +public class CoveredLocusViewUnitTest extends LocusViewTemplate { + + /** + * Retrieve a covered locus view. + */ + @Override + protected LocusView createView(LocusShardDataProvider provider) { + return new CoveredLocusView(provider); + } + + /** + * Test the reads according to an independently derived context. + * @param view + * @param range + * @param reads + */ + @Override + protected void testReadsInContext( LocusView view, List range, List reads ) { + CoveredLocusView coveredLocusView = (CoveredLocusView)view; + + // TODO: Should skip over loci not in the given range. 
+ GenomeLoc firstLoc = range.get(0); + GenomeLoc lastLoc = range.get(range.size()-1); + GenomeLoc bounds = genomeLocParser.createGenomeLoc(firstLoc.getContig(),firstLoc.getStart(),lastLoc.getStop()); + + for( int i = bounds.getStart(); i <= bounds.getStop(); i++ ) { + GenomeLoc site = genomeLocParser.createGenomeLoc("chr1",i); + + int expectedReadsAtSite = 0; + for( GATKSAMRecord read: reads ) { + if( genomeLocParser.createGenomeLoc(read).containsP(site) ) + expectedReadsAtSite++; + } + + if( expectedReadsAtSite < 1 ) + continue; + + Assert.assertTrue(coveredLocusView.hasNext(),"Incorrect number of loci in view"); + + AlignmentContext locusContext = coveredLocusView.next(); + Assert.assertEquals(locusContext.getLocation(), site, "Target locus context location is incorrect"); + Assert.assertEquals(locusContext.getReads().size(), expectedReadsAtSite, "Found wrong number of reads at site"); + + for( GATKSAMRecord read: reads ) { + if(genomeLocParser.createGenomeLoc(read).containsP(locusContext.getLocation())) + Assert.assertTrue(locusContext.getReads().contains(read),"Target locus context does not contain reads"); + } + } + + Assert.assertFalse(coveredLocusView.hasNext(),"Iterator is not bounded at boundaries of shard"); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedViewUnitTest.java new file mode 100644 index 000000000..f77b6613a --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedViewUnitTest.java @@ -0,0 +1,366 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including 
without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.util.PeekableIterator; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.tribble.BasicFeature; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.RodBinding; +import org.broadinstitute.gatk.utils.refdata.RODRecordListImpl; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * @author depristo + */ +public class IntervalReferenceOrderedViewUnitTest extends BaseTest { + private static int startingChr = 1; + 
private static int endingChr = 2; + private static int readCount = 100; + private static int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; + private static String contig; + private static SAMFileHeader header; + + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + contig = header.getSequence(0).getSequenceName(); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + + initializeTests(); + } + + private class CompareFeatures implements Comparator { + @Override + public int compare(Feature o1, Feature o2) { + return genomeLocParser.createGenomeLoc(o1).compareTo(genomeLocParser.createGenomeLoc(o2)); + } + } + + private class ReadMetaDataTrackerRODStreamTest extends TestDataProvider { + final List allFeatures; + final List intervals; + + public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final GenomeLoc interval) { + this(allFeatures, Collections.singletonList(interval)); + } + + public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final List intervals) { + super(ReadMetaDataTrackerRODStreamTest.class); + this.allFeatures = new ArrayList(allFeatures); + Collections.sort(this.allFeatures, new CompareFeatures()); + this.intervals = new ArrayList(intervals); + Collections.sort(this.intervals); + setName(String.format("%s nFeatures %d intervals %s", getClass().getSimpleName(), allFeatures.size(), + intervals.size() == 1 ? 
intervals.get(0) : "size " + intervals.size())); + } + + public PeekableIterator getIterator(final String name) { + return new PeekableIterator(new TribbleIteratorFromCollection(name, genomeLocParser, allFeatures)); + } + + public Set getExpectedOverlaps(final GenomeLoc interval) { + final Set overlapping = new HashSet(); + for ( final Feature f : allFeatures ) + if ( genomeLocParser.createGenomeLoc(f).overlapsP(interval) ) + overlapping.add(f); + return overlapping; + } + } + + public void initializeTests() { + final List handPickedFeatures = new ArrayList(); + + handPickedFeatures.add(new BasicFeature(contig, 1, 1)); + handPickedFeatures.add(new BasicFeature(contig, 2, 5)); + handPickedFeatures.add(new BasicFeature(contig, 4, 4)); + handPickedFeatures.add(new BasicFeature(contig, 6, 6)); + handPickedFeatures.add(new BasicFeature(contig, 9, 10)); + handPickedFeatures.add(new BasicFeature(contig, 10, 10)); + handPickedFeatures.add(new BasicFeature(contig, 10, 11)); + handPickedFeatures.add(new BasicFeature(contig, 13, 20)); + + createTestsForFeatures(handPickedFeatures); + + // test in the present of a large spanning element + { + List oneLargeSpan = new ArrayList(handPickedFeatures); + oneLargeSpan.add(new BasicFeature(contig, 1, 30)); + createTestsForFeatures(oneLargeSpan); + } + + // test in the presence of a partially spanning element + { + List partialSpanStart = new ArrayList(handPickedFeatures); + partialSpanStart.add(new BasicFeature(contig, 1, 6)); + createTestsForFeatures(partialSpanStart); + } + + // test in the presence of a partially spanning element at the end + { + List partialSpanEnd = new ArrayList(handPickedFeatures); + partialSpanEnd.add(new BasicFeature(contig, 10, 30)); + createTestsForFeatures(partialSpanEnd); + } + + // no data at all + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, 5, 5); + new ReadMetaDataTrackerRODStreamTest(Collections.emptyList(), loc); + } + + // 
-------------------------------------------------------------------------------- + // + // tests for the lower level IntervalOverlappingRODsFromStream + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ReadMetaDataTrackerRODStreamTest") + public Object[][] createReadMetaDataTrackerRODStreamTest() { + return ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); + } + + private GenomeLoc span(final List features) { + int featuresStart = 1; for ( final GenomeLoc f : features ) featuresStart = Math.min(featuresStart, f.getStart()); + int featuresStop = 1; for ( final GenomeLoc f : features ) featuresStop = Math.max(featuresStop, f.getStop()); + return genomeLocParser.createGenomeLoc(contig, featuresStart, featuresStop); + } + + private void createTestsForFeatures(final List features) { + int featuresStart = 1; for ( final Feature f : features ) featuresStart = Math.min(featuresStart, f.getStart()); + int featuresStop = 1; for ( final Feature f : features ) featuresStop = Math.max(featuresStop, f.getEnd()); + + for ( final int size : Arrays.asList(1, 5, 10, 100) ) { + final List allIntervals = new ArrayList(); + // regularly spaced + for ( int start = featuresStart; start < featuresStop; start++) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, start, start + size - 1); + allIntervals.add(loc); + new ReadMetaDataTrackerRODStreamTest(features, loc); + } + + // starting and stopping at every feature + for ( final Feature f : features ) { + // just at the feature + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart(), f.getEnd())); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // up to end + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd())); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // missing by 1 + 
allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() + 1, f.getEnd() + 1)); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // just spanning + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd() + 1)); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + } + + new ReadMetaDataTrackerRODStreamTest(features, allIntervals); + } + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest") + public void runReadMetaDataTrackerRODStreamTest_singleQuery(final ReadMetaDataTrackerRODStreamTest data) { + if ( data.intervals.size() == 1 ) { + final String name = "testName"; + final PeekableIterator iterator = data.getIterator(name); + final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); + testRODStream(data, stream, Collections.singletonList(data.intervals.get(0))); + } + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_singleQuery") + public void runReadMetaDataTrackerRODStreamTest_multipleQueries(final ReadMetaDataTrackerRODStreamTest data) { + if ( data.intervals.size() > 1 ) { + final String name = "testName"; + final PeekableIterator iterator = data.getIterator(name); + final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); + testRODStream(data, stream, data.intervals); + } + } + + private void testRODStream(final ReadMetaDataTrackerRODStreamTest test, final IntervalOverlappingRODsFromStream stream, final List intervals) { + for ( final GenomeLoc interval : intervals ) { + final RODRecordList query = stream.getOverlapping(interval); + final HashSet queryFeatures = new HashSet(); + for ( final GATKFeature f : query ) queryFeatures.add((Feature)f.getUnderlyingObject()); + final Set overlaps = test.getExpectedOverlaps(interval); + + 
Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } + } + + // -------------------------------------------------------------------------------- + // + // tests for the higher level tracker itself + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ReadMetaDataTrackerTests") + public Object[][] createTrackerTests() { + List tests = new ArrayList(); + + final Object[][] singleTests = ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); + final List multiSiteTests = new ArrayList(); + for ( final Object[] singleTest : singleTests ) { + if ( ((ReadMetaDataTrackerRODStreamTest)singleTest[0]).intervals.size() > 1 ) + multiSiteTests.add((ReadMetaDataTrackerRODStreamTest)singleTest[0]); + } + + for ( final boolean testStateless : Arrays.asList(true, false) ) { + // all pairwise tests + for ( List singleTest : Utils.makePermutations(multiSiteTests, 2, false)) { + tests.add(new Object[]{singleTest, testStateless}); + } + + // all 3 way pairwise tests + //for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { + // tests.add(new Object[]{singleTest, testStateless}); + //} + } + + logger.warn("Creating " + tests.size() + " tests for ReadMetaDataTrackerTests"); + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerTests", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_multipleQueries") + public void runReadMetaDataTrackerTest(final List RODs, final boolean testStateless) { + final List 
names = new ArrayList(); + final List> iterators = new ArrayList>(); + final List intervals = new ArrayList(); + final List> rodBindings = new ArrayList>(); + + for ( int i = 0; i < RODs.size(); i++ ) { + final RodBinding rodBinding = new RodBinding(Feature.class, "name"+i); + rodBindings.add(rodBinding); + final String name = rodBinding.getName(); + names.add(name); + iterators.add(RODs.get(i).getIterator(name)); + intervals.addAll(RODs.get(i).intervals); + } + + Collections.sort(intervals); + final GenomeLoc span = span(intervals); + final IntervalReferenceOrderedView view = new IntervalReferenceOrderedView(genomeLocParser, span, names, iterators); + + if ( testStateless ) { + // test each tracker is well formed, as each is created + for ( final GenomeLoc interval : intervals ) { + final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + testMetaDataTrackerBindings(tracker, interval, RODs, rodBindings); + } + } else { + // tests all trackers are correct after reading them into an array + // this checks that the trackers are be safely stored away and analyzed later (critical for nano-scheduling) + final List trackers = new ArrayList(); + for ( final GenomeLoc interval : intervals ) { + final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + trackers.add(tracker); + } + + for ( int i = 0; i < trackers.size(); i++) { + testMetaDataTrackerBindings(trackers.get(i), intervals.get(i), RODs, rodBindings); + } + } + } + + private void testMetaDataTrackerBindings(final RefMetaDataTracker tracker, + final GenomeLoc interval, + final List RODs, + final List> rodBindings) { + for ( int i = 0; i < RODs.size(); i++ ) { + final ReadMetaDataTrackerRODStreamTest test = RODs.get(i); + final List queryFeaturesList = tracker.getValues(rodBindings.get(i)); + final Set queryFeatures = new HashSet(queryFeaturesList); + final Set overlaps = test.getExpectedOverlaps(interval); + + Assert.assertEquals(queryFeatures.size(), 
overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } + } + + static class TribbleIteratorFromCollection implements Iterator { + // current location + private final String name; + final Queue gatkFeatures; + + public TribbleIteratorFromCollection(final String name, final GenomeLocParser genomeLocParser, final List features) { + this.name = name; + + this.gatkFeatures = new LinkedList(); + for ( final Feature f : features ) + gatkFeatures.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, f, name)); + } + + @Override + public boolean hasNext() { + return ! gatkFeatures.isEmpty(); + } + + @Override + public RODRecordList next() { + final GATKFeature first = gatkFeatures.poll(); + final Collection myFeatures = new LinkedList(); + myFeatures.add(first); + while ( gatkFeatures.peek() != null && gatkFeatures.peek().getLocation().getStart() == first.getStart() ) + myFeatures.add(gatkFeatures.poll()); + + GenomeLoc loc = first.getLocation(); + for ( final GATKFeature feature : myFeatures ) + loc = loc.merge(feature.getLocation()); + + return new RODRecordListImpl(name, myFeatures, loc); // is this safe? 
+ } + + @Override public void remove() { throw new IllegalStateException("GRRR"); } + } +} + + diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceViewUnitTest.java new file mode 100644 index 000000000..3f620f900 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceViewUnitTest.java @@ -0,0 +1,143 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.testng.Assert; +import org.testng.annotations.Test; + +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.engine.datasources.reads.MockLocusShard; +import org.broadinstitute.gatk.engine.iterators.GenomeLocusIterator; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; + +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.util.StringUtil; + +import java.util.Collections; +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** Tests for viewing the reference from the perspective of a locus. */ + +public class LocusReferenceViewUnitTest extends ReferenceViewTemplate { + +// +// /** Multiple-base pair queries should generate exceptions. 
*/ +// @Test(expectedExceptions=InvalidPositionException.class) +// public void testSingleBPFailure() { +// Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc(0, 1, 50)); +// +// ShardDataProvider dataProvider = new ShardDataProvider(shard, null, sequenceFile, null); +// LocusReferenceView view = new LocusReferenceView(dataProvider); +// +// view.getReferenceContext(shard.getGenomeLoc()).getBase(); +// } + + @Test + public void testOverlappingReferenceBases() { + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), + sequenceFile.getSequence("chrM").length() - 10, + sequenceFile.getSequence("chrM").length()))); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, sequenceFile, null); + LocusReferenceView view = new LocusReferenceView(dataProvider); + + byte[] results = view.getReferenceBases(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), + sequenceFile.getSequence("chrM").length() - 10, + sequenceFile.getSequence("chrM").length() + 9)); + System.out.printf("results are %s%n", new String(results)); + Assert.assertEquals(results.length, 20); + for (int x = 0; x < results.length; x++) { + if (x <= 10) Assert.assertTrue(results[x] != 'X'); + else Assert.assertTrue(results[x] == 'X'); + } + } + + + /** Queries outside the bounds of the shard should result in reference context window trimmed at the shard boundary. 
*/ + @Test + public void testBoundsFailure() { + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), 1, 50))); + + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, sequenceFile, null); + LocusReferenceView view = new LocusReferenceView(dataProvider); + + GenomeLoc locus = genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), 50, 51); + + ReferenceContext rc = view.getReferenceContext(locus); + Assert.assertTrue(rc.getLocus().equals(locus)); + Assert.assertTrue(rc.getWindow().equals(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(),50))); + Assert.assertTrue(rc.getBases().length == 1); + } + + + /** + * Compares the contents of the fasta and view at a specified location. + * + * @param loc + */ + protected void validateLocation( GenomeLoc loc ) { + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(loc)); + GenomeLocusIterator shardIterator = new GenomeLocusIterator(genomeLocParser,loc); + + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, loc, null, sequenceFile, null); + LocusReferenceView view = new LocusReferenceView(dataProvider); + + while (shardIterator.hasNext()) { + GenomeLoc locus = shardIterator.next(); + + ReferenceSequence expectedAsSeq = sequenceFile.getSubsequenceAt(locus.getContig(), locus.getStart(), locus.getStop()); + char expected = Character.toUpperCase(StringUtil.bytesToString(expectedAsSeq.getBases()).charAt(0)); + char actual = view.getReferenceContext(locus).getBaseAsChar(); + + Assert.assertEquals(actual, expected, String.format("Value of base at position %s in shard %s does not match expected", locus.toString(), shard.getGenomeLocs()) + ); + } + } + +} diff --git 
a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusViewTemplate.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusViewTemplate.java new file mode 100644 index 000000000..8b476e6ef --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusViewTemplate.java @@ -0,0 +1,405 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.engine.datasources.reads.MockLocusShard; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.engine.executive.WindowMaker; +import org.broadinstitute.gatk.engine.datasources.reads.LocusShard; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.*; +/** + * User: hanna + * Date: May 13, 2009 + * Time: 4:29:08 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** Base support for testing variants of the LocusView family of classes. 
*/ + +public abstract class LocusViewTemplate extends BaseTest { + protected static ReferenceSequenceFile sequenceSourceFile = null; + protected GenomeLocParser genomeLocParser = null; + + @BeforeClass + public void setupGenomeLoc() throws FileNotFoundException { + sequenceSourceFile = fakeReferenceSequenceFile(); + genomeLocParser = new GenomeLocParser(sequenceSourceFile); + } + + @Test + public void emptyAlignmentContextTest() { + SAMRecordIterator iterator = new SAMRecordIterator(); + + GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); + Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); + + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.emptyList()); + } + + @Test + public void singleReadTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(shardBounds)); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public 
void readCoveringFirstPartTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readCoveringLastPartTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 6, 10); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readCoveringMiddleTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 3, 7); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new 
LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readAndLocusOverlapAtLastBase() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 5, 5))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readOverlappingStartTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 10); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readOverlappingEndTest() { + GATKSAMRecord read = buildSAMRecord("read1","chr1", 6, 15); + SAMRecordIterator iterator = new SAMRecordIterator(read); + + Shard shard = new 
MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); + } + + @Test + public void readsSpanningTest() { + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 6, 10); + SAMRecordIterator iterator = new SAMRecordIterator(read1, read2); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads, read1, read2); + testReadsInContext(view, shard.getGenomeLocs(), expectedReads); + } + + @Test + public void duplicateReadsTest() { + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 1, 5); + GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 6, 10); + GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 6, 10); + SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new 
WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads, read1, read2, read3, read4); + testReadsInContext(view, shard.getGenomeLocs(), expectedReads); + } + + @Test + public void cascadingReadsWithinBoundsTest() { + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 2, 6); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 3, 7); + GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 4, 8); + GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 5, 9); + SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads, read1, read2, read3, read4); + testReadsInContext(view, shard.getGenomeLocs(), expectedReads); + } + + @Test + public void cascadingReadsAtBoundsTest() { + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 2, 6); + GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 3, 7); + GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 4, 8); + GATKSAMRecord read5 = buildSAMRecord("read5","chr1", 5, 9); + GATKSAMRecord read6 = buildSAMRecord("read6","chr1", 6, 10); + 
SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4, read5, read6); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads, read1, read2, read3, read4, read5, read6); + testReadsInContext(view, shard.getGenomeLocs(), expectedReads); + } + + @Test + public void cascadingReadsOverlappingBoundsTest() { + GATKSAMRecord read01 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read02 = buildSAMRecord("read2","chr1", 2, 6); + GATKSAMRecord read03 = buildSAMRecord("read3","chr1", 3, 7); + GATKSAMRecord read04 = buildSAMRecord("read4","chr1", 4, 8); + GATKSAMRecord read05 = buildSAMRecord("read5","chr1", 5, 9); + GATKSAMRecord read06 = buildSAMRecord("read6","chr1", 6, 10); + GATKSAMRecord read07 = buildSAMRecord("read7","chr1", 7, 11); + GATKSAMRecord read08 = buildSAMRecord("read8","chr1", 8, 12); + GATKSAMRecord read09 = buildSAMRecord("read9","chr1", 9, 13); + GATKSAMRecord read10 = buildSAMRecord("read10","chr1", 10, 14); + GATKSAMRecord read11 = buildSAMRecord("read11","chr1", 11, 15); + GATKSAMRecord read12 = buildSAMRecord("read12","chr1", 12, 16); + SAMRecordIterator iterator = new SAMRecordIterator(read01, read02, read03, read04, read05, read06, + read07, read08, read09, read10, read11, read12); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); + 
WindowMaker.WindowMakerIterator window = windowMaker.next(); + LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); + LocusView view = createView(dataProvider); + + List expectedReads = new ArrayList(); + Collections.addAll(expectedReads, read01, read02, read03, read04, read05, read06, + read07, read08, read09, read10, read11, read12); + testReadsInContext(view, shard.getGenomeLocs(), expectedReads); + } + + /** + * Creates a view of the type required for testing. + * + * @return The correct view to test. + */ + protected abstract LocusView createView(LocusShardDataProvider provider); + + /** + * Test the reads according to an independently derived context. + * + * @param view + * @param bounds + * @param reads + */ + protected abstract void testReadsInContext(LocusView view, List bounds, List reads); + + /** + * Fake a reference sequence file. Essentially, seek a header with a bunch of dummy data. 
+ * + * @return A 'fake' reference sequence file + */ + private static ReferenceSequenceFile fakeReferenceSequenceFile() { + return new ReferenceSequenceFile() { + public SAMSequenceDictionary getSequenceDictionary() { + SAMSequenceRecord sequenceRecord = new SAMSequenceRecord("chr1", 1000000); + SAMSequenceDictionary dictionary = new SAMSequenceDictionary(Collections.singletonList(sequenceRecord)); + return dictionary; + } + + public boolean isIndexed() { return false; } + + public ReferenceSequence nextSequence() { + throw new UnsupportedOperationException("Fake implementation doesn't support a getter"); + } + + public ReferenceSequence getSequence( String contig ) { + throw new UnsupportedOperationException("Fake implementation doesn't support a getter"); + } + + public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) { + throw new UnsupportedOperationException("Fake implementation doesn't support a getter"); + } + + public void reset() { + return; + } + + public void close() throws IOException { + } + }; + } + + /** + * Build a SAM record featuring the absolute minimum required dataset. + * + * @param contig Contig to populate. 
+ * @param alignmentStart start of alignment + * @param alignmentEnd end of alignment + * + * @return New SAM Record + */ + protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { + SAMFileHeader header = new SAMFileHeader(); + header.setSequenceDictionary(sequenceSourceFile.getSequenceDictionary()); + + GATKSAMRecord record = new GATKSAMRecord(header); + + record.setReadName(readName); + record.setReferenceIndex(sequenceSourceFile.getSequenceDictionary().getSequenceIndex(contig)); + record.setAlignmentStart(alignmentStart); + Cigar cigar = new Cigar(); + int len = alignmentEnd - alignmentStart + 1; + cigar.add(new CigarElement(len, CigarOperator.M)); + record.setCigar(cigar); + record.setReadBases(new byte[len]); + record.setBaseQualities(new byte[len]); + return record; + } + + /** A simple iterator which iterates over a list of reads. */ + protected class SAMRecordIterator implements GATKSAMIterator { + private Iterator backingIterator = null; + + public SAMRecordIterator(SAMRecord... reads) { + List backingList = new ArrayList(); + backingList.addAll(Arrays.asList(reads)); + backingIterator = backingList.iterator(); + } + + public boolean hasNext() { + return backingIterator.hasNext(); + } + + public SAMRecord next() { + return backingIterator.next(); + } + + public Iterator iterator() { + return this; + } + + public void close() { + // NO-OP. 
+ } + + public void remove() { + throw new UnsupportedOperationException("Can't remove from a read-only iterator"); + } + } + + +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceViewUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceViewUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceViewUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java new file mode 100644 index 000000000..dbc2f5518 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -0,0 +1,157 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.providers; + +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.commandline.RodBinding; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.engine.datasources.reads.MockLocusShard; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.testng.Assert; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.engine.datasources.reads.Shard; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.codecs.table.TableFeature; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet.RMDStorageType; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; + +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Arrays; +import java.util.Collections; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +/** + * User: hanna + * Date: May 27, 2009 + * Time: 3:07:23 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. 
+ */ + +/** + * Test the transparent view into the reference-ordered data. At the moment, just do some basic bindings and make + * sure the data comes through correctly. + */ +public class ReferenceOrderedViewUnitTest extends BaseTest { + /** + * Sequence file. + */ + private static IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + /** + * our track builder + */ + RMDTrackBuilder builder = null; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(seq); + // disable auto-index creation/locking in the RMDTrackBuilder for tests + builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true,null); + } + + /** + * Make sure binding to an empty list produces an empty tracker. + */ + @Test + public void testNoBindings() { + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30))); + LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.emptyList()); + ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); + + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",10)); + Assert.assertEquals(tracker.getValues(Feature.class).size(), 0, "The tracker should not have produced any data"); + } + + /** + * Test a single ROD binding. 
+ */ + @Test + public void testSingleBinding() { + String fileName = privateTestDir + "TabularDataTest.dat"; + RMDTriplet triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags()); + ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(triplet,builder,seq.getSequenceDictionary(),genomeLocParser,false); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30))); + + LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.singletonList(dataSource)); + ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); + + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20)); + TableFeature datum = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest")); + + Assert.assertEquals(datum.get("COL1"),"C","datum parameter for COL1 is incorrect"); + Assert.assertEquals(datum.get("COL2"),"D","datum parameter for COL2 is incorrect"); + Assert.assertEquals(datum.get("COL3"),"E","datum parameter for COL3 is incorrect"); + } + + /** + * Make sure multiple bindings are visible from the view. 
+ */ + @Test + public void testMultipleBinding() { + File file = new File(privateTestDir + "TabularDataTest.dat"); + + RMDTriplet testTriplet1 = new RMDTriplet("tableTest1","Table",file.getAbsolutePath(),RMDStorageType.FILE,new Tags()); + ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(testTriplet1,builder,seq.getSequenceDictionary(),genomeLocParser,false); + + RMDTriplet testTriplet2 = new RMDTriplet("tableTest2","Table",file.getAbsolutePath(),RMDStorageType.FILE,new Tags()); + ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(testTriplet2,builder,seq.getSequenceDictionary(),genomeLocParser,false); + + Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30))); + + LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Arrays.asList(dataSource1,dataSource2)); + ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); + + RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20)); + TableFeature datum1 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest1")); + + Assert.assertEquals(datum1.get("COL1"),"C","datum1 parameter for COL1 is incorrect"); + Assert.assertEquals(datum1.get("COL2"),"D","datum1 parameter for COL2 is incorrect"); + Assert.assertEquals(datum1.get("COL3"),"E","datum1 parameter for COL3 is incorrect"); + + TableFeature datum2 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest2")); + + Assert.assertEquals(datum2.get("COL1"),"C","datum2 parameter for COL1 is incorrect"); + Assert.assertEquals(datum2.get("COL2"),"D","datum2 parameter for COL2 is incorrect"); + Assert.assertEquals(datum2.get("COL3"),"E","datum2 parameter for COL3 is incorrect"); + } +} diff --git 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceViewTemplate.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProviderUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProviderUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProviderUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ShardDataProviderUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancerUnitTest.java new file mode 100644 index 000000000..258e61b49 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancerUnitTest.java @@ -0,0 +1,103 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the 
following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFileSpan; +import htsjdk.samtools.SAMSequenceRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.FileNotFoundException; +import java.util.*; + +public class ActiveRegionShardBalancerUnitTest extends BaseTest { + // example genome loc parser for this test, can be deleted if you don't use the reference + private GenomeLocParser genomeLocParser; + protected SAMDataSource readsDataSource; + + @BeforeClass + public void setup() throws FileNotFoundException { + // sequence + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(10, 0, 10000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + readsDataSource = null; + } + + @Test + public void testMergingManyContigs() { + executeTest(genomeLocParser.getContigs().getSequences()); + } + + @Test + 
public void testMergingAllPointersOnSingleContig() { + executeTest(Arrays.asList(genomeLocParser.getContigs().getSequences().get(1))); + } + + @Test + public void testMergingMultipleDiscontinuousContigs() { + final List all = genomeLocParser.getContigs().getSequences(); + executeTest(Arrays.asList(all.get(1), all.get(3))); + } + + private void executeTest(final Collection records) { + final ActiveRegionShardBalancer balancer = new ActiveRegionShardBalancer(); + + final List> expectedLocs = new LinkedList<>(); + final List pointers = new LinkedList<>(); + + for ( final SAMSequenceRecord record : records ) { + final int size = 10; + int end = 0; + for ( int i = 0; i < record.getSequenceLength(); i += size) { + final int myEnd = i + size - 1; + end = myEnd; + final GenomeLoc loc = genomeLocParser.createGenomeLoc(record.getSequenceName(), i, myEnd); + final Map fileSpans = Collections.emptyMap(); + final FilePointer fp = new FilePointer(fileSpans, IntervalMergingRule.ALL, Collections.singletonList(loc)); + pointers.add(fp); + } + expectedLocs.add(Collections.singleton(genomeLocParser.createGenomeLoc(record.getSequenceName(), 0, end))); + } + + balancer.initialize(readsDataSource, pointers.iterator(), genomeLocParser); + + int i = 0; + int nShardsFound = 0; + for ( final Shard shard : balancer ) { + nShardsFound++; + Assert.assertEquals(new HashSet<>(shard.getGenomeLocs()), expectedLocs.get(i++)); + } + Assert.assertEquals(nShardsFound, records.size(), "Didn't find exactly one shard for each contig in the sequence dictionary"); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java new file mode 100644 index 000000000..7df9bc2cb --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java @@ -0,0 +1,94 @@ +/* +* Copyright 
(c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import com.google.caliper.Param; +import org.broadinstitute.gatk.engine.WalkerManager; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.walkers.LocusWalker; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Apr 22, 2011 + * Time: 4:02:56 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class DownsamplerBenchmark extends ReadProcessingBenchmark { + @Param + private String bamFile; + + @Param + private Integer maxReads; + + @Override + public String getBAMFile() { return bamFile; } + + @Override + public Integer getMaxReads() { return maxReads; } + + @Param + private Downsampling downsampling; + +// public void timeDownsampling(int reps) { +// for(int i = 0; i < reps; i++) { +// SAMFileReader reader = new SAMFileReader(inputFile); +// ReadProperties readProperties = new ReadProperties(Collections.singletonList(new SAMReaderID(inputFile,new Tags())), +// reader.getFileHeader(), +// SAMFileHeader.SortOrder.coordinate, +// false, +// SAMFileReader.ValidationStringency.SILENT, +// downsampling.create(), +// new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), +// Collections.emptyList(), +// Collections.emptyList(), +// false, +// (byte)0, +// false); +// +// GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); +// // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? 
+// Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); +// LegacyLocusIteratorByState locusIteratorByState = new LegacyLocusIteratorByState(readIterator,readProperties,genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); +// while(locusIteratorByState.hasNext()) { +// locusIteratorByState.next().getLocation(); +// } +// reader.close(); +// } +// } + + private enum Downsampling { + NONE { + @Override + DownsamplingMethod create() { return DownsamplingMethod.NONE; } + }, + PER_SAMPLE { + @Override + DownsamplingMethod create() { return WalkerManager.getDownsamplingMethod(LocusWalker.class); } + }; + abstract DownsamplingMethod create(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java new file mode 100644 index 000000000..a54237bfb --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java @@ -0,0 +1,130 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.GATKBAMFileSpan; +import htsjdk.samtools.GATKChunk; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; + +/** + * + */ +public class FilePointerUnitTest extends BaseTest { + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + private SAMReaderID readerID = new SAMReaderID("samFile",new Tags()); + + /** + * This function does the setup of our parser, before each method call. + *

+ * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() throws FileNotFoundException { + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); + } + + @Test + public void testFilePointerCombineDisjoint() { + FilePointer one = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5)); + one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); + FilePointer two = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",6,10)); + two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,2))); + + FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,10)); + result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); + + Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); + Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect"); + + //Now test that adjacent (but disjoint) intervals are properly handled with OVERLAPPING_ONLY + one = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,5)); + one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); + two = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",6,10)); + two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,2))); + + result = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, + genomeLocParser.createGenomeLoc("chr1",1,5), + genomeLocParser.createGenomeLoc("chr1",6,10)); + result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); + + Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); + Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file 
pointers is incorrect"); + } + + @Test + public void testFilePointerCombineJoint() { + FilePointer one = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5)); + one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); + FilePointer two = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",2,6)); + two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,3))); + + FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,6)); + result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,3))); + + Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); + Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect"); + + //Repeat the tests for OVERLAPPING_ONLY + one = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,5)); + one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); + two = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",2,6)); + two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,3))); + + result = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,6)); + result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,3))); + + Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); + Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect"); + } + + @Test + public void testFilePointerCombineOneSided() { + FilePointer filePointer = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5)); + filePointer.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); + FilePointer empty = new FilePointer(IntervalMergingRule.ALL, 
genomeLocParser.createGenomeLoc("chr1",6,10)); + // Do not add file spans to empty result + + FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,10)); + result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); + Assert.assertEquals(filePointer.combine(genomeLocParser,empty),result,"Combination of two file pointers is incorrect"); + Assert.assertEquals(empty.combine(genomeLocParser,filePointer),result,"Combination of two file pointers is incorrect"); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKBAMIndexUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKWalkerBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKWalkerBenchmark.java new file mode 100644 index 000000000..aa66d6636 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKWalkerBenchmark.java @@ -0,0 +1,156 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this 
permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import com.google.caliper.Param; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.filters.UnmappedReadFilter; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.utils.classloader.JVMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; + +import java.io.File; +import java.io.PrintStream; +import java.util.Collections; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Feb 25, 2011 + * Time: 10:16:54 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class GATKWalkerBenchmark extends ReadProcessingBenchmark { + @Param + private String bamFile; + + @Param + private Integer maxReads; + + @Param + private String referenceFile; + + @Param + private WalkerType walkerType; + + @Override + public String getBAMFile() { return bamFile; } + + @Override + public Integer getMaxReads() { return maxReads; } + + @Override + public void setUp() { + super.setUp(); + } + + public void timeWalkerPerformance(final int reps) { + for(int i = 0; i < reps; i++) { + GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + + // Establish the argument collection + GATKArgumentCollection argCollection = new GATKArgumentCollection(); + argCollection.referenceFile = new File(referenceFile); + argCollection.samFiles = Collections.singletonList(inputFile.getAbsolutePath()); + + engine.setArguments(argCollection); + // Bugs in the engine mean that this has to be set twice. + engine.setSAMFileIDs(Collections.singletonList(new SAMReaderID(inputFile,new Tags()))); + engine.setFilters(Collections.singletonList(new UnmappedReadFilter())); + engine.setReferenceMetaDataFiles(Collections.emptyList()); + + // Create the walker + engine.setWalker(walkerType.create()); + + engine.execute(); + } + } + + private enum WalkerType { + COUNT_READS { + @Override + Walker create() { return new CountReadsPerformanceWalker(); } + }, + COUNT_BASES_IN_READ { + @Override + Walker create() { return new CountBasesInReadPerformanceWalker(); } + }, + COUNT_LOCI { + @Override + Walker create() { + CountLociPerformanceWalker walker = new CountLociPerformanceWalker(); + JVMUtils.setFieldValue(JVMUtils.findField(CountLociPerformanceWalker.class,"out"),walker,System.out); + return walker; + } + }; + abstract Walker create(); + } +} + +class CountLociPerformanceWalker extends TestCountLociWalker { + // NOTE: Added this output during porting. Previous version of test was reaching out of engine + // and into production o.b.g.tools.walkers.qc.CountLoci. 
+ @Output + PrintStream out; + + @Override + public void onTraversalDone(Long result) { + out.println(result); + } +} + +class CountReadsPerformanceWalker extends TestCountReadsWalker { +} + +class CountBasesInReadPerformanceWalker extends ReadWalker { + private long As; + private long Cs; + private long Gs; + private long Ts; + + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { + for(byte base: read.getReadBases()) { + switch(base) { + case 'A': As++; break; + case 'C': Cs++; break; + case 'G': Gs++; break; + case 'T': Ts++; break; + } + } + return 1; + } + + public Long reduceInit() { return 0L; } + public Long reduce(Integer value, Long accum) { return value + accum; } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/IntervalOverlapFilteringIteratorUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/MockLocusShard.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/MockLocusShard.java new file mode 100644 index 000000000..896549adf --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/MockLocusShard.java @@ -0,0 +1,49 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation 
the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; + +import java.util.List; +import java.util.Collections; + +/** + * A mock locus shard, usable for infrastructure that requires a shard to behave properly. 
+ * + * @author mhanna + * @version 0.1 + */ +public class MockLocusShard extends LocusShard { + public MockLocusShard(final GenomeLocParser genomeLocParser,final List intervals) { + super( genomeLocParser, + new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser), + intervals, + null); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/PicardBaselineBenchmark.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadProcessingBenchmark.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancerUnitTest.java new file mode 100644 index 000000000..3836409b9 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancerUnitTest.java @@ -0,0 +1,196 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the 
"Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; + +public class ReadShardBalancerUnitTest extends BaseTest { + + /** + * Tests to ensure that 
ReadShardBalancer works as expected and does not place shard boundaries + * at inappropriate places, such as within an alignment start position + */ + private static class ReadShardBalancerTest extends TestDataProvider { + private int numContigs; + private int numStacksPerContig; + private int stackSize; + private int numUnmappedReads; + private DownsamplingMethod downsamplingMethod; + private int expectedReadCount; + + private SAMFileHeader header; + private SAMReaderID testBAM; + + public ReadShardBalancerTest( int numContigs, + int numStacksPerContig, + int stackSize, + int numUnmappedReads, + int downsamplingTargetCoverage ) { + super(ReadShardBalancerTest.class); + + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.stackSize = stackSize; + this.numUnmappedReads = numUnmappedReads; + + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null); + this.expectedReadCount = Math.min(stackSize, downsamplingTargetCoverage) * numStacksPerContig * numContigs + numUnmappedReads; + + setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", + getClass().getSimpleName(), numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage)); + } + + public void run() { + createTestBAM(); + + SAMDataSource dataSource = new SAMDataSource(Arrays.asList(testBAM), + new ThreadAllocation(), + null, + new GenomeLocParser(header.getSequenceDictionary()), + false, + ValidationStringency.SILENT, + ReadShard.DEFAULT_MAX_READS, // reset ReadShard.MAX_READS to ReadShard.DEFAULT_MAX_READS for each test + downsamplingMethod, + new ValidationExclusion(), + new ArrayList(), + false); + + Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + + SAMRecord readAtEndOfLastShard = null; + int totalReadsSeen = 0; + + for ( Shard shard : shardIterator ) { + int numContigsThisShard = 
0; + SAMRecord lastRead = null; + + for ( SAMRecord read : shard.iterator() ) { + totalReadsSeen++; + + if ( lastRead == null ) { + numContigsThisShard = 1; + } + else if ( ! read.getReadUnmappedFlag() && ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) { + numContigsThisShard++; + } + + // If the last read from the previous shard is not unmapped, we have to make sure + // that no reads in this shard start at the same position + if ( readAtEndOfLastShard != null && ! readAtEndOfLastShard.getReadUnmappedFlag() ) { + Assert.assertFalse(readAtEndOfLastShard.getReferenceIndex().equals(read.getReferenceIndex()) && + readAtEndOfLastShard.getAlignmentStart() == read.getAlignmentStart(), + String.format("Reads from alignment start position %d:%d are split across multiple shards", + read.getReferenceIndex(), read.getAlignmentStart())); + } + + lastRead = read; + } + + // There should never be reads from more than 1 contig in a shard (ignoring unmapped reads) + Assert.assertTrue(numContigsThisShard == 1, "found a shard with reads from multiple contigs"); + + readAtEndOfLastShard = lastRead; + } + + Assert.assertEquals(totalReadsSeen, expectedReadCount, "did not encounter the expected number of reads"); + } + + private void createTestBAM() { + header = ArtificialSAMUtils.createArtificialSamHeader(numContigs, 1, 100000); + SAMReadGroupRecord readGroup = new SAMReadGroupRecord("foo"); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + ArtificialSingleSampleReadStream artificialReads = new ArtificialSingleSampleReadStream(header, + "foo", + numContigs, + numStacksPerContig, + stackSize, + stackSize, + 1, + 100, + 50, + 150, + numUnmappedReads); + + final File testBAMFile = createTempFile("SAMDataSourceFillShardBoundaryTest", ".bam"); + + SAMFileWriter bamWriter = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, true, testBAMFile); + for ( SAMRecord read : artificialReads ) { + bamWriter.addAlignment(read); + } + 
bamWriter.close(); + + testBAM = new SAMReaderID(testBAMFile, new Tags()); + + new File(testBAM.getSamFilePath().replace(".bam", ".bai")).deleteOnExit(); + new File(testBAM.getSamFilePath() + ".bai").deleteOnExit(); + } + } + + @DataProvider(name = "ReadShardBalancerTestDataProvider") + public Object[][] createReadShardBalancerTests() { + for ( int numContigs = 1; numContigs <= 3; numContigs++ ) { + for ( int numStacksPerContig : Arrays.asList(1, 2, 4) ) { + // Use crucial read shard boundary values as the stack sizes + for ( int stackSize : Arrays.asList(ReadShard.DEFAULT_MAX_READS / 2, ReadShard.DEFAULT_MAX_READS / 2 + 10, ReadShard.DEFAULT_MAX_READS, ReadShard.DEFAULT_MAX_READS - 1, ReadShard.DEFAULT_MAX_READS + 1, ReadShard.DEFAULT_MAX_READS * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, ReadShard.DEFAULT_MAX_READS / 2, ReadShard.DEFAULT_MAX_READS * 2) ) { + // The first value will result in no downsampling at all, the others in some downsampling + for ( int downsamplingTargetCoverage : Arrays.asList(ReadShard.DEFAULT_MAX_READS * 10, ReadShard.DEFAULT_MAX_READS, ReadShard.DEFAULT_MAX_READS / 2) ) { + new ReadShardBalancerTest(numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage); + } + } + } + } + } + + return ReadShardBalancerTest.getTests(ReadShardBalancerTest.class); + } + + @Test(dataProvider = "ReadShardBalancerTestDataProvider") + public void runReadShardBalancerTest( ReadShardBalancerTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java new file mode 100644 index 000000000..eb1915f0c --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java @@ -0,0 +1,254 @@ +/* +* Copyright (c) 2012 The 
Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.annotations.AfterMethod; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.testng.Assert.*; + +/** + *

+ * Class SAMDataSourceUnitTest + *

+ * The test of the SAMBAM simple data source. + */ +public class SAMDataSourceUnitTest extends BaseTest { + + // TODO: These legacy tests should really be replaced with a more comprehensive suite of tests for SAMDataSource + + private List readers; + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + /** + * This function does the setup of our parser, before each method call. + *

+ * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() throws FileNotFoundException { + readers = new ArrayList(); + + // sequence + seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference)); + genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); + } + + /** + * Tears down the test fixture after each call. + *

+ * Called after every test case method. + */ + @AfterMethod + public void undoForEachTest() { + seq = null; + readers.clear(); + } + + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testLinearBreakIterateAll() { + logger.warn("Executing testLinearBreakIterateAll"); + + // setup the data + readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); + + // the sharding strat. + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + + Iterable strat = data.createShardIteratorOverMappedReads(new LocusShardBalancer()); + int count = 0; + + try { + for (Shard sh : strat) { + int readCount = 0; + count++; + + GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); + logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); + logger.debug("count = " + count); + GATKSAMIterator datum = data.seek(sh); + + // for the first couple of shards make sure we can see the reads + if (count < 5) { + for (SAMRecord r : datum) { + } + readCount++; + } + datum.close(); + + // if we're over 100 shards, break out + if (count > 100) { + break; + } + } + } + catch (UserException.CouldNotReadInputFile e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. 
+ fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); + } + } + + /** Test that we clear program records when requested */ + @Test + public void testRemoveProgramRecords() { + logger.warn("Executing testRemoveProgramRecords"); + + // setup the data + readers.add(new SAMReaderID(new File(b37GoodBAM),new Tags())); + + // use defaults + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + + List defaultProgramRecords = data.getHeader().getProgramRecords(); + assertTrue(defaultProgramRecords.size() != 0, "testRemoveProgramRecords: No program records found when using default constructor"); + + boolean removeProgramRecords = false; + data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + Collections.emptyList(), + false, + (byte) -1, + removeProgramRecords, + false, + null, IntervalMergingRule.ALL); + + List dontRemoveProgramRecords = data.getHeader().getProgramRecords(); + assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false"); + + removeProgramRecords = true; + data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + Collections.emptyList(), + false, + (byte) -1, + removeProgramRecords, + false, + null, IntervalMergingRule.ALL); + + List doRemoveProgramRecords = data.getHeader().getProgramRecords(); + assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true"); + } + + @Test(expectedExceptions = UserException.class) + 
public void testFailOnReducedReads() { + readers.add(new SAMReaderID(new File(privateTestDir + "old.reduced.bam"), new Tags())); + + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + } + + @Test(expectedExceptions = UserException.class) + public void testFailOnReducedReadsRemovingProgramRecords() { + readers.add(new SAMReaderID(new File(privateTestDir + "old.reduced.bam"), new Tags())); + + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + Collections.emptyList(), + false, + (byte) -1, + true, + false, + null, IntervalMergingRule.ALL); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderIDUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderIDUnitTest.java new file mode 100644 index 000000000..c975fb166 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderIDUnitTest.java @@ -0,0 +1,50 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.reads; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; + +public class SAMReaderIDUnitTest extends BaseTest { + + @Test + public void testSAMReaderIDHashingAndEquality() { + // Test to make sure that two SAMReaderIDs that point at the same file via an absolute vs. 
relative + // path are equal according to equals() and have the same hash code + final File relativePathToBAMFile = new File(publicTestDir + "exampleBAM.bam"); + final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); + final SAMReaderID relativePathSAMReaderID = new SAMReaderID(relativePathToBAMFile, new Tags()); + final SAMReaderID absolutePathSAMReaderID = new SAMReaderID(absolutePathToBAMFile, new Tags()); + + Assert.assertEquals(relativePathSAMReaderID, absolutePathSAMReaderID, "Absolute-path and relative-path SAMReaderIDs not equal according to equals()"); + Assert.assertEquals(relativePathSAMReaderID.hashCode(), absolutePathSAMReaderID.hashCode(), "Absolute-path and relative-path SAMReaderIDs have different hash codes"); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SeekableBufferedStreamUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SeekableBufferedStreamUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SeekableBufferedStreamUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SeekableBufferedStreamUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/TheoreticalMinimaBenchmark.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSourceIntegrationTest.java 
b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSourceIntegrationTest.java new file mode 100644 index 000000000..a544d716a --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSourceIntegrationTest.java @@ -0,0 +1,75 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.reference; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.io.IOException; + +public class ReferenceDataSourceIntegrationTest extends WalkerTest { + + @Test + public void testReferenceWithMissingFaiFile() throws IOException { + final File dummyReference = createTempFile("dummy", ".fasta"); + final File dictFile = new File(dummyReference.getAbsolutePath().replace(".fasta", ".dict")); + dictFile.deleteOnExit(); + Assert.assertTrue(dictFile.createNewFile()); + + final WalkerTestSpec spec = new WalkerTestSpec( + " -T TestPrintReadsWalker" + + " -R " + dummyReference.getAbsolutePath() + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -o %s", + 1, + UserException.MissingReferenceFaiFile.class + ); + + executeTest("testReferenceWithMissingFaiFile", spec); + } + + @Test + public void testReferenceWithMissingDictFile() throws IOException { + final File dummyReference = createTempFile("dummy", ".fasta"); + final File faiFile = new File(dummyReference.getAbsolutePath() + ".fai"); + faiFile.deleteOnExit(); + Assert.assertTrue(faiFile.createNewFile()); + + final WalkerTestSpec spec = new WalkerTestSpec( + " -T TestPrintReadsWalker" + + " -R " + dummyReference.getAbsolutePath() + + " -I " + privateTestDir + "NA12878.4.snippet.bam" + + " -o %s", + 1, + UserException.MissingReferenceDictFile.class + ); + + executeTest("testReferenceWithMissingDictFile", spec); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java new file mode 100644 index 000000000..a77c0961c --- /dev/null +++ 
b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java @@ -0,0 +1,208 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.datasources.rmd; + +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.testng.Assert; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.codecs.table.TableFeature; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet.RMDStorageType; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; + +import static org.testng.Assert.assertTrue; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +/** + * User: hanna + * Date: May 21, 2009 + * Time: 11:03:04 AM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. + * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Test the contents and number of iterators in the pool. 
+ */ + +public class ReferenceOrderedDataPoolUnitTest extends BaseTest { + + private RMDTriplet triplet = null; + private RMDTrackBuilder builder = null; + + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + private GenomeLoc testSite1; + private GenomeLoc testSite2; + private GenomeLoc testSite3; + + private GenomeLoc testInterval1; // an interval matching testSite1 -> testSite2 for queries + private GenomeLoc testInterval2; // an interval matching testSite2 -> testSite3 for queries + + + @BeforeClass + public void init() throws FileNotFoundException { + seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(seq); + + testSite1 = genomeLocParser.createGenomeLoc("chrM",10); + testSite2 = genomeLocParser.createGenomeLoc("chrM",20); + testSite3 = genomeLocParser.createGenomeLoc("chrM",30); + testInterval1 = genomeLocParser.createGenomeLoc("chrM",10,20); + testInterval2 = genomeLocParser.createGenomeLoc("chrM",20,30); + } + + @BeforeMethod + public void setUp() { + String fileName = privateTestDir + "TabularDataTest.dat"; + + triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags()); + // disable auto-index creation/locking in the RMDTrackBuilder for tests + builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true,null); + } + + @Test + public void testCreateSingleIterator() { + ResourcePool iteratorPool = new ReferenceOrderedDataPool(triplet,builder,seq.getSequenceDictionary(),genomeLocParser,false); + LocationAwareSeekableRODIterator iterator = (LocationAwareSeekableRODIterator)iteratorPool.iterator( new MappedStreamSegment(testSite1) ); + + Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); + + TableFeature datum = 
(TableFeature)iterator.next().get(0).getUnderlyingObject(); + + assertTrue(datum.getLocation().equals(testSite1)); + assertTrue(datum.get("COL1").equals("A")); + assertTrue(datum.get("COL2").equals("B")); + assertTrue(datum.get("COL3").equals("C")); + + iteratorPool.release(iterator); + + Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 1, "Number of available iterators in the pool is incorrect"); + } + + @Test + public void testCreateMultipleIterators() { + ReferenceOrderedQueryDataPool iteratorPool = new ReferenceOrderedQueryDataPool(triplet,builder,seq.getSequenceDictionary(),genomeLocParser); + LocationAwareSeekableRODIterator iterator1 = iteratorPool.iterator( new MappedStreamSegment(testInterval1) ); + + // Create a new iterator at position 2. + LocationAwareSeekableRODIterator iterator2 = iteratorPool.iterator( new MappedStreamSegment(testInterval2) ); + + Assert.assertEquals(iteratorPool.numIterators(), 2, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); + + // Test out-of-order access: first iterator2, then iterator1. + // Ugh...first call to a region needs to be a seek. + TableFeature datum = (TableFeature)iterator2.seekForward(testSite2).get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite2)); + assertTrue(datum.get("COL1").equals("C")); + assertTrue(datum.get("COL2").equals("D")); + assertTrue(datum.get("COL3").equals("E")); + + datum = (TableFeature)iterator1.next().get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite1)); + assertTrue(datum.get("COL1").equals("A")); + assertTrue(datum.get("COL2").equals("B")); + assertTrue(datum.get("COL3").equals("C")); + + // Advance iterator2, and make sure both iterator's contents are still correct. 
+ datum = (TableFeature)iterator2.next().get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite3)); + assertTrue(datum.get("COL1").equals("F")); + assertTrue(datum.get("COL2").equals("G")); + assertTrue(datum.get("COL3").equals("H")); + + datum = (TableFeature)iterator1.next().get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite2)); + assertTrue(datum.get("COL1").equals("C")); + assertTrue(datum.get("COL2").equals("D")); + assertTrue(datum.get("COL3").equals("E")); + + // Cleanup, and make sure the number of iterators dies appropriately. + iteratorPool.release(iterator1); + + Assert.assertEquals(iteratorPool.numIterators(), 2, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 1, "Number of available iterators in the pool is incorrect"); + + iteratorPool.release(iterator2); + + Assert.assertEquals(iteratorPool.numIterators(), 2, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 2, "Number of available iterators in the pool is incorrect"); + } + + @Test + public void testIteratorConservation() { + ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(triplet,builder,seq.getSequenceDictionary(),genomeLocParser,false); + LocationAwareSeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(testSite1) ); + + Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); + + TableFeature datum = (TableFeature)iterator.next().get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite1)); + assertTrue(datum.get("COL1").equals("A")); + assertTrue(datum.get("COL2").equals("B")); + assertTrue(datum.get("COL3").equals("C")); + + iteratorPool.release(iterator); + + // Create another iterator after the 
current iterator. + iterator = iteratorPool.iterator( new MappedStreamSegment(testSite3) ); + + // Make sure that the previously acquired iterator was reused. + Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); + + datum = (TableFeature)iterator.seekForward(testSite3).get(0).getUnderlyingObject(); + assertTrue(datum.getLocation().equals(testSite3)); + assertTrue(datum.get("COL1").equals("F")); + assertTrue(datum.get("COL2").equals("G")); + assertTrue(datum.get("COL3").equals("H")); + + iteratorPool.release(iterator); + + Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); + Assert.assertEquals(iteratorPool.numAvailableIterators(), 1, "Number of available iterators in the pool is incorrect"); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java new file mode 100644 index 000000000..514b85737 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java @@ -0,0 +1,89 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies 
or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.datasources.rmd; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.refdata.utils.*; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +public class ReferenceOrderedQueryDataPoolUnitTest extends BaseTest{ + @Test + public void testCloseFilePointers() throws IOException { + // Build up query parameters + File file = new File(BaseTest.privateTestDir + "NA12878.hg19.example1.vcf"); + RMDTriplet triplet = new RMDTriplet("test", "VCF", file.getAbsolutePath(), RMDTriplet.RMDStorageType.FILE, new Tags()); + IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); + GenomeLocParser parser = new GenomeLocParser(seq); + GenomeLoc loc = parser.createGenomeLoc("20", 1, 100000); + TestRMDTrackBuilder builder = new TestRMDTrackBuilder(seq.getSequenceDictionary(), parser); + + // Create the query data pool + ReferenceOrderedQueryDataPool pool = new ReferenceOrderedQueryDataPool(triplet, builder, 
seq.getSequenceDictionary(), parser); + + for (int i = 0; i < 3; i++) { + // Ensure our tribble iterators are closed. + CheckableCloseableTribbleIterator.clearThreadIterators(); + Assert.assertTrue(CheckableCloseableTribbleIterator.getThreadIterators().isEmpty(), "Tribble iterators list was not cleared."); + + // Request the the rodIterator + LocationAwareSeekableRODIterator rodIterator = pool.iterator(new MappedStreamSegment(loc)); + + // Run normal iteration over rodIterator + Assert.assertTrue(rodIterator.hasNext(), "Rod iterator does not have a next value."); + GenomeLoc rodIteratorLocation = rodIterator.next().getLocation(); + Assert.assertEquals(rodIteratorLocation.getContig(), "20", "Instead of chr 20 rod iterator was at location " + rodIteratorLocation); + + // Check that the underlying tribbleIterators are still open. + List> tribbleIterators = CheckableCloseableTribbleIterator.getThreadIterators(); + Assert.assertFalse(tribbleIterators.isEmpty(), "Tribble iterators list is empty"); + for (CheckableCloseableTribbleIterator tribbleIterator: tribbleIterators) { + Assert.assertFalse(tribbleIterator.isClosed(), "Tribble iterator is closed but should be still open."); + } + + // Releasing the rodIterator should close the underlying tribbleIterator. + pool.release(rodIterator); + + // Check that the underlying tribbleIterators are now closed. + for (CheckableCloseableTribbleIterator tribbleIterator: tribbleIterators) { + Assert.assertTrue(tribbleIterator.isClosed(), "Tribble iterator is open but should be now closed."); + } + } + + // Extra cleanup. 
+ CheckableCloseableTribbleIterator.clearThreadIterators(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingIntegrationTest.java new file mode 100644 index 000000000..c98243adc --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingIntegrationTest.java @@ -0,0 +1,45 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.Test; + +public class DownsamplingIntegrationTest extends WalkerTest { + + @Test + public void testDetectLowDcovValueWithLocusTraversal() { + final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestCountLociWalker -R " + publicTestDir + "exampleFASTA.fasta -I " + publicTestDir + "exampleBAM.bam -o %s " + + "-dcov " + (DownsamplingMethod.MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS - 1), + 1, + UserException.class + ); + executeTest("testDetectLowDcovValueWithLocusTraversal", spec); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIteratorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIteratorUnitTest.java new file mode 100644 index 000000000..27804c6d1 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIteratorUnitTest.java @@ -0,0 +1,141 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingReadsIterator; +import org.broadinstitute.gatk.utils.downsampling.SimplePositionalDownsampler; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class DownsamplingReadsIteratorUnitTest extends BaseTest { + + private static class DownsamplingReadsIteratorTest extends TestDataProvider { + private DownsamplingReadsIterator downsamplingIter; + private int targetCoverage; + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; + + public DownsamplingReadsIteratorTest( ArtificialSingleSampleReadStream stream, int targetCoverage ) { + super(DownsamplingReadsIteratorTest.class); + + this.stream = stream; + this.targetCoverage = targetCoverage; + + setName(String.format("%s: targetCoverage=%d numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + 
getClass().getSimpleName(), + targetCoverage, + stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); + } + + public void run() { + streamAnalyzer = new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(stream, targetCoverage); + downsamplingIter = new DownsamplingReadsIterator(stream.getGATKSAMIterator(), new SimplePositionalDownsampler(targetCoverage)); + + streamAnalyzer.analyze(downsamplingIter); + + // Check whether the observed properties of the downsampled stream are what they should be + streamAnalyzer.validate(); + + // Allow memory used by this test to be reclaimed + stream = null; + streamAnalyzer = null; + downsamplingIter = null; + } + } + + @DataProvider(name = "DownsamplingReadsIteratorTestDataProvider") + public Object[][] createDownsamplingReadsIteratorTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(5, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + + // Values that don't vary across tests + int targetCoverage = 10; + int minReadLength = 50; + int maxReadLength = 100; + int minDistanceBetweenStacks = 1; + int maxDistanceBetweenStacks = maxReadLength + 1; + + Utils.resetRandomGenerator(); + + // brute force testing! 
+ for ( int numContigs : Arrays.asList(1, 2, 5) ) { + for ( int stacksPerContig : Arrays.asList(1, 2, 10) ) { + for ( int minReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int maxReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, 1, targetCoverage, targetCoverage * 2) ) { + // Only interested in sane read stream configurations here + if ( minReadsPerStack <= maxReadsPerStack ) { + new DownsamplingReadsIteratorTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads), + targetCoverage); + } + } + } + } + } + } + + return DownsamplingReadsIteratorTest.getTests(DownsamplingReadsIteratorTest.class); + } + + @Test(dataProvider = "DownsamplingReadsIteratorTestDataProvider") + public void runDownsamplingReadsIteratorTest( DownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + test.run(); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerUnitTest.java new file mode 100644 index 000000000..8e3ac5f49 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerUnitTest.java @@ -0,0 +1,160 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the 
rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.FractionalDownsampler; +import org.broadinstitute.gatk.utils.downsampling.ReadsDownsampler; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +public class FractionalDownsamplerUnitTest extends BaseTest { + + private static class FractionalDownsamplerTest extends TestDataProvider { + double fraction; + int totalReads; + int expectedMinNumReadsAfterDownsampling; + int expectedMaxNumReadsAfterDownsampling; + int expectedMinDiscardedItems; + int expectedMaxDiscardedItems; + + private static final double EXPECTED_ACCURACY = 0.05; // should be accurate to within +/- this percent + + public FractionalDownsamplerTest( double fraction, int 
totalReads ) { + super(FractionalDownsamplerTest.class); + + this.fraction = fraction; + this.totalReads = totalReads; + + calculateExpectations(); + + setName(String.format("%s: fraction=%.2f totalReads=%d expectedMinNumReadsAfterDownsampling=%d expectedMaxNumReadsAfterDownsampling=%d", + getClass().getSimpleName(), fraction, totalReads, expectedMinNumReadsAfterDownsampling, expectedMaxNumReadsAfterDownsampling)); + } + + private void calculateExpectations() { + // Require an exact match in the 0% and 100% cases + if ( fraction == 0.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = 0; + expectedMinDiscardedItems = expectedMaxDiscardedItems = totalReads; + } + else if ( fraction == 1.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = totalReads; + expectedMinDiscardedItems = expectedMaxDiscardedItems = 0; + } + else { + expectedMinNumReadsAfterDownsampling = Math.max((int)((fraction - EXPECTED_ACCURACY) * totalReads), 0); + expectedMaxNumReadsAfterDownsampling = Math.min((int) ((fraction + EXPECTED_ACCURACY) * totalReads), totalReads); + expectedMinDiscardedItems = totalReads - expectedMaxNumReadsAfterDownsampling; + expectedMaxDiscardedItems = totalReads - expectedMinNumReadsAfterDownsampling; + } + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "FractionalDownsamplerTestDataProvider") + public Object[][] createFractionalDownsamplerTestData() { + for ( double fraction : Arrays.asList(0.0, 0.25, 0.5, 0.75, 1.0) ) { + for ( int totalReads : Arrays.asList(0, 1000, 10000) ) { + new FractionalDownsamplerTest(fraction, totalReads); + } + } + + return 
FractionalDownsamplerTest.getTests(FractionalDownsamplerTest.class); + } + + @Test(dataProvider = "FractionalDownsamplerTestDataProvider") + public void runFractionalDownsamplerTest( FractionalDownsamplerTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + + ReadsDownsampler downsampler = new FractionalDownsampler(test.fraction); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertTrue(downsampledReads.size() >= test.expectedMinNumReadsAfterDownsampling && + downsampledReads.size() <= test.expectedMaxNumReadsAfterDownsampling); + + Assert.assertTrue(downsampler.getNumberOfDiscardedItems() >= 
test.expectedMinDiscardedItems && + downsampler.getNumberOfDiscardedItems() <= test.expectedMaxDiscardedItems); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size()); + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsamplerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsamplerUnitTest.java new file mode 100644 index 000000000..74a936782 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsamplerUnitTest.java @@ -0,0 +1,165 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.Downsampler; +import org.broadinstitute.gatk.utils.downsampling.LevelingDownsampler; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; +import org.testng.Assert; + +import java.util.*; + +public class LevelingDownsamplerUnitTest extends BaseTest { + + private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider { + public enum DataStructure { LINKED_LIST, ARRAY_LIST } + + int targetSize; + int numStacks; + int stackSize; + DataStructure dataStructure; + int expectedSize; + + public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) { + super(LevelingDownsamplerUniformStacksTest.class); + + this.targetSize = targetSize; + this.numStacks = numStacks; + this.stackSize = stackSize; + this.dataStructure = dataStructure; + expectedSize = calculateExpectedDownsampledStackSize(); + + setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d", + getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize)); + } + + public Collection> createStacks() { + Collection> stacks = new ArrayList>(); + + for ( int i = 1; i <= numStacks; i++ ) { + List stack = dataStructure == DataStructure.LINKED_LIST ? 
new LinkedList() : new ArrayList(); + + for ( int j = 1; j <= stackSize; j++ ) { + stack.add(new Object()); + } + + stacks.add(stack); + } + + return stacks; + } + + private int calculateExpectedDownsampledStackSize() { + int numItemsToRemove = numStacks * stackSize - targetSize; + + if ( numStacks == 0 ) { + return 0; + } + else if ( numItemsToRemove <= 0 ) { + return stackSize; + } + + return Math.max(1, stackSize - (numItemsToRemove / numStacks)); + } + } + + @DataProvider(name = "UniformStacksDataProvider") + public Object[][] createUniformStacksTestData() { + for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) { + for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) { + new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure); + } + } + } + } + + return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class); + } + + @Test( dataProvider = "UniformStacksDataProvider" ) + public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + + Downsampler> downsampler = new LevelingDownsampler, Object>(test.targetSize); + + downsampler.submit(test.createStacks()); + + if ( test.numStacks > 0 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { 
+ Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + final int sizeFromDownsampler = downsampler.size(); + List> downsampledStacks = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledStacks.size(), test.numStacks); + + int totalRemainingItems = 0; + for ( List stack : downsampledStacks ) { + Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1); + totalRemainingItems += stack.size(); + } + + Assert.assertEquals(sizeFromDownsampler, totalRemainingItems); + int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); + int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; + + Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + + Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java new file mode 100644 index 000000000..fdc8587ba --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java @@ -0,0 +1,302 @@ +/* +* Copyright (c) 2012 The Broad 
Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.PerSampleDownsamplingReadsIterator; +import org.broadinstitute.gatk.utils.downsampling.ReadsDownsamplerFactory; +import org.broadinstitute.gatk.utils.downsampling.SimplePositionalDownsamplerFactory; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.engine.iterators.VerifyingSamIterator; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.sam.ArtificialMultiSampleReadStream; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class PerSampleDownsamplingReadsIteratorUnitTest extends BaseTest { + + private static class PerSampleDownsamplingReadsIteratorTest extends TestDataProvider { + + // TODO: tests should distinguish between variance across samples and variance within a sample + + private enum StreamDensity { + SPARSE (MAX_READ_LENGTH, MAX_READ_LENGTH * 2), + DENSE (1, MIN_READ_LENGTH), + MIXED (1, MAX_READ_LENGTH * 2), + UNIFORM_DENSE (1, 1), + UNIFORM_SPARSE (MAX_READ_LENGTH * 2, MAX_READ_LENGTH * 2); + + int minDistanceBetweenStacks; + int maxDistanceBetweenStacks; + + StreamDensity( int minDistanceBetweenStacks, int maxDistanceBetweenStacks ) { + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + } + + public String toString() { + return 
String.format("StreamDensity:%d-%d", minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + private enum StreamStackDepth { + NON_UNIFORM_LOW (1, 5), + NON_UNIFORM_HIGH (15, 20), + NON_UNIFORM_MIXED (1, 20), + UNIFORM_SINGLE (1, 1), + UNIFORM_LOW (2, 2), + UNIFORM_HIGH (20, 20), + UNIFORM_MEDIUM (10, 10); // should set target coverage to this value for testing + + int minReadsPerStack; + int maxReadsPerStack; + + StreamStackDepth( int minReadsPerStack, int maxReadsPerStack ) { + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + } + + public boolean isUniform() { + return minReadsPerStack == maxReadsPerStack; + } + + public String toString() { + return String.format("StreamStackDepth:%d-%d", minReadsPerStack, maxReadsPerStack); + } + } + + private enum StreamStacksPerContig { + UNIFORM(20, 20), + NON_UNIFORM(1, 30); + + int minStacksPerContig; + int maxStacksPerContig; + + StreamStacksPerContig( int minStacksPerContig, int maxStacksPerContig ) { + this.minStacksPerContig = minStacksPerContig; + this.maxStacksPerContig = maxStacksPerContig; + } + + public boolean isUniform() { + return minStacksPerContig == maxStacksPerContig; + } + + public String toString() { + return String.format("StreamStacksPerContig:%d-%d", minStacksPerContig, maxStacksPerContig); + } + } + + // Not interested in testing multiple ranges for the read lengths, as none of our current + // downsamplers are affected by read length + private static final int MIN_READ_LENGTH = 50; + private static final int MAX_READ_LENGTH = 150; + + private ReadsDownsamplerFactory downsamplerFactory; + private int targetCoverage; + private int numSamples; + private int minContigs; + private int maxContigs; + private StreamDensity streamDensity; + private StreamStackDepth streamStackDepth; + private StreamStacksPerContig streamStacksPerContig; + private double unmappedReadsFraction; + private int unmappedReadsCount; + private boolean verifySortedness; + + private 
ArtificialMultiSampleReadStream mergedReadStream; + private Map perSampleArtificialReadStreams; + private Map perSampleStreamAnalyzers; + private SAMFileHeader header; + + public PerSampleDownsamplingReadsIteratorTest( ReadsDownsamplerFactory downsamplerFactory, + int targetCoverage, + int numSamples, + int minContigs, + int maxContigs, + StreamDensity streamDensity, + StreamStackDepth streamStackDepth, + StreamStacksPerContig streamStacksPerContig, + double unmappedReadsFraction, + int unmappedReadsCount, + boolean verifySortedness ) { + super(PerSampleDownsamplingReadsIteratorTest.class); + + this.downsamplerFactory = downsamplerFactory; + this.targetCoverage = targetCoverage; + this.numSamples = numSamples; + this.minContigs = minContigs; + this.maxContigs = maxContigs; + this.streamDensity = streamDensity; + this.streamStackDepth = streamStackDepth; + this.streamStacksPerContig = streamStacksPerContig; + this.unmappedReadsFraction = unmappedReadsFraction; + this.unmappedReadsCount = unmappedReadsCount; + this.verifySortedness = verifySortedness; + + header = createHeader(); + createReadStreams(); + + setName(String.format("%s: targetCoverage=%d numSamples=%d minContigs=%d maxContigs=%d %s %s %s unmappedReadsFraction=%.2f unmappedReadsCount=%d verifySortedness=%b", + getClass().getSimpleName(), targetCoverage, numSamples, minContigs, maxContigs, streamDensity, streamStackDepth, streamStacksPerContig, unmappedReadsFraction, unmappedReadsCount, verifySortedness)); + } + + private SAMFileHeader createHeader() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(maxContigs, 1, (streamDensity.maxDistanceBetweenStacks + MAX_READ_LENGTH) * streamStacksPerContig.maxStacksPerContig + 100000); + List readGroups = new ArrayList(numSamples); + List sampleNames = new ArrayList(numSamples); + + for ( int i = 0; i < numSamples; i++ ) { + readGroups.add("ReadGroup" + i); + sampleNames.add("Sample" + i); + } + + return 
ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroups, sampleNames); + } + + private void createReadStreams() { + perSampleArtificialReadStreams = new HashMap(numSamples); + perSampleStreamAnalyzers = new HashMap(numSamples); + + for (SAMReadGroupRecord readGroup : header.getReadGroups() ) { + String readGroupID = readGroup.getReadGroupId(); + String sampleName = readGroup.getSample(); + + int thisSampleNumContigs = MathUtils.randomIntegerInRange(minContigs, maxContigs); + int thisSampleStacksPerContig = MathUtils.randomIntegerInRange(streamStacksPerContig.minStacksPerContig, streamStacksPerContig.maxStacksPerContig); + + int thisSampleNumUnmappedReads = Utils.getRandomGenerator().nextDouble() < unmappedReadsFraction ? unmappedReadsCount : 0; + + ArtificialSingleSampleReadStream thisSampleStream = new ArtificialSingleSampleReadStream(header, + readGroupID, + thisSampleNumContigs, + thisSampleStacksPerContig, + streamStackDepth.minReadsPerStack, + streamStackDepth.maxReadsPerStack, + streamDensity.minDistanceBetweenStacks, + streamDensity.maxDistanceBetweenStacks, + MIN_READ_LENGTH, + MAX_READ_LENGTH, + thisSampleNumUnmappedReads); + perSampleArtificialReadStreams.put(sampleName, thisSampleStream); + perSampleStreamAnalyzers.put(sampleName, new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(thisSampleStream, targetCoverage)); + } + + mergedReadStream = new ArtificialMultiSampleReadStream(perSampleArtificialReadStreams.values()); + } + + public void run() { + GATKSAMIterator downsamplingIter = new PerSampleDownsamplingReadsIterator(mergedReadStream.getGATKSAMIterator(), downsamplerFactory); + + if ( verifySortedness ) { + downsamplingIter = new VerifyingSamIterator(downsamplingIter); + } + + while ( downsamplingIter.hasNext() ) { + SAMRecord read = downsamplingIter.next(); + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + + ArtificialSingleSampleReadStreamAnalyzer analyzer = perSampleStreamAnalyzers.get(sampleName); + if ( analyzer != null ) { + analyzer.update(read); + } + else { + throw new ReviewedGATKException("bug: stream analyzer for sample " + sampleName + " not found"); + } + } + + for ( Map.Entry analyzerEntry : perSampleStreamAnalyzers.entrySet() ) { + ArtificialSingleSampleReadStreamAnalyzer analyzer = analyzerEntry.getValue(); + analyzer.finalizeStats(); + + // Validate the downsampled read stream for each sample individually + analyzer.validate(); + } + + // Allow memory used by this test to be reclaimed: + mergedReadStream = null; + perSampleArtificialReadStreams = null; + perSampleStreamAnalyzers = null; + } + } + + @DataProvider(name = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public Object[][] createPerSampleDownsamplingReadsIteratorTests() { + + Utils.resetRandomGenerator(); + + // Some values don't vary across tests + int targetCoverage = PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.UNIFORM_MEDIUM.minReadsPerStack; + ReadsDownsamplerFactory downsamplerFactory = new SimplePositionalDownsamplerFactory(targetCoverage); + int maxContigs = 3; + boolean verifySortedness = true; + + for ( int numSamples : Arrays.asList(1, 2, 10) ) { + for ( int minContigs = 1; minContigs <= maxContigs; minContigs++ ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamDensity streamDensity : PerSampleDownsamplingReadsIteratorTest.StreamDensity.values() ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamStackDepth streamStackDepth : PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.values() ) { + for (PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig streamStacksPerContig : PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig.values() ) { + for ( double unmappedReadsFraction : Arrays.asList(0.0, 1.0, 0.5) ) { + for ( int unmappedReadsCount : Arrays.asList(1, 50) ) { + new 
PerSampleDownsamplingReadsIteratorTest(downsamplerFactory, + targetCoverage, + numSamples, + minContigs, + maxContigs, + streamDensity, + streamStackDepth, + streamStacksPerContig, + unmappedReadsFraction, + unmappedReadsCount, + verifySortedness); + } + } + } + } + } + } + } + + return PerSampleDownsamplingReadsIteratorTest.getTests(PerSampleDownsamplingReadsIteratorTest.class); + } + + @Test(dataProvider = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public void runPerSampleDownsamplingReadsIteratorTest( PerSampleDownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + test.run(); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerUnitTest.java new file mode 100644 index 000000000..88a1c5d5c --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerUnitTest.java @@ -0,0 +1,133 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* 
restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.ReadsDownsampler; +import org.broadinstitute.gatk.utils.downsampling.ReservoirDownsampler; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class ReservoirDownsamplerUnitTest extends BaseTest { + + private static class ReservoirDownsamplerTest extends TestDataProvider { + int reservoirSize; + int totalReads; + int expectedNumReadsAfterDownsampling; + int expectedNumDiscardedItems; + + public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) { + super(ReservoirDownsamplerTest.class); + + this.reservoirSize = reservoirSize; + this.totalReads = totalReads; + + expectedNumReadsAfterDownsampling = 
Math.min(reservoirSize, totalReads); + expectedNumDiscardedItems = totalReads <= reservoirSize ? 0 : totalReads - reservoirSize; + + setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d", + getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems)); + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "ReservoirDownsamplerTestDataProvider") + public Object[][] createReservoirDownsamplerTestData() { + for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, 0); + for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, totalReads); + } + } + + return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class); + } + + @Test(dataProvider = "ReservoirDownsamplerTestDataProvider") + public void testReservoirDownsampler( ReservoirDownsamplerTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + + ReadsDownsampler downsampler = new ReservoirDownsampler(test.reservoirSize); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + 
if ( test.totalReads > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + Assert.assertEquals(downsampler.size(), test.expectedNumReadsAfterDownsampling); + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); + Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerUnitTest.java new file mode 100644 index 000000000..c22a3eaed --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerUnitTest.java @@ -0,0 +1,333 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* 
copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.downsampling; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.downsampling.ReadsDownsampler; +import org.broadinstitute.gatk.utils.downsampling.SimplePositionalDownsampler; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.*; + +public class SimplePositionalDownsamplerUnitTest extends BaseTest { + + private static class SimplePositionalDownsamplerTest extends TestDataProvider { + int targetCoverage; + int numStacks; + List stackSizes; + List expectedStackSizes; + boolean multipleContigs; + int totalInitialReads; + + public SimplePositionalDownsamplerTest( int targetCoverage, List stackSizes, boolean multipleContigs ) { + super(SimplePositionalDownsamplerTest.class); + + this.targetCoverage = targetCoverage; + this.numStacks = stackSizes.size(); + this.stackSizes = stackSizes; + this.multipleContigs = multipleContigs; + + 
calculateExpectedDownsampledStackSizes(); + + totalInitialReads = 0; + for ( Integer stackSize : stackSizes ) { + totalInitialReads += stackSize; + } + + setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b", + getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs)); + } + + public Collection createReads() { + Collection reads = new ArrayList(); + SAMFileHeader header = multipleContigs ? + ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) : + ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + int refIndex = 0; + int alignmentStart = 1; + int readLength = 100; + + for ( int i = 0; i < numStacks; i++ ) { + if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) { + refIndex++; + } + + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo", + refIndex, alignmentStart, readLength)); + + alignmentStart += 10; + } + + return reads; + } + + private void calculateExpectedDownsampledStackSizes() { + expectedStackSizes = new ArrayList(numStacks); + + for ( Integer stackSize : stackSizes ) { + int expectedSize = targetCoverage >= stackSize ? 
stackSize : targetCoverage; + expectedStackSizes.add(expectedSize); + } + } + } + + @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider") + public Object[][] createSimplePositionalDownsamplerTestData() { + Utils.resetRandomGenerator(); + + for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) { + for ( int contigs = 1; contigs <= 2; contigs++ ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + List stackSizes = new ArrayList(numStacks); + for ( int stack = 1; stack <= numStacks; stack++ ) { + stackSizes.add(Utils.getRandomGenerator().nextInt(targetCoverage * 2) + 1); + } + new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1); + } + } + } + + return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class); + } + + @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" ) + public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + + ReadsDownsampler downsampler = new SimplePositionalDownsampler(test.targetCoverage); + + downsampler.submit(test.createReads()); + + if ( test.numStacks > 1 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else if ( test.numStacks == 1 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + 
Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + if ( test.numStacks == 0 ) { + Assert.assertTrue(downsampledReads.isEmpty()); + } + else { + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads); + + Assert.assertEquals(downsampledStackSizes.size(), test.numStacks); + Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes); + + int numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size(); + int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems(); + Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); + } + + downsampler.resetStats(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } + + private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { + List stackSizes = new ArrayList(); + + if ( downsampledReads.isEmpty() ) { + return stackSizes; + } + + Iterator iter = downsampledReads.iterator(); + Assert.assertTrue(iter.hasNext()); + + SAMRecord previousRead = iter.next(); + int currentStackSize = 1; + + while ( iter.hasNext() ) { + SAMRecord currentRead = iter.next(); + + if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { + stackSizes.add(currentStackSize); + currentStackSize = 1; + } + else if ( 
currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { + Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); + } + else { + currentStackSize++; + } + + previousRead = currentRead; + } + + stackSizes.add(currentStackSize); + return stackSizes; + } + + @Test + public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100)); + downsampler.submit(readStack); + + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + + SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100); + downsampler.signalNoMoreReadsBefore(laterRead); + + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), readStack.size()); + } + + @Test + public void testBasicUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : readStack ) { + 
Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(readStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), readStack.size()); + + for ( SAMRecord read: downsampledReads ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + } + + @Test + public void testMixedMappedAndUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection mappedReadStack = new ArrayList(); + mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100)); + for ( SAMRecord read : mappedReadStack ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + + Collection unmappedReadStack = new ArrayList(); + unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : unmappedReadStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(mappedReadStack); + downsampler.submit(unmappedReadStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), 300); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100); + + int count = 1; + for ( SAMRecord read: downsampledReads ) { + if ( count <= 100 ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + else { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + count++; + } + } + + @Test + public void testGATKSAMRecordSupport() { + ReadsDownsampler downsampler = new 
SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + List reads = new ArrayList(); + for ( int i = 0; i < 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), 10); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/executive/ReduceTreeUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/executive/ReduceTreeUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/executive/ReduceTreeUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/executive/ReduceTreeUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/AllowNCigarMalformedReadFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/AllowNCigarMalformedReadFilterUnitTest.java new file mode 100644 index 000000000..d3fb18896 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/AllowNCigarMalformedReadFilterUnitTest.java @@ -0,0 +1,77 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or 
substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Collections; + + +/** + * Tests for the {@link MalformedReadFilter} when the unsafe flag + * {@link ValidationExclusion.TYPE#ALLOW_N_CIGAR_READS} is set. + * + * @author Valentin Ruano-Rubio + * @since 6/6/13 + */ +public class AllowNCigarMalformedReadFilterUnitTest extends MalformedReadFilterUnitTest { + + + @Override + protected ValidationExclusion composeValidationExclusion() { + return new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS)); + } + + + @Test(enabled = true, + dataProvider= "UnsupportedCigarOperatorDataProvider") + @CigarOperatorTest(CigarOperatorTest.Outcome.IGNORE) + public void testCigarNOperatorFilterIgnore(final String cigarString) { + + final MalformedReadFilter filter = buildMalformedReadFilter(false); + final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString); + Assert.assertFalse(filter.filterOut(nContainingCigarRead), + "filters out N containing Cigar when it should ignore the fact"); + } + + @Test(enabled = false) + @Override + public void testCigarNOperatorFilterException(final String cigarString) { + // Nothing to do here. + // Just deactivates the parents test case. 
+ } + + + + + + + +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/BadCigarFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadCigarFilterUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/BadCigarFilterUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadCigarFilterUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java new file mode 100644 index 000000000..f4232067d --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java @@ -0,0 +1,52 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.Test; + + +public class BadReadGroupsIntegrationTest extends WalkerTest { + + @Test + public void testMissingReadGroup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null", + 0, + UserException.ReadMissingReadGroup.class); + executeTest("test Missing Read Group", spec); + } + + @Test + public void testUndefinedReadGroup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T TestPrintReadsWalker -R " + b36KGReference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null", + 0, + UserException.ReadHasUndefinedReadGroup.class); + executeTest("test Undefined Read Group", spec); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilterUnitTest.java new file mode 100644 index 000000000..20566f510 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilterUnitTest.java @@ -0,0 +1,246 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, 
sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.TextCigarCodec; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.exceptions.UserException.UnsupportedCigarOperatorException; + +import java.lang.annotation.*; +import java.lang.reflect.Method; +import java.util.*; + + +/** + * Tests for the MalformedReadFilter + * + * @author Eric Banks + * @since 3/14/13 + */ +public class MalformedReadFilterUnitTest extends ReadFilterTest { + + ////////////////////////////////////// + // Test the checkSeqStored() method // + ////////////////////////////////////// + + @Test(enabled = true) + public void 
testCheckSeqStored () { + + final GATKSAMRecord goodRead = ArtificialSAMUtils.createArtificialRead(new byte[]{(byte)'A'}, new byte[]{(byte)'A'}, "1M"); + final GATKSAMRecord badRead = ArtificialSAMUtils.createArtificialRead(new byte[]{}, new byte[]{}, "1M"); + badRead.setReadString("*"); + + Assert.assertTrue(MalformedReadFilter.checkSeqStored(goodRead, true)); + Assert.assertFalse(MalformedReadFilter.checkSeqStored(badRead, true)); + + try { + MalformedReadFilter.checkSeqStored(badRead, false); + Assert.assertTrue(false, "We should have exceptioned out in the previous line"); + } catch (UserException e) { } + } + + @Test(enabled = true, dataProvider= "UnsupportedCigarOperatorDataProvider") + @CigarOperatorTest(CigarOperatorTest.Outcome.FILTER) + public void testCigarNOperatorFilterTruePositive(String cigarString) { + + final MalformedReadFilter filter = buildMalformedReadFilter(true); + final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString); + Assert.assertTrue(filter.filterOut(nContainingCigarRead), + " Did not filtered out a N containing CIGAR read"); + } + + @Test(enabled = true, dataProvider= "UnsupportedCigarOperatorDataProvider") + @CigarOperatorTest(CigarOperatorTest.Outcome.ACCEPT) + public void testCigarNOperatorFilterTrueNegative(String cigarString) { + + final MalformedReadFilter filter = buildMalformedReadFilter(true); + final SAMRecord nonNContainingCigarRead = buildSAMRecord(cigarString); + Assert.assertFalse(filter.filterOut(nonNContainingCigarRead), + " Filtered out a non-N containing CIGAR read"); + } + + @Test(enabled = true, + expectedExceptions = UnsupportedCigarOperatorException.class, + dataProvider= "UnsupportedCigarOperatorDataProvider") + @CigarOperatorTest(CigarOperatorTest.Outcome.EXCEPTION) + public void testCigarNOperatorFilterException(final String cigarString) { + + final MalformedReadFilter filter = buildMalformedReadFilter(false); + final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString); + + 
filter.filterOut(nContainingCigarRead); + } + + @Test(enabled = true, dataProvider="UnsupportedCigarOperatorDataProvider") + @CigarOperatorTest(CigarOperatorTest.Outcome.ACCEPT) + public void testCigarNOperatorFilterControl(final String cigarString) { + + final MalformedReadFilter filter = buildMalformedReadFilter(false); + final SAMRecord nonNContainingCigarRead = buildSAMRecord(cigarString); + + Assert.assertFalse(filter.filterOut(nonNContainingCigarRead)); + } + + protected SAMRecord buildSAMRecord(final String cigarString) { + final Cigar nContainingCigar = TextCigarCodec.getSingleton().decode(cigarString); + return this.createRead(nContainingCigar, 1, 0, 10); + } + + protected MalformedReadFilter buildMalformedReadFilter(final boolean filterRNO) { + return buildMalformedReadFiter(filterRNO,new ValidationExclusion.TYPE[] {}); + } + + protected MalformedReadFilter buildMalformedReadFiter(boolean filterRNO, final ValidationExclusion.TYPE... excl) { + final ValidationExclusion ve = new ValidationExclusion(Arrays.asList(excl)); + + final MalformedReadFilter filter = new MalformedReadFilter(); + + final SAMFileHeader h = getHeader(); + final SAMDataSource ds = getDataSource(); + + final GenomeAnalysisEngine gae = new GenomeAnalysisEngine() { + @Override + public SAMFileHeader getSAMFileHeader() { + return h; + } + + @Override + public SAMDataSource getReadsDataSource() { + return ds; + } + }; + filter.initialize(gae); + filter.filterReadsWithNCigar = filterRNO; + return filter; + } + + @Retention(RetentionPolicy.RUNTIME) + @Target(ElementType.METHOD) + @Inherited + protected @interface CigarOperatorTest { + + enum Outcome { + ANY,ACCEPT,FILTER,EXCEPTION,IGNORE; + + public boolean appliesTo (String cigar) { + boolean hasN = cigar.indexOf('N') != -1; + switch (this) { + case ANY: return true; + case ACCEPT: return !hasN; + case IGNORE: return hasN; + case FILTER: + case EXCEPTION: + default: + return hasN; + + } + } + } + + Outcome value() default Outcome.ANY; + } + + 
/** + * Cigar test data for unsupported operator test. + * Each element of this array corresponds to a test case. In turn the first element of the test case array is the + * Cigar string for that test case and the second indicates whether it should be filtered due to the presence of a + * unsupported operator + */ + private static final String[] TEST_CIGARS = { + "101M10D20I10M", + "6M14N5M", + "1N", + "101M", + "110N", + "2N4M", + "4M2N", + "3M1I1M", + "1M2I2M", + "1M10N1I1M", + "1M1I1D", + "11N12M1I34M12N" + }; + + @DataProvider(name= "UnsupportedCigarOperatorDataProvider") + public Iterator unsupportedOperatorDataProvider(final Method testMethod) { + final CigarOperatorTest a = resolveCigarOperatorTestAnnotation(testMethod); + final List result = new LinkedList(); + for (final String cigarString : TEST_CIGARS) { + if (a == null || a.value().appliesTo(cigarString)) { + result.add(new Object[] { cigarString }); + } + } + return result.iterator(); + } + + /** + * Gets the most specific {@link CigarOperatorTest} annotation for the + * signature of the test method provided. + *

+ * This in-house implementation is required due to the fact that method + * annotations do not have inheritance. + * + * @param m targeted test method. + * @return null if there is no {@link CigarOperatorTest} + * annotation in this or overridden methods. + */ + private CigarOperatorTest resolveCigarOperatorTestAnnotation(final Method m) { + CigarOperatorTest res = m.getAnnotation(CigarOperatorTest.class); + if (res != null) { + return res; + } + Class c = this.getClass(); + Class p = c.getSuperclass(); + while (p != null && p != Object.class) { + try { + final Method met = p.getDeclaredMethod(m.getName(), + m.getParameterTypes()); + res = met.getAnnotation(CigarOperatorTest.class); + if (res != null) { + break; + } + } catch (NoSuchMethodException e) { + // Its ok; nothing to do here, just keep looking. + } + c = p; + p = c.getSuperclass(); + } + return res; + } + +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformerUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformerUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformerUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/ReadFilterTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/ReadFilterTest.java new file mode 100644 index 000000000..4e801815f --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/ReadFilterTest.java @@ -0,0 +1,370 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software 
without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.filters; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; + +import java.util.*; + +/** + * Class ReadBaseTest + *

+ * This is the base test class for read filter test classes. All read + * filter test cases should extend from this + * class; it sets ups a header mock up to test read filtering. + * + * Feel free to override non-final method to modify the behavior + * (i.e. change how read group id are formatted, or complete a header). + * + *

+ * You can statically determine the number of read-group involved + * in the test by calling {@link #ReadFilterTest(int)} in you constructor. + *

+ * + * Notice that the same header object is shared by all test and + * it is initialized by Junit (calling {@link #beforeClass()}. + * + * @author Valentin Ruano Rubio + * @date May 23, 2013 + */ +public class ReadFilterTest extends BaseTest { + + private static final int DEFAULT_READ_GROUP_COUNT = 5; + private static final int DEFAULT_READER_COUNT = 1; + private static final String DEFAULT_READ_GROUP_PREFIX = "ReadGroup"; + private static final String DEFAULT_PLATFORM_UNIT_PREFIX = "Lane"; + private static final String DEFAULT_SAMPLE_NAME_PREFIX = "Sample"; + private static final String DEFAULT_PLATFORM_PREFIX = "Platform"; + private static final int DEFAULT_CHROMOSOME_COUNT = 1; + private static final int DEFAULT_CHROMOSOME_START_INDEX = 1; + private static final int DEFAULT_CHROMOSOME_SIZE = 1000; + private static final String DEFAULT_SAM_FILE_FORMAT = "readfile-%3d.bam"; + + private final int groupCount; + + private SAMFileHeader header; + + private SAMDataSource dataSource; + + /** + * Constructs a new read-filter test providing the number of read + * groups in the file. + * + * @param groupCount number of read-group in the fictional SAM file, + * must be equal or greater than 1. + */ + protected ReadFilterTest(final int groupCount) { + if (groupCount < 1) { + throw new IllegalArgumentException( + "the read group count must at least be 1"); + } + this.groupCount = groupCount; + } + + + /** + * Gets the data source. + * + * @throws IllegalStateException if the data source was not initialized + * invoking {@link #beforeClass()} + * @return never null + */ + protected final SAMDataSource getDataSource() { + checkDataSourceExists(); + return dataSource; + } + + /** + * Returns the mock-up SAM file header for testing. 
+ * + * @throws IllegalStateException if the header was not initialized + * invoking {@link #beforeClass()} + * @return never null + */ + protected final SAMFileHeader getHeader() { + checkHeaderExists(); + return header; + } + + /** + * Construct a read filter test with the default number of groups + * ({@link #DEFAULT_READ_GROUP_COUNT}. + */ + public ReadFilterTest() { + this(DEFAULT_READ_GROUP_COUNT); + } + + /** + * Return the number of read groups involved in the test + * @return 1 or greater. + */ + protected final int getReadGroupCount() { + return groupCount; + } + + /** + * Composes the Id for the read group given its index. + * + * This methods must return a unique distinct ID for each possible index and + * it must be the same value each time it is invoked. + * + * @param index the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null and must be unique to each possible + * read group index. + */ + protected String composeReadGroupId(final int index) { + checkReadGroupIndex(index); + return DEFAULT_READ_GROUP_PREFIX + index; + } + + /** + * Composes the Platform name for the read group given its index. + * + * This method must always return the same value give an index. + * + * @param index the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null. + */ + protected String composePlatformName(final int index) { + checkReadGroupIndex(index); + return DEFAULT_PLATFORM_PREFIX + (((index-1)%2)+1); + } + + + /** + * Composes the Platform unit name for the read group given its index. + * + * @param index the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null. + */ + protected String composePlatformUnitName(final int index) { + checkReadGroupIndex(index); + return DEFAULT_PLATFORM_UNIT_PREFIX + (((index-1)%3)+1); + } + + + + /** + * Checks the correctness of a given read group index. 
+ * + * A correct index is any value in the range [1,{@link #getReadGroupCount()}]. + * + * @param index the target index. + * @throws IllegalArgumentException if the input index is not correct. + */ + protected final void checkReadGroupIndex(final int index) { + checkIndex(index,groupCount,"read group"); + } + + + private void checkIndex(final int index, final int max, CharSequence name) { + if (index < 1 || index > max) { + throw new IllegalArgumentException( + name + " index (" + + index + + ") is out of bounds [1," + max + "]"); + } + } + + + /** + * Checks whether the header was initialized. + * + * @throws IllegalStateException if the header was not yet initialized. + */ + protected final void checkHeaderExists() { + if (header == null) { + throw new IllegalArgumentException( + "header has not been initialized;" + + " beforeClass() was not invoked"); + } + } + + /** + * Checks whether the data source was initialized. + * + * @throws IllegalStateException if the data source was not yet initialized. + */ + protected final void checkDataSourceExists() { + if (header == null) { + throw new IllegalArgumentException( + "data source has not been initialized;" + + " beforeClass() was not invoked"); + } + } + + /** + * Returns the ID for a read group given its index. + * + * @param index the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null and must be unique to each + * possible read group index. + */ + protected final String getReadGroupId(final int index) { + checkReadGroupIndex(index); + return getHeader().getReadGroups().get(index - 1).getReadGroupId(); + } + + /** + * Returns the platform name for a read group given its index. + * + * @param group the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null. 
+ */ + protected final String getPlatformName(final int group) { + checkReadGroupIndex(group); + return getHeader().getReadGroups().get(group - 1).getPlatform(); + } + + /** + * Returns the platform unit for a read group given its index. + * + * @param group the index of the targeted read group in the range + * [1,{@link #getReadGroupCount()}] + * @return never null. + */ + protected final String getPlatformUnit(final int group) { + checkReadGroupIndex(group); + return getHeader().getReadGroups().get(group - 1).getPlatformUnit(); + } + + + /** + * Composes the mock up SAM file header. + * + * It must return an equivalent (equal) value each time it is invoked. + * + * @return never null. + */ + protected SAMFileHeader composeHeader() { + + return ArtificialSAMUtils.createArtificialSamHeader( + DEFAULT_CHROMOSOME_COUNT, DEFAULT_CHROMOSOME_START_INDEX, + DEFAULT_CHROMOSOME_SIZE); + } + + @BeforeClass + public void beforeClass() { + + header = composeHeader(); + dataSource = composeDataSource(); + final List readGroupIDs = new ArrayList(); + final List sampleNames = new ArrayList(); + + for (int i = 1; i <= getReadGroupCount(); i++) { + final String readGroupId = composeReadGroupId(i); + readGroupIDs.add(readGroupId); + sampleNames.add(readGroupId); + } + + ArtificialSAMUtils.createEnumeratedReadGroups( + header, readGroupIDs, sampleNames); + + for (int i = 1; i <= getReadGroupCount(); i++) { + final String readGroupId = readGroupIDs.get(i-1); + final SAMReadGroupRecord groupRecord = header.getReadGroup(readGroupId); + groupRecord.setAttribute("PL", composePlatformName(i)); + groupRecord.setAttribute("PU", composePlatformUnitName(i)); + } + + } + + protected ValidationExclusion composeValidationExclusion() { + return new ValidationExclusion(); + } + + protected SAMDataSource composeDataSource() { + checkHeaderExists(); + final Set readerIDs = new HashSet<>(1); + final ThreadAllocation ta = new ThreadAllocation(); + final Integer numFileHandles = 1; // I believe that 
any value would do but need to confirm. + final boolean useOriginalBaseQualities = true; + final ValidationStringency strictness = ValidationStringency.LENIENT; + final Integer readBufferSize = 1; // not relevant. + final DownsamplingMethod downsamplingMethod = DownsamplingMethod.NONE; + final ValidationExclusion exclusionList = composeValidationExclusion(); + final Collection supplementalFilters = Collections.EMPTY_SET; + final boolean includeReadsWithDeletionAtLoci = true; + + final GenomeLocParser glp = new GenomeLocParser(header.getSequenceDictionary()); + final SAMDataSource res = new SAMDataSource( + readerIDs, + ta, + numFileHandles, + glp, + useOriginalBaseQualities, + strictness, + readBufferSize, + downsamplingMethod, + exclusionList, + supplementalFilters, + includeReadsWithDeletionAtLoci); + + return res; + } + + @AfterClass + public void afterClass() { + header = null; + dataSource = null; + } + + /** + * Creates a read record. + * + * @param cigar the new record CIGAR. + * @param group the new record group index that must be in the range \ + * [1,{@link #getReadGroupCount()}] + * @param reference the reference sequence index (0-based) + * @param start the start position of the read alignment in the reference + * (1-based) + * @return never null + */ + protected SAMRecord createRead(final Cigar cigar, final int group, final int reference, final int start) { + final SAMRecord record = ArtificialSAMUtils.createArtificialRead(cigar); + record.setHeader(getHeader()); + record.setAlignmentStart(start); + record.setReferenceIndex(reference); + record.setAttribute(SAMTag.RG.toString(), getReadGroupId(group)); + return record; + + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilterUnitTest.java similarity index 100% rename from 
public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilterUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/ReadGroupBlackListFilterUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/UnsafeMalformedReadFilterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/UnsafeMalformedReadFilterUnitTest.java new file mode 100644 index 000000000..343ad656e --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/filters/UnsafeMalformedReadFilterUnitTest.java @@ -0,0 +1,50 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.filters; + + +import org.broadinstitute.gatk.utils.ValidationExclusion; + +import java.util.Collections; + + +/** + * Tests for the {@link MalformedReadFilter} when the unsafe flag + * {@link ValidationExclusion.TYPE#ALL} is set. + * + * @author Valentin Ruano-Rubio + * @since 6/6/13 + */ +public class UnsafeMalformedReadFilterUnitTest extends AllowNCigarMalformedReadFilterUnitTest { + + + @Override + protected ValidationExclusion composeValidationExclusion() { + return new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)); + } + + +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/io/OutputTrackerUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/io/OutputTrackerUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/io/OutputTrackerUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/io/OutputTrackerUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/io/stubs/ArgumentTypeDescriptorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/io/stubs/ArgumentTypeDescriptorUnitTest.java new file mode 100644 index 000000000..60e529281 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/io/stubs/ArgumentTypeDescriptorUnitTest.java @@ -0,0 +1,233 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* 
+* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.io.stubs; + +import htsjdk.variant.variantcontext.VariantContext; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; +import htsjdk.samtools.SAMFileWriter; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.utils.commandline.*; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Arrays; +import java.util.Collection; + + +public class ArgumentTypeDescriptorUnitTest extends BaseTest { + + //////////////////////////////////////////////////////////////////// + // This section tests the functionality of the @Output annotation // + //////////////////////////////////////////////////////////////////// + + private class ATDTestCommandLineProgram extends CommandLineProgram { + public int execute() { return 0; } + + @Override + public Collection getArgumentTypeDescriptors() { + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + return Arrays.asList( new SAMFileWriterArgumentTypeDescriptor(engine, System.out), + new 
OutputStreamArgumentTypeDescriptor(engine, System.out), + new VCFWriterArgumentTypeDescriptor(engine, System.out, null)); + } + + protected abstract class ATDTestOutputArgumentSource { + public abstract Object getOut(); + } + + protected class OutputRequiredSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputRequiredVcfArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputRequiredStreamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = true) + public PrintStream out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultVcfArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredNoDefaultStreamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) + public PrintStream out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredSamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public SAMFileWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredVcfArgumentSource extends 
ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public VariantContextWriter out; + public Object getOut() { return out; } + } + + protected class OutputNotRequiredStreamArgumentSource extends ATDTestOutputArgumentSource { + @Output(shortName="o", doc="output file", required = false) + public PrintStream out; + public Object getOut() { return out; } + } + } + + @DataProvider(name = "OutputProvider") + public Object[][] OutputProvider() { + + ObjectArrayList tests = new ObjectArrayList(); + + final ATDTestCommandLineProgram clp = new ATDTestCommandLineProgram(); + + for ( final Object obj : Arrays.asList(clp.new OutputRequiredSamArgumentSource(), clp.new OutputRequiredVcfArgumentSource(), clp.new OutputRequiredStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, true, true, provided}); + } + } + + for ( final Object obj : Arrays.asList(clp.new OutputNotRequiredSamArgumentSource(), clp.new OutputNotRequiredVcfArgumentSource(), clp.new OutputNotRequiredStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, false, true, provided}); + } + } + + for ( final Object obj : Arrays.asList(clp.new OutputNotRequiredNoDefaultSamArgumentSource(), clp.new OutputNotRequiredNoDefaultVcfArgumentSource(), clp.new OutputNotRequiredNoDefaultStreamArgumentSource()) ) { + for ( final boolean provided : Arrays.asList(true, false) ) { + tests.add(new Object[]{obj, false, false, provided}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "OutputProvider") + public void testOutput(final ATDTestCommandLineProgram.ATDTestOutputArgumentSource argumentSource, final boolean required, final boolean hasDefault, final boolean provided) { + + final ParsingEngine parser = new ParsingEngine(new ATDTestCommandLineProgram()); + parser.addArgumentSource(argumentSource.getClass()); + 
parser.parse(provided ? new String[] {"out", "foo"} : new String[] {}); + + try { + parser.loadArgumentsIntoObject(argumentSource); + + if ( !provided && (required || !hasDefault) ) + Assert.assertEquals(argumentSource.getOut(), null); + else if ( !provided ) + Assert.assertNotEquals(argumentSource.getOut(), null); + else if ( argumentSource.getOut() == null || !(argumentSource.getOut() instanceof SAMFileWriterStub) ) // can't test this one case + Assert.assertEquals(!provided, outputIsStdout(argumentSource.getOut())); + + } catch (Exception e) { + throw new ReviewedGATKException(e.getMessage()); + } + } + + @Test + public void testRodBindingsCollection() { + + final ParsingEngine parser = new ParsingEngine(new ATDTestCommandLineProgram()); + + //A list file containing a single VCF + final File listFile = createTempListFile("oneVCF", privateTestDir + "empty.vcf"); + + try { + Object result = ArgumentTypeDescriptor.getRodBindingsCollection(listFile, + parser, + VariantContext.class, + "variant", + new Tags(), + "variantTest"); + if (!(result instanceof RodBindingCollection)) + throw new ReviewedGATKException("getRodBindingsCollection did not return a RodBindingCollection"); + RodBindingCollection rbc = (RodBindingCollection) result; + + Assert.assertEquals(rbc.getType(), VariantContext.class); + Assert.assertEquals(rbc.getRodBindings().size(), 1); + + } catch (IOException e) { + throw new ReviewedGATKException(e.getMessage(), e); + } + + //The same file, now with an extra blank line + final File listFileWithBlank = createTempListFile("oneVCFwithBlankLine", privateTestDir + "empty.vcf", ""); + try { + Object result = ArgumentTypeDescriptor.getRodBindingsCollection(listFileWithBlank, + parser, + VariantContext.class, + "variant", + new Tags(), + "variantTest"); + if (!(result instanceof RodBindingCollection)) + throw new ReviewedGATKException("getRodBindingsCollection did not return a RodBindingCollection"); + RodBindingCollection rbc = (RodBindingCollection) result; 
+ + Assert.assertEquals(rbc.getType(), VariantContext.class); + Assert.assertEquals(rbc.getRodBindings().size(), 1); + + } catch (IOException e) { + throw new ReviewedGATKException(e.getMessage(), e); + } + } + + private static boolean outputIsStdout(final Object out) { + if ( out == null ) { + return false; + } else if ( out instanceof SAMFileWriterStub ) { + return ((SAMFileWriterStub)out).getOutputStream() != System.out; + } else if ( out instanceof VariantContextWriterStub ) { + return ((VariantContextWriterStub)out).getOutputStream() == System.out; + } else if ( out instanceof OutputStreamStub ) { + return ((OutputStreamStub)out).getOutputStream() == System.out; + } + return false; + } + +} \ No newline at end of file diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIteratorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIteratorUnitTest.java new file mode 100644 index 000000000..b295e1230 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIteratorUnitTest.java @@ -0,0 +1,144 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import static org.testng.Assert.fail; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.testng.Assert; + +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; + +import org.testng.annotations.BeforeMethod; + +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + * @version 1.0 + * @date Apr 14, 2009 + *

+ * Class BoundedReadIteratorUnitTest + *

+ * tests for the bounded read iterator. + */ +public class BoundedReadIteratorUnitTest extends BaseTest { + + /** the file list and the fasta sequence */ + private List fl; + private ReferenceSequenceFile seq; + + /** + * This function does the setup of our parser, before each method call. + *

+ * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() throws FileNotFoundException { + fl = new ArrayList(); + } + + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testBounding() { + logger.warn("Executing testBounding"); + // total reads expected + final int expected = 20; + // bound by ten reads + BoundedReadIterator iter = new BoundedReadIterator(new testIterator(), expected); + + int count = 0; + for (SAMRecord rec: iter) { + count++; + } + + Assert.assertEquals(count, expected); + } +} + +class testIterator implements GATKSAMIterator { + SAMFileHeader header; + testIterator() { + header = ArtificialSAMUtils.createArtificialSamHeader(1,1,2000); + } + + public void close() { + + } + + public boolean hasNext() { + return true; + } + + public SAMRecord next() { + return ArtificialSAMUtils.createArtificialRead(header,"blah",0,1,100); + } + + public void remove() { + } + + public Iterator iterator() { + return this; + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapterUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapterUnitTest.java new file mode 100644 index 000000000..fc7465de3 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapterUnitTest.java @@ -0,0 +1,179 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and 
this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.CloseableIterator; +import org.broadinstitute.gatk.utils.BaseTest; +import static org.testng.Assert.assertEquals; + +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.testng.annotations.Test; + +import java.util.Iterator; + +/** + * + * User: aaron + * Date: May 13, 2009 + * Time: 6:58:21 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date May 13, 2009 + *

+ * Class GATKSAMIteratorTest + *

+ * Tests the GATKSAMIteratorAdapter class. + */ +public class GATKSAMIteratorAdapterUnitTest extends BaseTest { + + class MyTestIterator implements Iterator { + + public int count = 0; + + public MyTestIterator() { + count = 0; + } + + public boolean hasNext() { + if (count < 100) { + ++count; + return true; + } else { + return false; + } + } + + public SAMRecord next() { + return null; + } + + public void remove() { + throw new UnsupportedOperationException("Unsupported"); + } + } + + class MyTestCloseableIterator implements CloseableIterator { + public int count = 0; + + public MyTestCloseableIterator() { + count = 0; + } + + public boolean hasNext() { + if (count < 100) { + ++count; + return true; + } else { + return false; + } + } + + public SAMRecord next() { + return null; + } + + public void remove() { + throw new UnsupportedOperationException("Unsupported"); + } + + public void close() { + count = -1; + } + } + + + @Test + public void testNormalIterator() { + final int COUNT = 100; + MyTestIterator it = new MyTestIterator(); + + GATKSAMIterator samIt = GATKSAMIteratorAdapter.adapt(it); + int countCheck = 0; + while (samIt.hasNext()) { + samIt.next(); + ++countCheck; + //logger.warn("cnt = " + countCheck); + } + + assertEquals(countCheck, COUNT); + + assertEquals(countCheck, COUNT); + } + + @Test + public void testCloseableIterator() { + final int COUNT = 100; + + MyTestCloseableIterator it = new MyTestCloseableIterator(); + + GATKSAMIterator samIt = GATKSAMIteratorAdapter.adapt(it); + + int countCheck = 0; + while (samIt.hasNext()) { + samIt.next(); + ++countCheck; + } + + assertEquals(countCheck, COUNT); + } + + @Test + public void testCloseOnCloseableIterator() { + final int COUNT = 100; + + MyTestCloseableIterator it = new MyTestCloseableIterator(); + + GATKSAMIterator samIt = GATKSAMIteratorAdapter.adapt(it); + + + int countCheck = 0; + while (samIt.hasNext()) { + samIt.next(); + ++countCheck; + } + + assertEquals(countCheck, COUNT); + + // check to 
see that the count get's set to -1 + samIt.close(); + assertEquals(it.count, -1); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityUnitTest.java new file mode 100644 index 000000000..994de2b28 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/MisencodedBaseQualityUnitTest.java @@ -0,0 +1,99 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.iterators; + + +import htsjdk.samtools.SAMFileHeader; +import org.broadinstitute.gatk.engine.iterators.MisencodedBaseQualityReadTransformer; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Basic unit test for misencoded quals + */ +public class MisencodedBaseQualityUnitTest extends BaseTest { + + private static final String readBases = "AAAAAAAAAA"; + private static final byte[] badQuals = { 59, 60, 62, 63, 64, 61, 62, 58, 57, 56 }; + private static final byte[] goodQuals = { 60, 60, 60, 60, 60, 60, 60, 60, 60, 60 }; + private static final byte[] fixedQuals = { 28, 29, 31, 32, 33, 30, 31, 27, 26, 25 }; + private SAMFileHeader header; + + @BeforeMethod + public void before() { + // reset the read counter so that we are deterministic + MisencodedBaseQualityReadTransformer.currentReadCounter = 0; + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + } + + private GATKSAMRecord createRead(final boolean useGoodBases) { + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, readBases.getBytes(), + useGoodBases ? 
Arrays.copyOf(goodQuals, goodQuals.length) : + Arrays.copyOf(badQuals, badQuals.length)); + read.setCigarString("10M"); + return read; + } + + @Test(enabled = true) + public void testGoodQuals() { + final List reads = new ArrayList(10000); + for ( int i = 0; i < 10000; i++ ) + reads.add(createRead(true)); + + testEncoding(reads); + } + + @Test(enabled = true, expectedExceptions = {UserException.class}) + public void testBadQualsThrowsError() { + final List reads = new ArrayList(10000); + for ( int i = 0; i < 10000; i++ ) + reads.add(createRead(false)); + + testEncoding(reads); + } + + @Test(enabled = true) + public void testFixBadQuals() { + final GATKSAMRecord read = createRead(false); + final GATKSAMRecord fixedRead = MisencodedBaseQualityReadTransformer.fixMisencodedQuals(read); + for ( int i = 0; i < fixedQuals.length; i++ ) + Assert.assertEquals(fixedQuals[i], fixedRead.getBaseQualities()[i]); + } + + private void testEncoding(final List reads) { + for ( final GATKSAMRecord read : reads ) + MisencodedBaseQualityReadTransformer.checkForMisencodedQuals(read); + } +} \ No newline at end of file diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIteratorUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIteratorUnitTest.java new file mode 100644 index 000000000..d6f4be97a --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIteratorUnitTest.java @@ -0,0 +1,52 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished 
to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Arrays; + + +public class ReadFormattingIteratorUnitTest extends BaseTest { + + @Test + public void testIteratorConsolidatesCigars() { + final Cigar unconsolidatedCigar = TextCigarCodec.getSingleton().decode("3M0M5M0M"); + final SAMRecord unconsolidatedRead = ArtificialSAMUtils.createArtificialRead(unconsolidatedCigar); + + final GATKSAMIterator readIterator = GATKSAMIteratorAdapter.adapt(Arrays.asList(unconsolidatedRead).iterator()); + final ReadFormattingIterator formattingIterator = new ReadFormattingIterator(readIterator, false, (byte)-1); + final SAMRecord postIterationRead = formattingIterator.next(); + + Assert.assertEquals(postIterationRead.getCigarString(), "8M", "Cigar 3M0M5M0M not consolidated correctly by ReadFormattingIterator"); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIteratorUnitTest.java 
b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIteratorUnitTest.java new file mode 100644 index 000000000..c7e7d05d5 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIteratorUnitTest.java @@ -0,0 +1,129 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.iterators; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.List; + +/** + * Created by IntelliJ IDEA. + * User: mhanna + * Date: Mar 2, 2011 + * Time: 9:48:10 PM + * To change this template use File | Settings | File Templates. + */ +public class VerifyingSamIteratorUnitTest { + private SAMFileHeader samFileHeader; + + @BeforeClass + public void init() { + SAMSequenceDictionary sequenceDictionary = new SAMSequenceDictionary(); + sequenceDictionary.addSequence(new SAMSequenceRecord("1",500)); + sequenceDictionary.addSequence(new SAMSequenceRecord("2",500)); + + samFileHeader = new SAMFileHeader(); + samFileHeader.setSequenceDictionary(sequenceDictionary); + } + + @Test + public void testSortedReadsBasic() { + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),1,10); + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),2,10); + List reads = Arrays.asList(read1,read2); + + VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); + + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read2,"Incorrect read in read 2 position"); + Assert.assertFalse(iterator.hasNext(),"Too many reads in iterator"); + } + + @Test + 
public void testSortedReadsAcrossContigs() { + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),2,10); + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(1).getSequenceIndex(),1,10); + List reads = Arrays.asList(read1,read2); + + VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); + + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read2,"Incorrect read in read 2 position"); + Assert.assertFalse(iterator.hasNext(),"Too many reads in iterator"); + } + + @Test(expectedExceptions=UserException.MissortedBAM.class) + public void testImproperlySortedReads() { + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),2,10); + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),1,10); + List reads = Arrays.asList(read1,read2); + + VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); + + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + + // Should trigger MissortedBAM exception. + iterator.next(); + } + + @Test(expectedExceptions=UserException.MalformedBAM.class) + public void testInvalidAlignment() { + // Create an invalid alignment state. 
+ SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),1,10); + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),2,10); + read1.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); + List reads = Arrays.asList(read1,read2); + + VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); + + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); + Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); + + // Should trigger MalformedBAM exception. + iterator.next(); + } + + private SAMSequenceRecord getContig(final int contigIndex) { + return samFileHeader.getSequence(contigIndex); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportUnitTest.java new file mode 100644 index 000000000..3042e3082 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportUnitTest.java @@ -0,0 +1,358 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.phonehome; + +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.jets3t.service.S3Service; +import org.jets3t.service.S3ServiceException; +import org.jets3t.service.ServiceException; +import org.jets3t.service.model.S3Object; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.FileInputStream; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +public class GATKRunReportUnitTest extends BaseTest { + private final 
static boolean DEBUG = false; + private static final long S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING = 30 * 1000; + private static final String AWS_DOWNLOADER_CREDENTIALS_PROPERTIES_FILE = privateTestDir + "phonehome/awsDownloaderCredentials.properties"; + + private Walker walker; + private Exception exception; + private GenomeAnalysisEngine engine; + private String downloaderAccessKey; + private String downloaderSecretKey; + + @BeforeClass + public void setup() throws Exception { + walker = new RunReportDummyReadWalker(); + exception = new IllegalArgumentException("javaException"); + engine = new GenomeAnalysisEngine(); + engine.setArguments(new GATKArgumentCollection()); + + Properties awsProperties = new Properties(); + awsProperties.load(new FileInputStream(AWS_DOWNLOADER_CREDENTIALS_PROPERTIES_FILE)); + downloaderAccessKey = awsProperties.getProperty("accessKey"); + downloaderSecretKey = awsProperties.getProperty("secretKey"); + } + + @Test(enabled = ! DEBUG) + public void testAWSKeysAreValid() { + // throws an exception if they aren't + GATKRunReport.checkAWSAreValid(); + } + + @Test(enabled = ! DEBUG) + public void testAccessKey() throws Exception { + testAWSKey(GATKRunReport.getAWSUploadAccessKey(), GATKRunReport.AWS_ACCESS_KEY_MD5); + } + + @Test(enabled = ! 
DEBUG) + public void testSecretKey() throws Exception { + testAWSKey(GATKRunReport.getAWSUploadSecretKey(), GATKRunReport.AWS_SECRET_KEY_MD5); + } + + private void testAWSKey(final String accessKey, final String expectedMD5) throws Exception { + Assert.assertNotNull(accessKey, "AccessKey should not be null"); + final String actualmd5 = Utils.calcMD5(accessKey); + Assert.assertEquals(actualmd5, expectedMD5); + } + + @DataProvider(name = "GATKReportCreationTest") + public Object[][] makeGATKReportCreationTest() { + List tests = new ArrayList(); + + final Walker readWalker = new RunReportDummyReadWalker(); + final Walker lociWalker = new RunReportDummyLocusWalker(); + final Walker rodWalker = new RunReportDummyRodWalker(); + final Walker artWalker = new RunReportDummyActiveRegionWalker(); + + final Exception noException = null; + final Exception javaException = new IllegalArgumentException("javaException"); + final Exception stingException = new ReviewedGATKException("GATKException"); + final Exception userException = new UserException("userException"); + + final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setArguments(new GATKArgumentCollection()); + + for ( final Walker walker : Arrays.asList(readWalker, lociWalker, rodWalker, artWalker) ) { + for ( final Exception exception : Arrays.asList(noException, javaException, stingException, userException) ) { + tests.add(new Object[]{walker, exception, engine}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "GATKReportCreationTest") + public void testGATKReportCreationReadingAndWriting(final Walker walker, final Exception exception, final GenomeAnalysisEngine engine) throws Exception { + final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.STDOUT); + final ByteArrayOutputStream captureStream = new ByteArrayOutputStream(); + final boolean succeeded = report.postReportToStream(captureStream); + 
Assert.assertTrue(succeeded, "Failed to write report to stream"); + Assert.assertFalse(report.exceptionOccurredDuringPost(), "Post succeeded but report says it failed"); + Assert.assertNull(report.getErrorMessage(), "Post succeeded but there was an error message"); + Assert.assertNull(report.getErrorThrown(), "Post succeeded but there was an error message"); + final InputStream readStream = new ByteArrayInputStream(captureStream.toByteArray()); + + GATKRunReport deserialized = null; + try { + deserialized = GATKRunReport.deserializeReport(readStream); + } catch ( Exception e ) { + final String reportString = new String(captureStream.toByteArray()); + Assert.fail("Failed to deserialize GATK report " + reportString + " with exception " + e); + } + + if ( deserialized != null ) + Assert.assertEquals(report, deserialized); + } + + @DataProvider(name = "GATKAWSReportMode") + public Object[][] makeGATKAWSReportMode() { + List tests = new ArrayList(); + + for ( final GATKRunReport.AWSMode mode : GATKRunReport.AWSMode.values() ) { + tests.add(new Object[]{mode}); + } + + return tests.toArray(new Object[][]{}); + } + + // Will fail with timeout if AWS time out isn't working + // Will fail with exception if AWS doesn't protect itself from errors + @Test(enabled = ! DEBUG, dataProvider = "GATKAWSReportMode", timeOut = S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING * 2) + public void testAWS(final GATKRunReport.AWSMode awsMode) { + logger.warn("Starting testAWS mode=" + awsMode); + + // Use a shorter timeout than usual when we're testing GATKRunReport.AWSMode.TIMEOUT + final long thisTestS3Timeout = awsMode == GATKRunReport.AWSMode.TIMEOUT ? 
30 * 1000 : S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING; + final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.AWS, thisTestS3Timeout); + report.sendAWSToTestBucket(); + report.setAwsMode(awsMode); + final S3Object s3Object = report.postReportToAWSS3(); + + if ( awsMode == GATKRunReport.AWSMode.NORMAL ) { + Assert.assertNotNull(s3Object, "Upload to AWS failed, s3Object was null. error was " + report.formatError()); + Assert.assertFalse(report.exceptionOccurredDuringPost(), "The upload should have succeeded but the report says it didn't. Error was " + report.formatError()); + Assert.assertNull(report.getErrorMessage(), "Report succeeded but an error message was found"); + Assert.assertNull(report.getErrorThrown(), "Report succeeded but an thrown error was found"); + try { + final GATKRunReport deserialized = GATKRunReport.deserializeReport(downloaderAccessKey, downloaderSecretKey, report.getS3ReportBucket(), s3Object); + Assert.assertEquals(report, deserialized); + deleteFromS3(report); + } catch ( Exception e ) { + Assert.fail("Failed to read, deserialize, or delete GATK report " + s3Object.getName() + " with exception " + e); + } + } else { + Assert.assertNull(s3Object, "AWS upload should have failed for mode " + awsMode + " but got non-null s3 object back " + s3Object + " error was " + report.formatError()); + Assert.assertTrue(report.exceptionOccurredDuringPost(), "S3 object was null but the report says that the upload succeeded"); + Assert.assertNotNull(report.getErrorMessage(), "Report succeeded but an error message wasn't found"); + if ( awsMode == GATKRunReport.AWSMode.FAIL_WITH_EXCEPTION ) + Assert.assertNotNull(report.getErrorThrown()); + } + } + + private void deleteFromS3(final GATKRunReport report) throws Exception { + final S3Service s3Service = GATKRunReport.initializeAWSService(downloaderAccessKey, downloaderSecretKey); + // Retrieve the whole data object we created previously + 
s3Service.deleteObject(report.getS3ReportBucket(), report.getReportFileName()); + } + + @DataProvider(name = "PostReportByType") + public Object[][] makePostReportByType() { + List tests = new ArrayList(); + + for ( final GATKRunReport.PhoneHomeOption et : GATKRunReport.PhoneHomeOption.values() ) { + tests.add(new Object[]{et}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = ! DEBUG, dataProvider = "PostReportByType", timeOut = S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING * 2) + public void testPostReportByType(final GATKRunReport.PhoneHomeOption type) { + final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.AWS, S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING); + Assert.assertFalse(report.exceptionOccurredDuringPost(), "An exception occurred during posting the report"); + final boolean succeeded = report.postReport(type); + + if ( type == GATKRunReport.PhoneHomeOption.NO_ET ) + Assert.assertFalse(succeeded, "NO_ET option shouldn't write a report"); + else { + Assert.assertTrue(succeeded, "Any non NO_ET option should succeed in writing a report"); + + if ( type == GATKRunReport.PhoneHomeOption.STDOUT ) { + // nothing to do + } else { + // must have gone to AWS + try { + Assert.assertTrue(report.wentToAWS(), "The report should have gone to AWS but the report says it wasn't"); + deleteFromS3(report); + } catch ( Exception e ) { + Assert.fail("Failed delete GATK report " + report.getReportFileName() + " with exception " + e); + } + } + } + } + + public interface S3Op { + public void apply() throws ServiceException; + } + + // Will fail with timeout if AWS time out isn't working + // Will fail with exception if AWS doesn't protect itself from errors + @Test(timeOut = S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING * 2) + public void testAWSPublicKeyHasAccessControls() throws Exception { + final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.AWS, 
S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING); + report.sendAWSToTestBucket(); + final S3Object s3Object = report.postReportToAWSS3(); + Assert.assertNotNull(s3Object, "Upload to AWS failed, s3Object was null. error was " + report.formatError()); + + // create a service with the public key, and make sure it cannot list or delete + final S3Service s3Service = GATKRunReport.initializeAWSService(GATKRunReport.getAWSUploadAccessKey(), GATKRunReport.getAWSUploadSecretKey()); + assertOperationNotAllowed("listAllBuckets", new S3Op() { + @Override + public void apply() throws S3ServiceException { + s3Service.listAllBuckets(); + } + }); + assertOperationNotAllowed("listBucket", new S3Op() { + @Override + public void apply() throws S3ServiceException { s3Service.listObjects(report.getS3ReportBucket()); } + }); + assertOperationNotAllowed("createBucket", new S3Op() { + @Override + public void apply() throws S3ServiceException { s3Service.createBucket("ShouldNotCreate"); } + }); + assertOperationNotAllowed("deleteObject", new S3Op() { + @Override + public void apply() throws ServiceException { s3Service.deleteObject(report.getS3ReportBucket(), report.getReportFileName()); } + }); + } + + private void assertOperationNotAllowed(final String name, final S3Op op) { + try { + op.apply(); + // only gets here if the operation was successful + Assert.fail("Operation " + name + " ran successfully but we expected to it fail"); + } catch ( ServiceException e ) { + Assert.assertEquals(e.getErrorCode(), "AccessDenied"); + } + } + + class RunReportDummyReadWalker extends ReadWalker { + @Override + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + class RunReportDummyLocusWalker extends LocusWalker { + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext 
ref, AlignmentContext context) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + class RunReportDummyRodWalker extends RodWalker { + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + class RunReportDummyActiveRegionWalker extends ActiveRegionWalker { + @Override + public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return new ActivityProfileState(ref.getLocus(), 0.0); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/PedReaderUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/PedReaderUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/PedReaderUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/PedReaderUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/SampleDBUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/SampleDBUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/SampleDBUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/SampleDBUnitTest.java diff --git 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/SampleUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/SampleUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/samples/SampleUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/samples/SampleUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/DummyActiveRegionWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/DummyActiveRegionWalker.java new file mode 100644 index 000000000..b32a3db63 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/DummyActiveRegionWalker.java @@ -0,0 +1,116 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.traversals; + +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; + +import java.util.*; + +/** + * ActiveRegionWalker for unit testing + * + * User: depristo + * Date: 1/15/13 + * Time: 1:28 PM + */ +class DummyActiveRegionWalker extends ActiveRegionWalker { + private final double prob; + private EnumSet states = super.desiredReadStates(); + private GenomeLocSortedSet activeRegions = null; + + protected List isActiveCalls = new ArrayList(); + protected Map mappedActiveRegions = new LinkedHashMap(); + private boolean declareHavingPresetRegions = false; + + public DummyActiveRegionWalker() { + this(1.0); + } + + public DummyActiveRegionWalker(double constProb) { + this.prob = constProb; + } + + public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions, EnumSet wantStates, final boolean declareHavingPresetRegions) { + this(activeRegions, declareHavingPresetRegions); + this.states = wantStates; + } + + public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions, final boolean declareHavingPresetRegions) { + this(1.0); + this.activeRegions = activeRegions; + this.declareHavingPresetRegions = declareHavingPresetRegions; + } + + public void setStates(EnumSet states) { + this.states = states; + } + + @Override + public boolean hasPresetActiveRegions() { + return declareHavingPresetRegions; + } + + @Override + public GenomeLocSortedSet getPresetActiveRegions() { + return declareHavingPresetRegions 
? activeRegions : null; + } + + @Override + public EnumSet desiredReadStates() { + return states; + } + + @Override + public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + isActiveCalls.add(ref.getLocus()); + final double p = activeRegions == null || activeRegions.overlaps(ref.getLocus()) ? prob : 0.0; + return new ActivityProfileState(ref.getLocus(), p); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCacheUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java new file mode 100644 index 000000000..b8b9c75f1 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java @@ -0,0 +1,678 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* 
copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import com.google.java.contract.PreconditionError; +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.engine.datasources.reads.*; +import org.broadinstitute.gatk.engine.filters.ReadFilter; +import org.broadinstitute.gatk.engine.iterators.ReadTransformer; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; +import org.broadinstitute.gatk.utils.interval.IntervalUtils; +import org.broadinstitute.gatk.utils.sam.*; +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; +import 
org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.engine.executive.WindowMaker; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + + +import java.io.File; +import java.io.IOException; +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 11/13/12 + * Time: 2:47 PM + * + * Test the Active Region Traversal Contract + * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract + */ +public class TraverseActiveRegionsUnitTest extends BaseTest { + private final static boolean ENFORCE_CONTRACTS = false; + private final static boolean DEBUG = false; + + @DataProvider(name = "TraversalEngineProvider") + public Object[][] makeTraversals() { + final List traversals = new LinkedList(); + traversals.add(new Object[]{new TraverseActiveRegions<>()}); + return traversals.toArray(new Object[][]{}); + } + + private IndexedFastaSequenceFile reference; + private SAMSequenceDictionary dictionary; + private GenomeLocParser genomeLocParser; + + private List intervals; + + private File testBAM; + + @BeforeClass + private void init() throws IOException { + //reference = new CachingIndexedFastaSequenceFile(new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")); // hg19Reference)); + reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); + dictionary = reference.getSequenceDictionary(); + genomeLocParser = new GenomeLocParser(dictionary); + + // TODO: reads with indels + // TODO: reads which span many regions + // TODO: reads which are partially between intervals (in/outside extension) + // 
TODO: duplicate reads + // TODO: read at the end of a contig + // TODO: reads which are completely outside intervals but within extension + // TODO: test the extension itself + // TODO: unmapped reads + + intervals = new ArrayList(); + intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); + intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); + + List reads = new ArrayList(); + reads.add(buildSAMRecord("simple", "1", 100, 200)); + reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); + reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); + reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); + reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); + reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000)); + reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050)); + reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); + reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385)); + reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400)); + reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414)); + reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); + + createBAM(reads); + } + + private void createBAM(List reads) throws IOException { + testBAM = createTempFile("TraverseActiveRegionsUnitTest", ".bam"); + + SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, testBAM); + for 
(GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { + out.addAlignment(read); + } + out.close(); + + new File(testBAM.getAbsolutePath().replace(".bam", ".bai")).deleteOnExit(); + new File(testBAM.getAbsolutePath() + ".bai").deleteOnExit(); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + public void testAllBasesSeen(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + List activeIntervals = getIsActiveIntervals(t, walker, intervals); + // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call + verifyEqualIntervals(intervals, activeIntervals); + } + + private List getIsActiveIntervals(final TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { + List activeIntervals = new ArrayList(); + for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, testBAM)) { + t.traverse(walker, dataProvider, 0); + activeIntervals.addAll(walker.isActiveCalls); + } + + return activeIntervals; + } + + @Test (enabled = ENFORCE_CONTRACTS, dataProvider = "TraversalEngineProvider", expectedExceptions = PreconditionError.class) + public void testIsActiveRangeLow (TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1); + getActiveRegions(t, walker, intervals).values(); + } + + @Test (enabled = ENFORCE_CONTRACTS, dataProvider = "TraversalEngineProvider", expectedExceptions = PreconditionError.class) + public void testIsActiveRangeHigh (TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1); + getActiveRegions(t, walker, intervals).values(); + } + + @Test(enabled = true && ! 
DEBUG, dataProvider = "TraversalEngineProvider") + public void testActiveRegionCoverage(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), true); + + Collection activeRegions = getActiveRegions(t, walker, intervals).values(); + verifyActiveRegionCoverage(intervals, activeRegions); + } + + private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { + List intervalStarts = new ArrayList(); + List intervalStops = new ArrayList(); + + for (GenomeLoc interval : intervals) { + intervalStarts.add(interval.getStartLocation()); + intervalStops.add(interval.getStopLocation()); + } + + Map baseRegionMap = new HashMap(); + + for (ActiveRegion activeRegion : activeRegions) { + for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) { + // Contract: Regions do not overlap + Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region"); + baseRegionMap.put(activeLoc, activeRegion); + } + + GenomeLoc start = activeRegion.getLocation().getStartLocation(); + if (intervalStarts.contains(start)) + intervalStarts.remove(start); + + GenomeLoc stop = activeRegion.getLocation().getStopLocation(); + if (intervalStops.contains(stop)) + intervalStops.remove(stop); + } + + for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) { + // Contract: Each location in the interval(s) is in exactly one region + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region"); + baseRegionMap.remove(baseLoc); + } + + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals"); + + // Contract: All explicit interval boundaries must also be region 
boundaries + Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location"); + Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + public void testActiveRegionExtensionOnContig(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + Collection activeRegions = getActiveRegions(t, walker, intervals).values(); + for (ActiveRegion activeRegion : activeRegions) { + GenomeLoc loc = activeRegion.getExtendedLoc(); + + // Contract: active region extensions must stay on the contig + Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig"); + int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength(); + Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig"); + } + } + + @Test(enabled = true && !DEBUG, dataProvider = "TraversalEngineProvider") + public void testPrimaryReadMapping(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), + EnumSet.of(ActiveRegionReadState.PRIMARY), + true); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_post: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // extended_and_np: 
Primary in 1:1-999, Non-Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_equal: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(t, walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + verifyReadMapping(region, "boundary_unequal", "boundary_1_pre", "boundary_equal", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + verifyReadMapping(region); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + verifyReadMapping(region, "simple20"); + } + + @Test(enabled = true && ! 
DEBUG, dataProvider = "TraversalEngineProvider") + public void testNonPrimaryReadMapping(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY), + true); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // Contract: Each read has the Non-Primary state in all other regions it overlaps + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_post: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // extended_and_np: Primary in 1:1-999, Non-Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_equal: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(t, walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", 
"boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + verifyReadMapping(region, "simple20"); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + public void testExtendedReadMapping(TraverseActiveRegions t) { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED), + true); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // Contract: Each read has the Non-Primary state in all other regions it overlaps + // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // simple20: Primary in 20:10000-10100 + + Map activeRegions = getActiveRegions(t, walker, intervals); + ActiveRegion region; + + region = 
activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); + verifyReadMapping(region, "simple20"); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") + public void testUnmappedReads(TraverseActiveRegions t) { + // TODO + } + + private void verifyReadMapping(ActiveRegion region, String... reads) { + Assert.assertNotNull(region, "Region was unexpectedly null"); + final Set regionReads = new HashSet(); + for (SAMRecord read : region.getReads()) { + Assert.assertFalse(regionReads.contains(read.getReadName()), "Duplicate reads detected in region " + region + " read " + read.getReadName()); + regionReads.add(read.getReadName()); + } + + Collection wantReads = new ArrayList(Arrays.asList(reads)); + for (SAMRecord read : region.getReads()) { + String regionReadName = read.getReadName(); + Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " incorrectly assigned to active region " + region); + wantReads.remove(regionReadName); + } + + Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region + ", wanted " + (wantReads.isEmpty() ? 
"" : wantReads.iterator().next())); + } + + private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { + return getActiveRegions(t, walker, intervals, testBAM); + } + + private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals, final File bam) { + for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam)) + t.traverse(walker, dataProvider, 0); + + return walker.mappedActiveRegions; + } + + private Collection toSingleBaseLocs(GenomeLoc interval) { + List bases = new ArrayList(); + if (interval.size() == 1) + bases.add(interval); + else { + for (int location = interval.getStart(); location <= interval.getStop(); location++) + bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); + } + + return bases; + } + + private Collection toSingleBaseLocs(List intervals) { + Set bases = new TreeSet(); // for sorting and uniqueness + for (GenomeLoc interval : intervals) + bases.addAll(toSingleBaseLocs(interval)); + + return bases; + } + + private void verifyEqualIntervals(List aIntervals, List bIntervals) { + Collection aBases = toSingleBaseLocs(aIntervals); + Collection bBases = toSingleBaseLocs(bIntervals); + + Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size()); + + Iterator aIter = aBases.iterator(); + Iterator bIter = bBases.iterator(); + while (aIter.hasNext() && bIter.hasNext()) { + GenomeLoc aLoc = aIter.next(); + GenomeLoc bLoc = bIter.next(); + Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. 
" + bLoc); + } + } + + // copied from LocusViewTemplate + protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { + SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); + header.setSequenceDictionary(dictionary); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); + GATKSAMRecord record = new GATKSAMRecord(header); + + record.setReadName(readName); + record.setReferenceIndex(dictionary.getSequenceIndex(contig)); + record.setAlignmentStart(alignmentStart); + + Cigar cigar = new Cigar(); + int len = alignmentEnd - alignmentStart + 1; + cigar.add(new CigarElement(len, CigarOperator.M)); + record.setCigar(cigar); + record.setReadString(new String(new char[len]).replace("\0", "A")); + record.setBaseQualities(new byte[len]); + record.setReadGroup(new GATKSAMReadGroupRecord(header.getReadGroup("test"))); + + return record; + } + + private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, File bamFile) { + GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + + Collection samFiles = new ArrayList(); + SAMReaderID readerID = new SAMReaderID(bamFile, new Tags()); + samFiles.add(readerID); + + SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser, + false, + ValidationStringency.STRICT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + new ArrayList(), + false, (byte)30, false, true, null, IntervalMergingRule.ALL); + + engine.setReadsDataSource(dataSource); + final Set samples = ReadUtils.getSAMFileSamples(dataSource.getHeader()); + + traverseActiveRegions.initialize(engine, walker); + List providers = new ArrayList(); + for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ActiveRegionShardBalancer())) { + for 
(WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) { + providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); + } + } + + return providers; + } + + // --------------------------------------------------------------------------------------------------------- + // + // Combinatorial tests to ensure reads are going into the right regions + // + // --------------------------------------------------------------------------------------------------------- + + @DataProvider(name = "CombinatorialARTTilingProvider") + public Object[][] makeCombinatorialARTTilingProvider() { + final List tests = new LinkedList(); + + final List starts = Arrays.asList( + 1, // very start of the chromosome + ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary + ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary + ); + + final List> allReadStates = Arrays.asList( + EnumSet.of(ActiveRegionReadState.PRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY), + EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED) + ); + + final int maxTests = Integer.MAX_VALUE; + int nTests = 0; + for ( final int readLength : Arrays.asList(100) ) { + for ( final int skips : Arrays.asList(0, 10) ) { + for ( final int start : starts ) { + for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) { + for ( final int nLoci : Arrays.asList(1, 1000) ) { + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setSkipNLoci(skips); + bamBuilder.setAlignmentStart(start); + for ( EnumSet readStates : allReadStates ) { + for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), 
bamBuilder.getAlignmentEnd())) { + nTests++; + if ( nTests < maxTests ) // && nTests == 1238 ) + tests.add(new Object[]{new TraverseActiveRegions<>(), nTests, activeRegions, readStates, bamBuilder}); + } + } + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private Collection enumerateActiveRegions(final int start, final int stop) { + // should basically cut up entire region into equal sized chunks, of + // size 10, 20, 50, 100, etc, alternating skipping pieces so they are inactive + // Need to make sure we include some edge cases: + final List activeRegions = new LinkedList(); + + for ( final int stepSize : Arrays.asList(11, 29, 53, 97) ) { + for ( final boolean startWithActive : Arrays.asList(true, false) ) { + activeRegions.add(makeActiveRegionMask(start, stop, stepSize, startWithActive)); + } + } + + // active region is the whole interval + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start, stop))); + + // active region extends up to the end of the data, but doesn't include start + activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start+10, stop))); + + return activeRegions; + } + + private GenomeLocSortedSet makeActiveRegionMask(final int start, final int stop, final int stepSize, final boolean startWithActive) { + final GenomeLocSortedSet active = new GenomeLocSortedSet(genomeLocParser); + + boolean includeRegion = startWithActive; + for ( int left = start; left < stop; left += stepSize) { + final int right = left + stepSize; + final GenomeLoc region = genomeLocParser.createGenomeLoc("1", left, right); + if ( includeRegion ) + active.add(region); + includeRegion = ! includeRegion; + } + + return active; + } + + + @Test(enabled = true && ! 
DEBUG, dataProvider = "CombinatorialARTTilingProvider") + public void testARTReadsInActiveRegions(final TraverseActiveRegions traversal, final int id, final GenomeLocSortedSet activeRegions, final EnumSet readStates, final ArtificialBAMBuilder bamBuilder) { + logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false); + walker.setStates(readStates); + + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile()); + + final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary + for ( final ActiveRegion region : activeRegionsMap.values() ) { + final Set readNamesInRegion = readNamesInRegion(region); + int nReadsExpectedInRegion = 0; + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); + + boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED) + ? region.getExtendedLoc().overlapsP(readLoc) + : region.getLocation().overlapsP(readLoc); + + if ( ! readStates.contains(ActiveRegionReadState.NONPRIMARY) ) { + if ( alreadySeenReads.contains(read.getReadName()) ) + shouldBeInRegion = false; + else if ( shouldBeInRegion ) + alreadySeenReads.add(read.getReadName()); + } + + String msg = readNamesInRegion.contains(read.getReadName()) == shouldBeInRegion ? "" : "Region " + region + + " failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite"; + Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, msg); + + nReadsExpectedInRegion += shouldBeInRegion ? 
1 : 0; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } + } + + private Set readNamesInRegion(final ActiveRegion region) { + final Set readNames = new LinkedHashSet(region.getReads().size()); + for ( final SAMRecord read : region.getReads() ) + readNames.add(read.getReadName()); + return readNames; + } + + // --------------------------------------------------------------------------------------------------------- + // + // Make sure all insertion reads are properly included in the active regions + // + // --------------------------------------------------------------------------------------------------------- + + @Test(dataProvider = "TraversalEngineProvider", enabled = true && ! DEBUG) + public void ensureAllInsertionReadsAreInActiveRegions(final TraverseActiveRegions traversal) { + + final int readLength = 10; + final int start = 20; + final int nReadsPerLocus = 10; + final int nLoci = 3; + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); + bamBuilder.setReadLength(readLength); + bamBuilder.setAlignmentStart(start); + + // note that the position must be +1 as the read's all I cigar puts the end 1 bp before start, leaving it out of the region + GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(bamBuilder.getHeader(),"allI",0,start+1,readLength); + allI.setCigarString(readLength + "I"); + allI.setReadGroup(new GATKSAMReadGroupRecord(bamBuilder.getHeader().getReadGroups().get(0))); + + bamBuilder.addReads(allI); + + final GenomeLocSortedSet activeRegions = new GenomeLocSortedSet(bamBuilder.getGenomeLocParser()); + activeRegions.add(bamBuilder.getGenomeLocParser().createGenomeLoc("1", 10, 30)); + final List intervals = Arrays.asList( + genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) + ); + + final DummyActiveRegionWalker walker = new 
DummyActiveRegionWalker(activeRegions, false); + + final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile()); + + final ActiveRegion region = activeRegionsMap.values().iterator().next(); + int nReadsExpectedInRegion = 0; + + final Set readNamesInRegion = readNamesInRegion(region); + for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { + Assert.assertTrue(readNamesInRegion.contains(read.getReadName()), + "Region " + region + " should contain read " + read + " with cigar " + read.getCigarString() + " but it wasn't"); + nReadsExpectedInRegion++; + } + + Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicatesUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicatesUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicatesUnitTest.java rename to public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicatesUnitTest.java diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java new file mode 100644 index 000000000..3e6b3f2f4 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java @@ -0,0 +1,167 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, 
distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.traversals; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import org.broadinstitute.gatk.engine.walkers.TestCountReadsWalker; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.gatk.engine.datasources.reads.*; +import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; +import org.broadinstitute.gatk.engine.walkers.ReadWalker; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import 
java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.testng.Assert.fail; + +/** + * + * User: aaron + * Date: Apr 24, 2009 + * Time: 3:42:16 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 24, 2009 + *

+ * Class TraverseReadsUnitTest + *

+ * test traversing reads + */ +public class TraverseReadsUnitTest extends BaseTest { + + private ReferenceSequenceFile seq; + private SAMReaderID bam = new SAMReaderID(new File(validationDataLocation + "index_test.bam"),new Tags()); // TCGA-06-0188.aligned.duplicates_marked.bam"); + private File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta"); + private List bamList; + private ReadWalker countReadWalker; + private File output; + private TraverseReadsNano traversalEngine = null; + + private IndexedFastaSequenceFile ref = null; + private GenomeLocParser genomeLocParser = null; + private GenomeAnalysisEngine engine = null; + + @BeforeClass + public void doOnce() { + try { + ref = new CachingIndexedFastaSequenceFile(refFile); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(refFile,ex); + } + genomeLocParser = new GenomeLocParser(ref); + + engine = new GenomeAnalysisEngine(); + engine.setReferenceDataSource(refFile); + engine.setGenomeLocParser(genomeLocParser); + } + + /** + * This function does the setup of our parser, before each method call. + *

+ * Called before every test case method. + */ + @BeforeMethod + public void doForEachTest() { + output = new File("testOut.txt"); + FileOutputStream out = null; + PrintStream ps; // declare a print stream object + + try { + out = new FileOutputStream(output); + } catch (FileNotFoundException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + fail("Couldn't open the output file"); + } + + bamList = new ArrayList(); + bamList.add(bam); + countReadWalker = new TestCountReadsWalker(); + + traversalEngine = new TraverseReadsNano(1); + traversalEngine.initialize(engine, countReadWalker); + } + + /** Test out that we can shard the file and iterate over every read */ + @Test + public void testUnmappedReadCount() { + SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser); + Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + + countReadWalker.initialize(); + Object accumulator = countReadWalker.reduceInit(); + + for(Shard shard: shardStrategy) { + if (shard == null) { + fail("Shard == null"); + } + + ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null, Collections.emptyList()); + accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator); + dataProvider.close(); + } + + countReadWalker.onTraversalDone(accumulator); + + if (!(accumulator instanceof Long)) { + fail("Count read walker should return a Long."); + } + if (!accumulator.equals(new Long(10000))) { + fail("there should be 10000 mapped reads in the index file, there was " + (accumulator)); + } + } + +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountLociWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountLociWalker.java new file mode 100644 index 000000000..8b7a8d758 --- /dev/null +++ 
b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountLociWalker.java @@ -0,0 +1,58 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; + +import java.io.PrintStream; + +public class TestCountLociWalker extends LocusWalker { + @Output + private PrintStream out; + + @Override + public Integer map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + return 1; + } + + @Override + public Long reduceInit() { + return 0L; + } + + @Override + public Long reduce(final Integer value, final Long sum) { + return value + sum; + } + + @Override + public void onTraversalDone(final Long result) { + out.println(result); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountReadsWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountReadsWalker.java new file mode 100644 index 000000000..cc0162fc1 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestCountReadsWalker.java @@ -0,0 +1,59 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.io.PrintStream; + +public class TestCountReadsWalker extends ReadWalker { + @Output + PrintStream out; + + @Override + public Integer map(final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker) { + return 1; + } + + @Override + public Long reduceInit() { + return 0L; + } + + @Override + public Long reduce(final Integer value, final Long sum) { + return value + sum; + } + + @Override + public void onTraversalDone(final Long result) { + if (out != null) + out.println(result); + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestErrorThrowingWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestErrorThrowingWalker.java new file mode 100644 index 000000000..00774f7b7 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestErrorThrowingWalker.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including 
without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.Input; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; + +public class TestErrorThrowingWalker extends RefWalker implements TreeReducible, NanoSchedulable { + @Input(fullName = "exception", shortName = "E", doc = "Java class of exception to throw", required = true) + public String exceptionToThrow; + + @Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false) + public FailMethod failMethod = FailMethod.MAP; + + @Override + public Integer map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if (ref == null) // only throw exception when we are in proper map, not special map(null) call + return null; + + if (failMethod == FailMethod.MAP) + FailMethod.fail(exceptionToThrow); + + return 0; + } + + 
@Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(final Integer value, final Integer sum) { + if (value != null && failMethod == FailMethod.REDUCE) + FailMethod.fail(exceptionToThrow); + return sum; + } + + @Override + public Integer treeReduce(final Integer lhs, final Integer rhs) { + if (failMethod == FailMethod.TREE_REDUCE) + FailMethod.fail(exceptionToThrow); + return rhs; + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintReadsWalker.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintReadsWalker.java new file mode 100644 index 000000000..bbf653ac1 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintReadsWalker.java @@ -0,0 +1,76 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.walkers; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFileWriter; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.io.NWaySAMFileWriter; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +public class TestPrintReadsWalker extends ReadWalker implements NanoSchedulable { + @Output + private GATKSAMFileWriter out; + + @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) + public boolean NO_PG_TAG = false; + + @Override + public void initialize() { + // All for the no_pg_tag. Should this be in the engine and not in the walker? + final GenomeAnalysisEngine toolkit = getToolkit(); + final SAMFileHeader outputHeader = toolkit.getSAMFileHeader().clone(); + final String PROGRAM_RECORD_NAME = "GATK PrintReads"; + final boolean preSorted = true; + if (toolkit.getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { + NWaySAMFileWriter.setupWriter(out, toolkit, outputHeader, preSorted, this, PROGRAM_RECORD_NAME); + } else { + out.writeHeader(outputHeader); + out.setPresorted(preSorted); + } + } + + @Override + public GATKSAMRecord map(final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker) { + return read; + } + + @Override + public SAMFileWriter reduceInit() { + return out; + } + + @Override + public SAMFileWriter reduce(final GATKSAMRecord read, final SAMFileWriter output) { + output.addAlignment(read); + return output; + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintVariantsWalker.java 
b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintVariantsWalker.java new file mode 100644 index 000000000..8af514693 --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/TestPrintVariantsWalker.java @@ -0,0 +1,99 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.engine.walkers; + +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.*; +import org.broadinstitute.gatk.engine.GATKVCFUtils; +import org.broadinstitute.gatk.engine.SampleUtils; +import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.variant.ChromosomeCountConstants; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; + +import java.util.*; + +public class TestPrintVariantsWalker extends RodWalker implements TreeReducible { + @ArgumentCollection + private StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); + + @Argument(fullName = "fullyDecode", doc = "If true, the incoming VariantContext will be fully decoded", required = false) + private boolean fullyDecode = false; + + @Output + private VariantContextWriter vcfWriter = null; + + private Map vcfRods = null; + + @Override + public void initialize() { + vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); + final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); + headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY)); + final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); + 
vcfWriter.writeHeader(vcfHeader); + } + + @Override + public Integer map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if (tracker == null) + return 0; + final Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); + for (VariantContext vc : vcs) { + if (fullyDecode) + vc = vc.fullyDecode(vcfRods.get(vc.getSource()), getToolkit().lenientVCFProcessing()); + vcfWriter.add(vc); + } + return vcs.isEmpty() ? 0 : 1; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(final Integer counter, final Integer sum) { + return counter + sum; + } + + @Override + public Integer treeReduce(final Integer lhs, final Integer rhs) { + return reduce(lhs, rhs); + } + + @Override + public void onTraversalDone(final Integer sum) { + } +} diff --git a/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/WalkerTest.java b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/WalkerTest.java new file mode 100644 index 000000000..ff6b1242f --- /dev/null +++ b/public/gatk-engine/src/test/java/org/broadinstitute/gatk/engine/walkers/WalkerTest.java @@ -0,0 +1,457 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.engine.walkers; + +import htsjdk.tribble.Tribble; +import htsjdk.tribble.index.Index; +import htsjdk.tribble.index.IndexFactory; +import htsjdk.variant.bcf2.BCF2Utils; +import htsjdk.variant.vcf.VCFCodec; +import org.apache.commons.lang.StringUtils; +import org.broadinstitute.gatk.engine.CommandLineExecutable; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.crypt.CryptUtils; +import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.MD5DB; +import org.broadinstitute.gatk.utils.MD5Mismatch; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.classloader.JVMUtils; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.GATKException; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.testng.Assert; +import org.testng.annotations.AfterSuite; +import org.testng.annotations.BeforeMethod; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.PrintStream; +import java.text.SimpleDateFormat; +import java.util.*; + +public class WalkerTest extends BaseTest { + public static final String gatkKeyFile = CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key"; + + private static final boolean GENERATE_SHADOW_BCF = true; + private static final boolean 
ENABLE_PHONE_HOME_FOR_TESTS = false; + private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false; + private static final boolean ENABLE_AUTO_INDEX_CREATION_AND_LOCKING_FOR_TESTS = false; + + private static MD5DB md5DB = new MD5DB(); + + @BeforeMethod + public void initializeWalkerTests() { + logger.debug("Initializing walker tests"); + Utils.resetRandomGenerator(); + } + + @AfterSuite + public void finalizeWalkerTests() { + logger.debug("Finalizing walker tests"); + md5DB.close(); + } + + public static MD5DB getMd5DB() { + return md5DB; + } + + public void validateOutputBCFIfPossible(final String name, final File resultFile) { + final File bcfFile = BCF2Utils.shadowBCF(resultFile); + if ( bcfFile != null && bcfFile.exists() ) { + logger.warn("Checking shadow BCF output file " + bcfFile + " against VCF file " + resultFile); + try { + assertVCFandBCFFilesAreTheSame(resultFile, bcfFile); + logger.warn(" Shadow BCF PASSED!"); + } catch ( Exception e ) { + Assert.fail("Exception received reading shadow BCFFile " + bcfFile + " for test " + name, e); + } + } + } + + public void validateOutputIndex(final String name, final File resultFile) { + if ( !ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX ) + return; + + File indexFile = Tribble.indexFile(resultFile); + //System.out.println("Putative index file is " + indexFile); + if ( indexFile.exists() ) { + if ( resultFile.getAbsolutePath().contains(".vcf") ) { + // todo -- currently we only understand VCF files! Blow up since we can't test them + throw new GATKException("Found an index created for file " + resultFile + " but we can only validate VCF files. Extend this code!"); + } + + System.out.println("Verifying on-the-fly index " + indexFile + " for test " + name + " using file " + resultFile); + Index indexFromOutputFile = IndexFactory.createDynamicIndex(resultFile, new VCFCodec()); + Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath()); + + if ( ! 
indexFromOutputFile.equalsIgnoreProperties(dynamicIndex) ) { + Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n", + indexFromOutputFile.getProperties(), + dynamicIndex.getProperties())); + } + } + } + + public List assertMatchingMD5s(final String testName, final String testClassName, List resultFiles, List expectedMD5s) { + List md5s = new ArrayList(); + List fails = new ArrayList(); + + for (int i = 0; i < resultFiles.size(); i++) { + MD5DB.MD5Match result = getMd5DB().testFileMD5(testName, testClassName, resultFiles.get(i), expectedMD5s.get(i), parameterize()); + validateOutputBCFIfPossible(testName, resultFiles.get(i)); + if ( ! result.failed ) { + validateOutputIndex(testName, resultFiles.get(i)); + md5s.add(result.expectedMD5); + } else { + fails.add(result); + } + } + + if ( ! fails.isEmpty() ) { + List actuals = new ArrayList(); + List expecteds = new ArrayList(); + List diffEngineOutputs = new ArrayList(); + + for ( final MD5DB.MD5Match fail : fails ) { + actuals.add(fail.actualMD5); + expecteds.add(fail.expectedMD5); + diffEngineOutputs.add(fail.diffEngineOutput); + logger.warn("Fail: " + fail.failMessage); + } + + final MD5Mismatch failure = new MD5Mismatch(actuals, expecteds, diffEngineOutputs); + Assert.fail(failure.toString()); + } + + return md5s; + } + + public String buildCommandLine(String... arguments) { + String cmdline = ""; + + for ( int argIndex = 0; argIndex < arguments.length; argIndex++ ) { + cmdline += arguments[argIndex]; + + if (argIndex < arguments.length - 1) { + cmdline += " "; + } + } + + return cmdline; + } + + public class WalkerTestSpec { + // Arguments implicitly included in all Walker command lines, unless explicitly + // disabled using the disableImplicitArgs() method below. 
+ String args = ""; + int nOutputFiles = -1; + List md5s = null; + List exts = null; + Class expectedException = null; + boolean includeImplicitArgs = true; + boolean includeShadowBCF = true; + + // Name of the test class that created this test case + private Class testClass; + + // the default output path for the integration test + private File outputFileLocation = null; + + protected Map auxillaryFiles = new HashMap(); + + public WalkerTestSpec(String args, List md5s) { + this(args, -1, md5s); + } + + public WalkerTestSpec(String args, int nOutputFiles, List md5s) { + this.args = args; + this.nOutputFiles = md5s.size(); + this.md5s = md5s; + this.testClass = getCallingTestClass(); + } + + public WalkerTestSpec(String args, List exts, List md5s) { + this(args, -1, exts, md5s); + } + + public WalkerTestSpec(String args, int nOutputFiles, List exts, List md5s) { + this.args = args; + this.nOutputFiles = md5s.size(); + this.md5s = md5s; + this.exts = exts; + this.testClass = getCallingTestClass(); + } + + // @Test(expectedExceptions) doesn't work in integration tests, so use this instead + public WalkerTestSpec(String args, int nOutputFiles, Class expectedException) { + this.args = args; + this.nOutputFiles = nOutputFiles; + this.expectedException = expectedException; + this.testClass = getCallingTestClass(); + } + + private Class getCallingTestClass() { + return JVMUtils.getCallingClass(getClass()); + } + + public String getTestClassName() { + return testClass.getSimpleName(); + } + + public String getArgsWithImplicitArgs() { + String args = this.args; + if ( includeImplicitArgs ) { + args = args + (ENABLE_PHONE_HOME_FOR_TESTS ? + String.format(" -et %s ", GATKRunReport.PhoneHomeOption.AWS) : + String.format(" -et %s -K %s ", GATKRunReport.PhoneHomeOption.NO_ET, gatkKeyFile)); + if ( includeShadowBCF && GENERATE_SHADOW_BCF ) + args = args + " --generateShadowBCF "; + if ( ! 
ENABLE_AUTO_INDEX_CREATION_AND_LOCKING_FOR_TESTS ) + args = args + " --disable_auto_index_creation_and_locking_when_reading_rods "; + } + + return args; + } + + /** + * In the case where the input VCF files are malformed and cannot be fixed + * this function tells the engine to not try to generate a shadow BCF + * which will ultimately blow up... + */ + public void disableShadowBCF() { this.includeShadowBCF = false; } + public void setOutputFileLocation(File outputFileLocation) { + this.outputFileLocation = outputFileLocation; + } + + protected File getOutputFileLocation() { + return outputFileLocation; + } + + public boolean expectsException() { + return expectedException != null; + } + + public Class getExpectedException() { + if ( ! expectsException() ) throw new ReviewedGATKException("Tried to get expection for walker test that doesn't expect one"); + return expectedException; + } + + public void addAuxFile(String expectededMD5sum, File outputfile) { + auxillaryFiles.put(expectededMD5sum, outputfile); + } + + public void disableImplicitArgs() { + includeImplicitArgs = false; + } + } + + protected boolean parameterize() { + return false; + } + + public enum ParallelTestType { + TREE_REDUCIBLE, + NANO_SCHEDULED, + BOTH + } + + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec, ParallelTestType testType) { + final List ntThreads = testType == ParallelTestType.TREE_REDUCIBLE || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); + final List cntThreads = testType == ParallelTestType.NANO_SCHEDULED || testType == ParallelTestType.BOTH ? 
Arrays.asList(1, 4) : Collections.emptyList(); + + return executeTest(name, spec, ntThreads, cntThreads); + } + + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { + return executeTestParallel(name, spec, ParallelTestType.TREE_REDUCIBLE); + } + + protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List ntThreads, List cpuThreads) { + String originalArgs = spec.args; + Pair, List> results = null; + + boolean ran1 = false; + for ( int nt : ntThreads ) { + String extra = nt == 1 ? "" : (" -nt " + nt); + ran1 = ran1 || nt == 1; + spec.args = originalArgs + extra; + results = executeTest(name + "-nt-" + nt, spec); + } + + for ( int nct : cpuThreads ) { + if ( nct != 1 ) { + String extra = " -nct " + nct; + spec.args = originalArgs + extra; + results = executeTest(name + "-cnt-" + nct, spec); + } + } + + return results; + } + + protected Pair, List> executeTest(final String name, WalkerTestSpec spec) { + List tmpFiles = new ArrayList(); + for (int i = 0; i < spec.nOutputFiles; i++) { + String ext = spec.exts == null ? ".tmp" : "." 
+ spec.exts.get(i); + File fl = createTempFile(String.format("walktest.tmp_param.%d", i), ext); + + // Cleanup any potential shadow BCFs on exit too, if we're generating them + if ( spec.includeShadowBCF && GENERATE_SHADOW_BCF ) { + final File potentalShadowBCFFile = BCF2Utils.shadowBCF(fl); + potentalShadowBCFFile.deleteOnExit(); + new File(potentalShadowBCFFile.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION).deleteOnExit(); + } + + tmpFiles.add(fl); + } + + final String args = String.format(spec.getArgsWithImplicitArgs(), tmpFiles.toArray()); + System.out.println(Utils.dupString('-', 80)); + + if ( spec.expectsException() ) { + // this branch handles the case were we are testing that a walker will fail as expected + return executeTest(name, spec.getTestClassName(), spec.getOutputFileLocation(), null, tmpFiles, args, spec.getExpectedException()); + } else { + List md5s = new LinkedList(); + md5s.addAll(spec.md5s); + + // check to see if they included any auxillary files, if so add them to the list and set them to be deleted on exit + for (String md5 : spec.auxillaryFiles.keySet()) { + md5s.add(md5); + final File auxFile = spec.auxillaryFiles.get(md5); + auxFile.deleteOnExit(); + tmpFiles.add(auxFile); + } + return executeTest(name, spec.getTestClassName(), spec.getOutputFileLocation(), md5s, tmpFiles, args, null); + } + } + + private void qcMD5s(String name, List md5s) { + final String exampleMD5 = "709a1f482cce68992c637da3cff824a8"; + for (String md5 : md5s) { + if ( md5 == null ) + throw new IllegalArgumentException("Null MD5 found in test " + name); + if ( md5.equals("") ) // ok + continue; + if ( ! 
StringUtils.isAlphanumeric(md5) ) + throw new IllegalArgumentException("MD5 contains non-alphanumeric characters test " + name + " md5=" + md5); + if ( md5.length() != exampleMD5.length() ) + throw new IllegalArgumentException("Non-empty MD5 of unexpected number of characters test " + name + " md5=" + md5); + } + } + + + /** + * execute the test, given the following: + * @param testName the name of the test + * @param testClassName the name of the class that contains the test + * @param md5s the list of md5s + * @param tmpFiles the temp file corresponding to the md5 list + * @param args the argument list + * @param expectedException the expected exception or null + * @return a pair of file and string lists + */ + private Pair, List> executeTest(String testName, String testClassName, File outputFileLocation, List md5s, List tmpFiles, String args, Class expectedException) { + if ( md5s != null ) qcMD5s(testName, md5s); + + if (outputFileLocation != null) + args += " -o " + outputFileLocation.getAbsolutePath(); + executeTest(testName, testClassName, args, expectedException); + + if ( expectedException != null ) { + return null; + } else { + // we need to check MD5s + return new Pair, List>(tmpFiles, assertMatchingMD5s(testName, testClassName, tmpFiles, md5s)); + } + } + + /** + * execute the test, given the following: + * @param testName the name of the test + * @param testClassName the name of the class that contains the test + * @param args the argument list + * @param expectedException the expected exception or null + */ + private void executeTest(String testName, String testClassName, String args, Class expectedException) { + CommandLineGATK instance = new CommandLineGATK(); + String[] command = Utils.escapeExpressions(args); + // run the executable + boolean gotAnException = false; + try { + final String now = new SimpleDateFormat("HH:mm:ss").format(new Date()); + final String cmdline = Utils.join(" ",command); + System.out.println(String.format("[%s] Executing 
test %s:%s with GATK arguments: %s", now, testClassName, testName, cmdline)); + // also write the command line to the HTML log for convenient follow-up + // do the replaceAll so paths become relative to the current + BaseTest.log(cmdline.replaceAll(publicTestDirRoot, "").replaceAll(privateTestDirRoot, "")); + CommandLineExecutable.start(instance, command); + } catch (Exception e) { + gotAnException = true; + if ( expectedException != null ) { + // we expect an exception + //System.out.println(String.format("Wanted exception %s, saw %s", expectedException, e.getClass())); + if ( expectedException.isInstance(e) ) { + // it's the type we expected + //System.out.println(String.format(" => %s PASSED", name)); + } else { + final String message = String.format("Test %s:%s expected exception %s but instead got %s with error message %s", + testClassName, testName, expectedException, e.getClass(), e.getMessage()); + if ( e.getCause() != null ) { + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final PrintStream ps = new PrintStream(baos); + e.getCause().printStackTrace(ps); + BaseTest.log(message); + BaseTest.log(baos.toString()); + } + Assert.fail(message); + } + } else { + // we didn't expect an exception but we got one :-( + throw new RuntimeException(e); + } + } + + // catch failures from the integration test + if ( expectedException != null ) { + if ( ! 
gotAnException ) + // we expected an exception but didn't see it + Assert.fail(String.format("Test %s:%s expected exception %s but none was thrown", testClassName, testName, expectedException.toString())); + } else { + if ( CommandLineExecutable.result != 0) { + throw new RuntimeException("Error running the GATK with arguments: " + args); + } + } + } + + + protected File createTempFileFromBase(final String name) { + File fl = new File(name); + fl.deleteOnExit(); + return fl; + } +} diff --git a/public/gatk-queue-extensions-generator/pom.xml b/public/gatk-queue-extensions-generator/pom.xml index d1c75c578..9799191a8 100644 --- a/public/gatk-queue-extensions-generator/pom.xml +++ b/public/gatk-queue-extensions-generator/pom.xml @@ -21,7 +21,7 @@ ${project.groupId} - gatk-tools-public + gatk-engine ${project.version} diff --git a/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentDefinitionField.java b/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentDefinitionField.java index 1e9e5cc45..1a6cda658 100644 --- a/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentDefinitionField.java +++ b/public/gatk-queue-extensions-generator/src/main/java/org/broadinstitute/gatk/queue/extensions/gatk/ArgumentDefinitionField.java @@ -507,7 +507,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { @Override protected String getFreezeFields() { return String.format( ("if (%2$s != null && !org.broadinstitute.gatk.utils.io.IOUtils.isSpecialFile(%2$s))%n" + - " if (!org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor.isCompressed(%2$s.getPath))%n" + + " if (!org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor.isCompressed(%2$s.getPath))%n" + " %1$s = new File(%2$s.getPath + \"%3$s\")%n"), auxFieldName, originalFieldName, Tribble.STANDARD_INDEX_EXTENSION); } diff --git 
a/public/gatk-queue-extensions-public/pom.xml b/public/gatk-queue-extensions-public/pom.xml index a48c3e6e1..b2d833d50 100644 --- a/public/gatk-queue-extensions-public/pom.xml +++ b/public/gatk-queue-extensions-public/pom.xml @@ -39,6 +39,10 @@ log4j log4j + + picard + picard + ${project.groupId} - gatk-tools-public + gatk-utils ${project.version} test-jar test diff --git a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/cancer/MuTect.scala b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/cancer/MuTect.scala index 36031d948..f116af51a 100644 --- a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/cancer/MuTect.scala +++ b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/cancer/MuTect.scala @@ -32,6 +32,7 @@ import org.broadinstitute.gatk.utils.commandline.Input import org.broadinstitute.gatk.utils.commandline.Output import org.broadinstitute.gatk.queue.function.scattergather.ScatterGatherableFunction import org.broadinstitute.gatk.queue.extensions.gatk.{TaggedFile, VcfGatherFunction, LocusScatterFunction} +import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor.isCompressed class MuTect extends org.broadinstitute.gatk.queue.extensions.gatk.CommandLineGATK with ScatterGatherableFunction { analysisName = "MuTect" @@ -409,7 +410,7 @@ class MuTect extends org.broadinstitute.gatk.queue.extensions.gatk.CommandLineGA override def freezeFieldValues() { super.freezeFieldValues() if (vcf != null && !org.broadinstitute.gatk.utils.io.IOUtils.isSpecialFile(vcf)) - if (!org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor.isCompressed(vcf.getPath)) + if (!org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor.isCompressed(vcf.getPath)) vcfIndex = new File(vcf.getPath + ".idx") dbsnpIndexes ++= dbsnp.filter(orig => orig != null).map(orig => new File(orig.getPath + 
".idx")) cosmicIndexes ++= cosmic.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) diff --git a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/DoC/package.scala b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/DoC/package.scala index fd54be631..0bedbf543 100644 --- a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/DoC/package.scala +++ b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/DoC/package.scala @@ -27,7 +27,6 @@ package org.broadinstitute.gatk.queue.extensions.gatk import java.io.{PrintStream, PrintWriter, File} import org.broadinstitute.gatk.queue.function.scattergather.ScatterGatherableFunction -import org.broadinstitute.gatk.engine.downsampling.DownsampleType import org.broadinstitute.gatk.utils.commandline.{Input, Gather, Output} import org.broadinstitute.gatk.queue.function.{InProcessFunction, CommandLineFunction} import org.broadinstitute.gatk.tools.walkers.coverage.CoverageUtils @@ -35,6 +34,7 @@ import scala.collection.JavaConversions._ import scala.Some import org.broadinstitute.gatk.utils.text.XReadLines import org.broadinstitute.gatk.queue.util.VCF_BAM_utilities +import org.broadinstitute.gatk.utils.downsampling.DownsampleType // Minimal refactor from a package object to a file full of classes/objects // due to ongoing bugs with inner classes/objects in package objects: diff --git a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKIntervals.scala b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKIntervals.scala index 0e568b61f..c1d71e281 100644 --- a/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKIntervals.scala +++ 
b/public/gatk-queue-extensions-public/src/main/scala/org/broadinstitute/gatk/queue/extensions/gatk/GATKIntervals.scala @@ -59,7 +59,7 @@ case class GATKIntervals(reference: File, intervals: Seq[File], intervalsString: this.excludeIntervalsString.map(GATKIntervals.createBinding(_, "excludeIntervalsString")) IntervalUtils.parseIntervalBindings( - referenceDataSource, + referenceDataSource.getReference, includeIntervalBindings, intervalSetRule, intervalMergingRule, intervalPadding.getOrElse(0), excludeIntervalBindings).toList diff --git a/public/gatk-queue/pom.xml b/public/gatk-queue/pom.xml index be329a738..525f0b59a 100644 --- a/public/gatk-queue/pom.xml +++ b/public/gatk-queue/pom.xml @@ -21,7 +21,7 @@ ${project.groupId} - gatk-tools-public + gatk-utils ${project.version} @@ -47,7 +47,7 @@ ${project.groupId} - gatk-tools-public + gatk-utils ${project.version} test-jar test diff --git a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobReport.scala b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobReport.scala index be5a17f43..ddc11eb34 100644 --- a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobReport.scala +++ b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobReport.scala @@ -26,8 +26,8 @@ package org.broadinstitute.gatk.queue.util import org.broadinstitute.gatk.queue.function.QFunction -import org.broadinstitute.gatk.engine.report.GATKReportTable import org.broadinstitute.gatk.queue.engine.JobRunInfo +import org.broadinstitute.gatk.utils.report.GATKReportTable /** * A mixin to add Job info to the class diff --git a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobsReporter.scala b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobsReporter.scala index b3b0b33c8..082062364 100644 --- a/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobsReporter.scala +++ 
b/public/gatk-queue/src/main/scala/org/broadinstitute/gatk/queue/util/QJobsReporter.scala @@ -30,9 +30,10 @@ import org.broadinstitute.gatk.utils.io.{Resource} import org.broadinstitute.gatk.queue.engine.{JobRunInfo, QGraph} import org.broadinstitute.gatk.queue.function.QFunction import org.broadinstitute.gatk.utils.R.{RScriptLibrary, RScriptExecutor} -import org.broadinstitute.gatk.engine.report.{GATKReportTable, GATKReport} +import org.broadinstitute.gatk.utils.report.GATKReportTable import org.broadinstitute.gatk.utils.exceptions.UserException import org.apache.commons.io.{FileUtils, IOUtils} +import org.broadinstitute.gatk.utils.report.{GATKReportTable, GATKReport} /** * Writes out RunInfo to a GATKReport diff --git a/public/gatk-queue/src/test/scala/org/broadinstitute/gatk/queue/pipeline/QueueTest.scala b/public/gatk-queue/src/test/scala/org/broadinstitute/gatk/queue/pipeline/QueueTest.scala index 62ac8e1fe..5cd25eb0c 100644 --- a/public/gatk-queue/src/test/scala/org/broadinstitute/gatk/queue/pipeline/QueueTest.scala +++ b/public/gatk-queue/src/test/scala/org/broadinstitute/gatk/queue/pipeline/QueueTest.scala @@ -35,9 +35,9 @@ import org.broadinstitute.gatk.utils.MD5DB import org.broadinstitute.gatk.queue.{QScript, QCommandLine} import org.broadinstitute.gatk.queue.util.Logging import java.io.{FilenameFilter, File} -import org.broadinstitute.gatk.engine.report.GATKReport import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.WildcardFileFilter +import org.broadinstitute.gatk.utils.report.GATKReport object QueueTest extends BaseTest with Logging { diff --git a/public/gatk-root/pom.xml b/public/gatk-root/pom.xml index 7dbb397af..36f2b2256 100644 --- a/public/gatk-root/pom.xml +++ b/public/gatk-root/pom.xml @@ -130,6 +130,12 @@ reflections 0.9.9-RC1 + + + org.slf4j + slf4j-api + 1.6.1 + org.slf4j slf4j-log4j12 @@ -404,6 +410,7 @@ ${gatk.basedir} diff true + ${gatk.queuetests.run} ${java.io.tmpdir} @@ -458,8 +465,7 @@ ${gatk.basedir} diff 
true - - ${gatk.queuetests.run} + ${gatk.queuetests.run} ${java.io.tmpdir} diff --git a/public/gatk-tools-public/pom.xml b/public/gatk-tools-public/pom.xml index 709817355..0a5755026 100644 --- a/public/gatk-tools-public/pom.xml +++ b/public/gatk-tools-public/pom.xml @@ -24,6 +24,26 @@ gatk-engine ${project.version} + + org.apache.commons + commons-jexl + + + + ${project.groupId} + gatk-utils + ${project.version} + test-jar + test + + + + ${project.groupId} + gatk-engine + ${project.version} + test-jar + test + com.google.caliper diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java deleted file mode 100644 index b8221bb16..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineExecutable.java +++ /dev/null @@ -1,229 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine; - -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor; -import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.io.stubs.OutputStreamArgumentTypeDescriptor; -import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterArgumentTypeDescriptor; -import org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor; -import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.crypt.CryptUtils; -import org.broadinstitute.gatk.utils.crypt.GATKKey; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.text.ListFileUtils; - -import java.security.PublicKey; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; - -/** - * @author aaron - */ -public abstract class CommandLineExecutable extends CommandLineProgram { - /** - * The actual engine which performs the analysis. - */ - protected GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - - // get the analysis name - public abstract String getAnalysisName(); - - /** - * Gets the GATK argument bundle. - * @return A structure consisting of whatever arguments should be used to initialize the GATK engine. 
- */ - protected abstract GATKArgumentCollection getArgumentCollection(); - - /** - * A list of all the arguments initially used as sources. - */ - private final Collection argumentSources = new ArrayList(); - - protected static Logger logger = Logger.getLogger(CommandLineExecutable.class); - - /** - * this is the function that the inheriting class can expect to have called - * when the command line system has initialized. - * - * @return the return code to exit the program with - */ - protected int execute() throws Exception { - engine.setParser(parser); - argumentSources.add(this); - - Walker walker = engine.getWalkerByName(getAnalysisName()); - - try { - // Make sure a valid GATK user key is present, if required. - authorizeGATKRun(); - - engine.setArguments(getArgumentCollection()); - - // File lists can require a bit of additional expansion. Set these explicitly by the engine. - final Collection bamFileList=ListFileUtils.unpackBAMFileList(getArgumentCollection().samFiles,parser); - engine.setSAMFileIDs(bamFileList); - if(getArgumentCollection().showFullBamList){ - logger.info(String.format("Adding the following input SAM Files: %s",bamFileList.toString())); - } - - engine.setWalker(walker); - walker.setToolkit(engine); - - Collection filters = engine.createFilters(); - engine.setFilters(filters); - - // load the arguments into the walker / filters. - // TODO: The fact that this extra load call exists here when all the parsing happens at the engine - // TODO: level indicates that we're doing something wrong. Turn this around so that the GATK can drive - // TODO: argument processing. 
- loadArgumentsIntoObject(walker); - argumentSources.add(walker); - - Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser); - engine.setReferenceMetaDataFiles(rodBindings); - - for (ReadFilter filter: filters) { - loadArgumentsIntoObject(filter); - argumentSources.add(filter); - } - - engine.execute(); - generateGATKRunReport(walker); - } catch ( Exception e ) { - generateGATKRunReport(walker, e); - throw e; - } - - // always return 0 - return 0; - } - - /** - * Authorizes this run of the GATK by checking for a valid GATK user key, if required. - * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options. - */ - private void authorizeGATKRun() { - if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET || - getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) { - if ( getArgumentCollection().gatkKeyFile == null ) { - throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " + - "Please see " + UserException.PHONE_HOME_DOCS_URL + - " for more information and instructions on how to obtain a key."); - } - else { - PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey(); - GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile); - - if ( ! gatkUserKey.isValid() ) { - throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile); - } - } - } - } - - /** - * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. - * This report will be written to either STDOUT or to the run repository, depending on the options - * for -et. 
- * - * @param e the exception, can be null if no exception occurred - */ - private void generateGATKRunReport(Walker walker, Exception e) { - if ( getArgumentCollection().phoneHomeType != GATKRunReport.PhoneHomeOption.NO_ET ) { - GATKRunReport report = new GATKRunReport(walker, e, engine, getArgumentCollection().phoneHomeType ); - report.postReport(getArgumentCollection().phoneHomeType); - } - } - - /** - * Convenience method for fully parameterized generateGATKRunReport when an exception has - * not occurred - * - * @param walker - */ - private void generateGATKRunReport(Walker walker) { - generateGATKRunReport(walker, null); - } - - /** - * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. - * @return A collection of type descriptors generating implementation-dependent placeholders. - */ - protected Collection getArgumentTypeDescriptors() { - return Arrays.asList( new VCFWriterArgumentTypeDescriptor(engine,System.out,argumentSources), - new SAMFileWriterArgumentTypeDescriptor(engine,System.out), - new OutputStreamArgumentTypeDescriptor(engine,System.out) ); - } - - /** - * GATK can add arguments dynamically based on analysis type. - * - * @return true - */ - @Override - protected boolean canAddArgumentsDynamically() { - return true; - } - - /** - * GATK provides the walker as an argument source. - * @return List of walkers to load dynamically. - */ - @Override - protected Class[] getArgumentSources() { - // No walker info? No plugins. 
- if (getAnalysisName() == null) return new Class[] {}; - - Collection argumentSources = new ArrayList(); - - Walker walker = engine.getWalkerByName(getAnalysisName()); - engine.setArguments(getArgumentCollection()); - engine.setWalker(walker); - walker.setToolkit(engine); - argumentSources.add(walker.getClass()); - - Collection filters = engine.createFilters(); - for(ReadFilter filter: filters) - argumentSources.add(filter.getClass()); - - Class[] argumentSourcesAsArray = new Class[argumentSources.size()]; - return argumentSources.toArray(argumentSourcesAsArray); - } - - @Override - protected String getArgumentSourceName( Class argumentSource ) { - return engine.getWalkerName((Class)argumentSource); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java deleted file mode 100644 index f88c413bb..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/CommandLineGATK.java +++ /dev/null @@ -1,385 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine; - -import picard.PicardException; -import htsjdk.samtools.SAMException; -import htsjdk.tribble.TribbleException; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; -import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.refdata.tracks.FeatureManager; -import org.broadinstitute.gatk.engine.walkers.Attribution; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.help.*; -import org.broadinstitute.gatk.utils.text.TextFormattingUtils; - -import java.util.*; - -/** - * All command line parameters accepted by all tools in the GATK. - * - *

Info for general users

- * - *

This is a list of options and parameters that are generally available to all tools in the GATK.

- * - *

There may be a few restrictions, which are indicated in individual argument descriptions. For example the -BQSR - * argument is only meant to be used with a subset of tools, and the -pedigree argument will only be effectively used - * by a subset of tools as well. Some arguments conflict with others, and some conversely are dependent on others. This - * is all indicated in the detailed argument descriptions, so be sure to read those in their entirety rather than just - * skimming the one-line summaey in the table.

- * - *

Info for developers

- * - *

This class is the GATK engine itself, which manages map/reduce data access and runs walkers.

- * - *

We run command line GATK programs using this class. It gets the command line args, parses them, and hands the - * gatk all the parsed out information. Pretty much anything dealing with the underlying system should go here; - * the GATK engine should deal with any data related information.

- */ -@DocumentedGATKFeature(groupName = HelpConstants.DOCS_CAT_ENGINE) -public class CommandLineGATK extends CommandLineExecutable { - /** - * A complete list of tools (sometimes also called walkers because they "walk" through the data to perform analyses) - * is available in the online documentation. - */ - @Argument(fullName = "analysis_type", shortName = "T", doc = "Name of the tool to run") - private String analysisName = null; - - // our argument collection, the collection of command line args we accept - @ArgumentCollection - private GATKArgumentCollection argCollection = new GATKArgumentCollection(); - - /** - * Get pleasing info about the GATK. - * - * @return A list of Strings that contain pleasant info about the GATK. - */ - @Override - protected ApplicationDetails getApplicationDetails() { - return new ApplicationDetails(createApplicationHeader(), - getAttribution(), - ApplicationDetails.createDefaultRunningInstructions(getClass()), - getAdditionalHelp()); - } - - @Override - public String getAnalysisName() { - return analysisName; - } - - @Override - protected GATKArgumentCollection getArgumentCollection() { - return argCollection; - } - - /** - * Required main method implementation. - */ - public static void main(String[] argv) { - try { - CommandLineGATK instance = new CommandLineGATK(); - start(instance, argv); - System.exit(CommandLineProgram.result); // todo -- this is a painful hack - } catch (UserException e) { - exitSystemWithUserError(e); - } catch (TribbleException e) { - // We can generate Tribble Exceptions in weird places when e.g. VCF genotype fields are - // lazy loaded, so they aren't caught elsewhere and made into User Exceptions - exitSystemWithUserError(e); - } catch(PicardException e) { - // TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedGATKExceptions? 
- exitSystemWithError(e); - } catch (SAMException e) { - checkForMaskedUserErrors(e); - exitSystemWithSamError(e); - } catch (OutOfMemoryError e) { - exitSystemWithUserError(new UserException.NotEnoughMemory()); - } catch (Throwable t) { - checkForMaskedUserErrors(t); - exitSystemWithError(t); - } - } - - public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; - public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; - public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device"; - public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; - - private static void checkForMaskedUserErrors(final Throwable t) { - // masked out of memory error - if ( t instanceof OutOfMemoryError ) - exitSystemWithUserError(new UserException.NotEnoughMemory()); - // masked user error - if ( t instanceof UserException || t instanceof TribbleException ) - exitSystemWithUserError(new UserException(t.getMessage())); - - // no message means no masked error - final String message = t.getMessage(); - if ( message == null ) - return; - - // too many open files error - if ( message.contains("Too many open files") ) - exitSystemWithUserError(new UserException.TooManyOpenFiles()); - - // malformed BAM looks like a SAM file - if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) - exitSystemWithSamError(t); - - // can't close tribble index when writing - if ( message.contains("Unable to close index for") ) - exitSystemWithUserError(new UserException(t.getCause() == null ? 
message : t.getCause().getMessage())); - - // disk is full - if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) - exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - - // masked error wrapped in another one - if ( t.getCause() != null ) - checkForMaskedUserErrors(t.getCause()); - } - - /** - * Creates the a short blurb about the GATK, copyright info, and where to get documentation. - * - * @return The application header. - */ - public static List createApplicationHeader() { - List header = new ArrayList(); - header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime())); - header.add("Copyright (c) 2010 The Broad Institute"); - header.add("For support and documentation go to " + HelpConstants.BASE_GATK_URL); - return header; - } - - public static String getVersionNumber() { - ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("GATKText"); - return headerInfo.containsKey("org.broadinstitute.gatk.tools.version") ? headerInfo.getString("org.broadinstitute.gatk.tools.version") : ""; - } - - public static String getBuildTime() { - ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("GATKText"); - return headerInfo.containsKey("build.timestamp") ? headerInfo.getString("build.timestamp") : ""; - } - - /** - * If the user supplied any additional attribution, return it here. - * @return Additional attribution if supplied by the user. Empty (non-null) list otherwise. - */ - private List getAttribution() { - List attributionLines = new ArrayList(); - - // If no analysis name is present, fill in extra help on the walkers. 
- WalkerManager walkerManager = engine.getWalkerManager(); - String analysisName = getAnalysisName(); - if(analysisName != null && walkerManager.exists(analysisName)) { - Class walkerType = walkerManager.getWalkerClassByName(analysisName); - if(walkerType.isAnnotationPresent(Attribution.class)) - attributionLines.addAll(Arrays.asList(walkerType.getAnnotation(Attribution.class).value())); - } - return attributionLines; - } - - /** - * Retrieves additional information about GATK walkers. - * the code in HelpFormatter and supply it as a helper to this method. - * - * @return A string summarizing the walkers available in this distribution. - */ - private String getAdditionalHelp() { - String additionalHelp; - - // If no analysis name is present, fill in extra help on the walkers. - WalkerManager walkerManager = engine.getWalkerManager(); - String analysisName = getAnalysisName(); - if(analysisName != null && walkerManager.exists(getAnalysisName())) - additionalHelp = getWalkerHelp(walkerManager.getWalkerClassByName(getAnalysisName())); - else - additionalHelp = getAllWalkerHelp(); - - return additionalHelp; - } - - private static final int PACKAGE_INDENT = 1; - private static final int WALKER_INDENT = 3; - private static final String FIELD_SEPARATOR = " "; - - private String getWalkerHelp(Class walkerType) { - // Construct a help string to output details on this walker. - StringBuilder additionalHelp = new StringBuilder(); - Formatter formatter = new Formatter(additionalHelp); - - formatter.format("Available Reference Ordered Data types:%n"); - formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures()); - formatter.format("%n"); - - formatter.format("For a full description of this walker, see its GATKdocs at:%n"); - formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType)); - - return additionalHelp.toString(); - } - - /** - * Load in additional help information about all available walkers. 
- * @return A string representation of the additional help. - */ - private String getAllWalkerHelp() { - // Construct a help string to output available walkers. - StringBuilder additionalHelp = new StringBuilder(); - Formatter formatter = new Formatter(additionalHelp); - - // Get the list of walker names from the walker manager. - WalkerManager walkerManager = engine.getWalkerManager(); - - // Build a list sorted by walker display name. As this information is collected, keep track of the longest - // package / walker name for later formatting. - SortedSet helpText = new TreeSet(new HelpEntryComparator()); - - int longestPackageName = 0; - int longestWalkerName = 0; - for(Map.Entry>> walkersByPackage: walkerManager.getWalkerNamesByPackage(true).entrySet()) { - // Get the display name. - String packageName = walkersByPackage.getKey(); - String packageDisplayName = walkerManager.getPackageDisplayName(walkersByPackage.getKey()); - String packageHelpText = walkerManager.getPackageSummaryText(packageName); - - // Compute statistics about which names is longest. - longestPackageName = Math.max(longestPackageName,packageDisplayName.length()); - - SortedSet walkersInPackage = new TreeSet(new HelpEntryComparator()); - for(Class walkerType: walkersByPackage.getValue()) { - String walkerName = walkerType.getName(); - String walkerDisplayName = walkerManager.getName(walkerType); - String walkerHelpText = walkerManager.getWalkerSummaryText(walkerType); - - longestWalkerName = Math.max(longestWalkerName,walkerManager.getName(walkerType).length()); - - walkersInPackage.add(new HelpEntry(walkerName,walkerDisplayName,walkerHelpText)); - } - - // Dump the walkers into the sorted set. 
- helpText.add(new HelpEntry(packageName,packageDisplayName,packageHelpText,Collections.unmodifiableSortedSet(walkersInPackage))); - } - - final int headerWidth = Math.max(longestPackageName+PACKAGE_INDENT,longestWalkerName+WALKER_INDENT); - - - for(HelpEntry packageHelp: helpText) { - printDescriptorLine(formatter,PACKAGE_INDENT,packageHelp.displayName,headerWidth,FIELD_SEPARATOR,packageHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); - - for(HelpEntry walkerHelp: packageHelp.children) - printDescriptorLine(formatter,WALKER_INDENT,walkerHelp.displayName,headerWidth,FIELD_SEPARATOR,walkerHelp.summary,TextFormattingUtils.DEFAULT_LINE_WIDTH); - - // Print a blank line between sets of walkers. - printDescriptorLine(formatter,0,"",headerWidth,FIELD_SEPARATOR,"", TextFormattingUtils.DEFAULT_LINE_WIDTH); - } - - return additionalHelp.toString(); - } - - private void printDescriptorLine(Formatter formatter, - int headerIndentWidth, - String header, - int headerWidth, - String fieldSeparator, - String description, - int lineWidth) { - final int headerPaddingWidth = headerWidth - header.length() - headerIndentWidth; - final int descriptionWidth = lineWidth - fieldSeparator.length() - headerWidth; - List wordWrappedText = TextFormattingUtils.wordWrap(description,descriptionWidth); - - String headerIndentFormatString = headerIndentWidth > 0 ? "%" + headerIndentWidth + "s" : "%s"; - String headerPaddingFormatString = headerPaddingWidth > 0 ? "%" + headerPaddingWidth + "s" : "%s"; - String headerWidthFormatString = headerWidth > 0 ? "%" + headerWidth + "s" : "%s"; - - // Output description line. 
- formatter.format(headerIndentFormatString + "%s" + headerPaddingFormatString + "%s%s%n", - "", header, "", fieldSeparator, wordWrappedText.size()>0?wordWrappedText.get(0):""); - for(int i = 1; i < wordWrappedText.size(); i++) - formatter.format(headerWidthFormatString + "%s%s%n", "", fieldSeparator, wordWrappedText.get(i)); - } - -} - -/** - * Represents a given help entry; contains a display name, a summary and optionally some children. - */ -class HelpEntry { - public final String uid; - public final String displayName; - public final String summary; - public final SortedSet children; - - /** - * Create a new help entry with the given display name, summary and children. - * @param uid a unique identifier. Usually, the java package. - * @param displayName display name for this help entry. - * @param summary summary for this help entry. - * @param children children for this help entry. - */ - public HelpEntry(String uid, String displayName, String summary, SortedSet children) { - this.uid = uid; - this.displayName = displayName; - this.summary = summary; - this.children = children; - } - - /** - * Create a new help entry with the given display name, summary and children. - * @param uid a unique identifier. Usually, the java package. - * @param displayName display name for this help entry. - * @param summary summary for this help entry. - */ - public HelpEntry(String uid, String displayName, String summary) { - this(uid,displayName,summary,null); - } - -} - -/** - * Compare two help entries by display name. - */ -class HelpEntryComparator implements Comparator { - private static TextFormattingUtils.CaseInsensitiveComparator textComparator = new TextFormattingUtils.CaseInsensitiveComparator(); - - /** - * Compares the order of lhs to rhs, not taking case into account. - * @param lhs First object to compare. - * @param rhs Second object to compare. - * @return 0 if objects are identical; -1 if lhs is before rhs, 1 if rhs is before lhs. 
Nulls are treated as after everything else. - */ - public int compare(HelpEntry lhs, HelpEntry rhs) { - if(lhs == null && rhs == null) return 0; - if(lhs == null || lhs.displayName.equals("")) return 1; - if(rhs == null || rhs.displayName.equals("")) return -1; - return lhs.displayName.equals(rhs.displayName) ? textComparator.compare(lhs.uid,rhs.uid) : textComparator.compare(lhs.displayName,rhs.displayName); - } - - -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java deleted file mode 100644 index abb699301..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngine.java +++ /dev/null @@ -1,1280 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine; - -import com.google.java.contract.Ensures; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.reference.ReferenceSequenceFile; -import htsjdk.variant.vcf.VCFConstants; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.datasources.reads.*; -import org.broadinstitute.gatk.engine.datasources.reference.ReferenceDataSource; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.executive.MicroScheduler; -import org.broadinstitute.gatk.engine.filters.FilterManager; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.filters.ReadGroupBlackListFilter; -import org.broadinstitute.gatk.engine.io.OutputTracker; -import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub; -import org.broadinstitute.gatk.engine.io.stubs.Stub; -import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.iterators.ReadTransformersMode; -import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; -import org.broadinstitute.gatk.engine.refdata.tracks.IndexDictionaryUtils; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.engine.samples.SampleDB; -import org.broadinstitute.gatk.engine.samples.SampleDBBuilder; -import 
org.broadinstitute.gatk.engine.walkers.*; -import org.broadinstitute.gatk.tools.walkers.genotyper.IndexedSampleList; -import org.broadinstitute.gatk.tools.walkers.genotyper.SampleList; -import org.broadinstitute.gatk.utils.*; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.interval.IntervalUtils; -import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; -import org.broadinstitute.gatk.utils.recalibration.BQSRArgumentSet; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import org.broadinstitute.gatk.utils.text.XReadLines; -import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; -import java.util.concurrent.TimeUnit; - -import static org.broadinstitute.gatk.utils.DeprecatedToolChecks.getWalkerDeprecationInfo; -import static org.broadinstitute.gatk.utils.DeprecatedToolChecks.isDeprecatedWalker; - -/** - * A GenomeAnalysisEngine that runs a specified walker. - */ -public class GenomeAnalysisEngine { - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class); - public static final long NO_RUNTIME_LIMIT = -1; - - /** - * The GATK command-line argument parsing code. - */ - private ParsingEngine parsingEngine; - - /** - * The genomeLocParser can create and parse GenomeLocs. - */ - private GenomeLocParser genomeLocParser; - - /** - * Accessor for sharded read data. - */ - private SAMDataSource readsDataSource = null; - - /** - * Accessor for sharded reference data. 
- */ - private ReferenceDataSource referenceDataSource = null; - - /** - * Accessor for sample metadata - */ - private SampleDB sampleDB = new SampleDB(); - - /** - * Accessor for sharded reference-ordered data. - */ - private List rodDataSources; - - // our argument collection - private GATKArgumentCollection argCollection; - - /** - * Collection of intervals used by the engine. - */ - private GenomeLocSortedSet intervals = null; - - /** - * Explicitly assign the interval set to use for this traversal (for unit testing purposes) - * @param intervals set of intervals to use for this traversal - */ - public void setIntervals( GenomeLocSortedSet intervals ) { - this.intervals = intervals; - } - - /** - * Collection of inputs used by the engine. - */ - private Map inputs = new HashMap(); - - /** - * Collection of outputs used by the engine. - */ - private Collection> outputs = new ArrayList>(); - - /** - * Collection of the filters applied to the input data. - */ - private Collection filters; - - /** - * Collection of the read transformers applied to the reads - */ - private List readTransformers; - - /** - * Controls the allocation of threads between CPU vs IO. - */ - private ThreadAllocation threadAllocation; - - private ReadMetrics cumulativeMetrics = null; - - /** - * A currently hacky unique name for this GATK instance - */ - private String myName = "GATK_" + Math.abs(getRandomGenerator().nextInt()); - - /** - * our walker manager - */ - private final WalkerManager walkerManager = new WalkerManager(); - - private Walker walker; - - public void setWalker(Walker walker) { - this.walker = walker; - } - - /** - * The short name of the current GATK walker as a string - * @return a non-null String - */ - public String getWalkerName() { - return getWalkerName(walker.getClass()); - } - - /** - * A processed collection of SAM reader identifiers. - */ - private Collection samReaderIDs = Collections.emptyList(); - - /** - * Set the SAM/BAM files over which to traverse. 
- * @param samReaderIDs Collection of ids to use during this traversal. - */ - public void setSAMFileIDs(Collection samReaderIDs) { - this.samReaderIDs = samReaderIDs; - } - - /** - * Collection of reference metadata files over which to traverse. - */ - private Collection referenceMetaDataFiles; - - /** - * The threading efficiency monitor we use in the GATK to monitor our efficiency. - * - * May be null if one isn't active, or hasn't be initialized yet - */ - private ThreadEfficiencyMonitor threadEfficiencyMonitor = null; - - /** - * The global progress meter we are using to track our progress through the genome - */ - private ProgressMeter progressMeter = null; - - /** - * Set the reference metadata files to use for this traversal. - * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse. - */ - public void setReferenceMetaDataFiles(Collection referenceMetaDataFiles) { - this.referenceMetaDataFiles = referenceMetaDataFiles; - } - - /** - * The maximum runtime of this engine, in nanoseconds, set during engine initialization - * from the GATKArgumentCollection command line value - */ - private long runtimeLimitInNanoseconds = -1; - - /** - * Static random number generator and seed. 
- */ - private static final long GATK_RANDOM_SEED = 47382911L; - private static Random randomGenerator = new Random(GATK_RANDOM_SEED); - public static Random getRandomGenerator() { return randomGenerator; } - public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); } - public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); } - - /** - * Base Quality Score Recalibration helper object - */ - private BQSRArgumentSet bqsrArgumentSet = null; - public BQSRArgumentSet getBQSRArgumentSet() { return bqsrArgumentSet; } - public boolean hasBQSRArgumentSet() { return bqsrArgumentSet != null; } - public void setBaseRecalibration(final GATKArgumentCollection args) { - bqsrArgumentSet = new BQSRArgumentSet(args); - } - - /** - * Actually run the GATK with the specified walker. - * - * @return the value of this traversal. - */ - public Object execute() { - // first thing is to make sure the AWS keys can be decrypted - GATKRunReport.checkAWSAreValid(); - - //HeapSizeMonitor monitor = new HeapSizeMonitor(); - //monitor.start(); - setStartTime(new java.util.Date()); - - final GATKArgumentCollection args = this.getArguments(); - - // validate our parameters - if (args == null) { - throw new ReviewedGATKException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null."); - } - - // validate our parameters - if (this.walker == null) - throw new ReviewedGATKException("The walker passed to GenomeAnalysisEngine can not be null."); - - if (args.nonDeterministicRandomSeed) - resetRandomGenerator(System.currentTimeMillis()); - - // if the use specified an input BQSR recalibration table then enable on the fly recalibration - if (args.BQSR_RECAL_FILE != null) - setBaseRecalibration(args); - - // setup the runtime limits - setupRuntimeLimits(args); - - // Determine how the threads should be divided between CPU vs. IO. - determineThreadAllocation(); - - // Prepare the data for traversal. 
- initializeDataSources(); - - // initialize and validate the interval list - initializeIntervals(); - validateSuppliedIntervals(); - - // check to make sure that all sequence dictionaries are compatible with the reference's sequence dictionary - validateDataSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), rodDataSources); - - // initialize sampleDB - initializeSampleDB(); - - // our microscheduler, which is in charge of running everything - MicroScheduler microScheduler = createMicroscheduler(); - threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor(); - - // create temp directories as necessary - initializeTempDirectory(); - - // create the output streams - initializeOutputStreams(microScheduler.getOutputTracker()); - - // Initializing the shard iterator / BAM schedule might take some time, so let the user know vaguely what's going on - logger.info("Preparing for traversal" + - (readsDataSource.getReaderIDs().size() > 0 ? String.format(" over %d BAM files", readsDataSource.getReaderIDs().size()) : "")); - Iterable shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); - logger.info("Done preparing for traversal"); - - // execute the microscheduler, storing the results - return microScheduler.execute(this.walker, shardStrategy); - - //monitor.stop(); - //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed())); - - //return result; - } - - /** - * Retrieves an instance of the walker based on the walker name. - * - * @param walkerName Name of the walker. Must not be null. If the walker cannot be instantiated, an exception will be thrown. - * @return An instance of the walker. 
- */ - public Walker getWalkerByName(String walkerName) { - try { - return walkerManager.createByName(walkerName); - } catch ( UserException e ) { - if ( isDeprecatedWalker(walkerName) ) { - e = new UserException.DeprecatedWalker(walkerName, getWalkerDeprecationInfo(walkerName)); - } - throw e; - } - } - - /** - * Gets the name of a given walker type. - * @param walkerType Type of walker. - * @return Name of the walker. - */ - public String getWalkerName(Class walkerType) { - return walkerManager.getName(walkerType); - } - - public String getName() { - return myName; - } - - /** - * Gets a list of the filters to associate with the given walker. Will NOT initialize the engine with this filters; - * the caller must handle that directly. - * @return A collection of available filters. - */ - public Collection createFilters() { - final List filters = new LinkedList<>(); - - // First add the user requested filters - if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0) - filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList)); - for(final String filterName: this.getArguments().readFilters) - filters.add(this.getFilterManager().createByName(filterName)); - - // now add the walker default filters. 
This ordering is critical important if - // users need to apply filters that fix up reads that would be removed by default walker filters - filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager())); - - return Collections.unmodifiableList(filters); - } - - /** - * Returns a list of active, initialized read transformers - * - * @param walker the walker we need to apply read transformers too - */ - public void initializeReadTransformers(final Walker walker) { - // keep a list of the active read transformers sorted based on priority ordering - List activeTransformers = new ArrayList(); - - final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class); - final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null; - - final PluginManager pluginManager = new PluginManager(ReadTransformer.class); - - for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) { - transformer.initialize(overrideTime, this, walker); - if ( transformer.enabled() ) - activeTransformers.add(transformer); - } - - setReadTransformers(activeTransformers); - } - - public List getReadTransformers() { - return readTransformers; - } - - /* - * Sanity checks that incompatible read transformers are not active together (and throws an exception if they are). 
- * - * @param readTransformers the active read transformers - */ - protected void checkActiveReadTransformers(final List readTransformers) { - if ( readTransformers == null ) - throw new IllegalArgumentException("read transformers cannot be null"); - - ReadTransformer sawMustBeFirst = null; - ReadTransformer sawMustBeLast = null; - - for ( final ReadTransformer r : readTransformers ) { - if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_FIRST ) { - if ( sawMustBeFirst != null ) - throw new UserException.IncompatibleReadFiltersException(sawMustBeFirst.toString(), r.toString()); - sawMustBeFirst = r; - } else if ( r.getOrderingConstraint() == ReadTransformer.OrderingConstraint.MUST_BE_LAST ) { - if ( sawMustBeLast != null ) - throw new UserException.IncompatibleReadFiltersException(sawMustBeLast.toString(), r.toString()); - sawMustBeLast = r; - } - } - } - - protected void setReadTransformers(final List readTransformers) { - if ( readTransformers == null ) - throw new ReviewedGATKException("read transformers cannot be null"); - - // sort them in priority order - Collections.sort(readTransformers, new ReadTransformer.ReadTransformerComparator()); - - // make sure we don't have an invalid set of active read transformers - checkActiveReadTransformers(readTransformers); - - this.readTransformers = readTransformers; - } - - /** - * Parse out the thread allocation from the given command-line argument. 
- */ - private void determineThreadAllocation() { - if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads); - if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread); - if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads); - - this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads, - argCollection.numberOfCPUThreadsPerDataThread, - argCollection.numberOfIOThreads, - argCollection.monitorThreadEfficiency); - } - - public int getTotalNumberOfThreads() { - return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads(); - } - - - - /** - * Allow subclasses and others within this package direct access to the walker manager. - * @return The walker manager used by this package. - */ - protected WalkerManager getWalkerManager() { - return walkerManager; - } - - /** - * setup a microscheduler - * - * @return a new microscheduler - */ - private MicroScheduler createMicroscheduler() { - // Temporarily require all walkers to have a reference, even if that reference is not conceptually necessary. 
- if ((walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker) && - this.getArguments().referenceFile == null) { - throw new UserException.CommandLineException("Read-based traversals require a reference file but none was given"); - } - - return MicroScheduler.create(this,walker,this.getReadsDataSource(),this.getReferenceDataSource().getReference(),this.getRodDataSources(),threadAllocation); - } - - protected DownsamplingMethod getDownsamplingMethod() { - GATKArgumentCollection argCollection = this.getArguments(); - - DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); - DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker); - - DownsamplingMethod method = commandLineMethod != null ? commandLineMethod : walkerMethod; - method.checkCompatibilityWithWalker(walker); - return method; - } - - protected void setDownsamplingMethod(DownsamplingMethod method) { - argCollection.setDownsamplingMethod(method); - } - - protected boolean includeReadsWithDeletionAtLoci() { - return walker.includeReadsWithDeletionAtLoci(); - } - - /** - * Verifies that the supplied set of reads files mesh with what the walker says it requires; - * also makes sure that list of SAM files specified on the command line is not empty and contains - * no duplicates. - */ - protected void validateSuppliedReads() { - GATKArgumentCollection arguments = this.getArguments(); - final Boolean samFilesArePresent = (arguments.samFiles != null && !arguments.samFiles.isEmpty()); - - // Check what the walker says is required against what was provided on the command line. - if (WalkerManager.isRequired(walker, DataSource.READS) && !samFilesArePresent) - throw new ArgumentException("Walker requires reads but none were provided."); - - // Check what the walker says is allowed against what was provided on the command line. 
- if (samFilesArePresent && !WalkerManager.isAllowed(walker, DataSource.READS)) - throw new ArgumentException("Walker does not allow reads but reads were provided."); - - //Make sure SAM list specified by the user (if necessary) is not empty - if(WalkerManager.isRequired(walker, DataSource.READS) && samFilesArePresent && samReaderIDs.isEmpty() ) { - throw new UserException("The list of input files does not contain any BAM files."); - } - - // Make sure no SAM files were specified multiple times by the user. - checkForDuplicateSamFiles(); - } - - /** - * Checks whether there are SAM files that appear multiple times in the fully unpacked list of - * SAM files (samReaderIDs). If there are, throws an ArgumentException listing the files in question. - */ - protected void checkForDuplicateSamFiles() { - Set encounteredSamFiles = new HashSet(); - Set duplicateSamFiles = new LinkedHashSet(); - - for ( SAMReaderID samFile : samReaderIDs ) { - if ( encounteredSamFiles.contains(samFile) ) { - duplicateSamFiles.add(samFile.getSamFilePath()); - } - else { - encounteredSamFiles.add(samFile); - } - } - - if ( duplicateSamFiles.size() > 0 ) { - throw new UserException("The following BAM files appear multiple times in the list of input files: " + - duplicateSamFiles + " BAM files may be specified at most once."); - } - - } - - /** - * Verifies that the supplied reference file mesh with what the walker says it requires. - */ - protected void validateSuppliedReference() { - GATKArgumentCollection arguments = this.getArguments(); - // Check what the walker says is required against what was provided on the command line. - // TODO: Temporarily disabling WalkerManager.isRequired check on the reference because the reference is always required. 
- if (/*WalkerManager.isRequired(walker, DataSource.REFERENCE) &&*/ arguments.referenceFile == null) - throw new ArgumentException("Walker requires a reference but none was provided."); - - // Check what the walker says is allowed against what was provided on the command line. - if (arguments.referenceFile != null && !WalkerManager.isAllowed(walker, DataSource.REFERENCE)) - throw new ArgumentException("Walker does not allow a reference but one was provided."); - } - - protected void validateSuppliedIntervals() { - // Only read walkers support '-L unmapped' intervals. Trap and validate any other instances of -L unmapped. - if(!(walker instanceof ReadWalker)) { - GenomeLocSortedSet intervals = getIntervals(); - if(intervals != null && getIntervals().contains(GenomeLoc.UNMAPPED)) - throw new ArgumentException("Interval list specifies unmapped region. Only read walkers may include the unmapped region."); - } - - // If intervals is non-null and empty at this point, it means that the list of intervals to process - // was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since - // this was very likely unintentional, the user should be informed of this. Note that this is different - // from the case where intervals == null, which indicates that there were no interval arguments. - if ( intervals != null && intervals.isEmpty() ) { - logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process."); - } - - // TODO: add a check for ActiveRegion walkers to prevent users from passing an entire contig/chromosome - } - - /** - * Get the sharding strategy given a driving data source. - * - * @param readsDataSource readsDataSource - * @param drivingDataSource Data on which to shard. 
- * @param intervals intervals - * @return the sharding strategy - */ - protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { - ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); - DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null; - ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); - - // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. - if(!readsDataSource.isEmpty()) { - if(!readsDataSource.hasIndex() && !exclusions.contains(ValidationExclusion.TYPE.ALLOW_UNINDEXED_BAM)) - throw new UserException.CommandLineException("Cannot process the provided BAM file(s) because they were not indexed. The GATK does offer limited processing of unindexed BAMs in --unsafe mode, but this GATK feature is currently unsupported."); - if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) - throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - - if(walker instanceof LocusWalker) { - if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. 
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(new LocusShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); - } - else if(walker instanceof ActiveRegionWalker) { - if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(new ActiveRegionShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer()); - } - else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { - // Apply special validation to read pair walkers. - if(walker instanceof ReadPairWalker) { - if(readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) - throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.queryname, "Read pair walkers are exceptions in that they cannot be run on coordinate-sorted BAMs but instead require query name-sorted files. 
You will need to resort your input BAM file in query name order to use this walker."); - if(intervals != null && !intervals.isEmpty()) - throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); - } - - if(intervals == null) - return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(intervals, new ReadShardBalancer()); - } - else - throw new ReviewedGATKException("Unable to determine walker type for walker " + walker.getClass().getName()); - } - else { - // TODO -- Determine what the ideal shard size should be here. Matt suggested that a multiple of 16K might work well - // TODO -- (because of how VCF indexes work), but my empirical experience has been simply that the larger the shard - // TODO -- size the more efficient the traversal (at least for RODWalkers). Keeping the previous values for now. [EB] - final int SHARD_SIZE = walker instanceof RodWalker ? 1000000 : 100000; - if(intervals == null) - return referenceDataSource.createShardsOverEntireReference(readsDataSource,genomeLocParser,SHARD_SIZE); - else - return referenceDataSource.createShardsOverIntervals(readsDataSource,intervals,SHARD_SIZE); - } - } - - protected boolean flashbackData() { - return walker instanceof ReadWalker; - } - - /** - * Create the temp directory if it doesn't exist. - */ - private void initializeTempDirectory() { - File tempDir = new File(System.getProperty("java.io.tmpdir")); - if (!tempDir.exists() && !tempDir.mkdirs()) - throw new UserException.BadTmpDir("Unable to create directory"); - } - - /** - * Initialize the output streams as specified by the user. - * - * @param outputTracker the tracker supplying the initialization data. 
- */ - private void initializeOutputStreams(final OutputTracker outputTracker) { - for (final Map.Entry input : getInputs().entrySet()) - outputTracker.addInput(input.getKey(), input.getValue()); - for (final Stub stub : getOutputs()) { - stub.processArguments(argCollection); - outputTracker.addOutput(stub); - } - - outputTracker.prepareWalker(walker, getArguments().strictnessLevel); - } - - public ReferenceDataSource getReferenceDataSource() { - return referenceDataSource; - } - - public GenomeLocParser getGenomeLocParser() { - return genomeLocParser; - } - - /** - * Manage lists of filters. - */ - private final FilterManager filterManager = new FilterManager(); - - private Date startTime = null; // the start time for execution - - public void setParser(ParsingEngine parsingEngine) { - this.parsingEngine = parsingEngine; - } - - /** - * Explicitly set the GenomeLocParser, for unit testing. - * @param genomeLocParser GenomeLocParser to use. - */ - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - /** - * Sets the start time when the execute() function was last called - * @param startTime the start time when the execute() function was last called - */ - protected void setStartTime(Date startTime) { - this.startTime = startTime; - } - - /** - * @return the start time when the execute() function was last called - */ - public Date getStartTime() { - return startTime; - } - - /** - * Setup the intervals to be processed - */ - protected void initializeIntervals() { - intervals = IntervalUtils.parseIntervalArguments(this.referenceDataSource, argCollection.intervalArguments); - } - - /** - * Add additional, externally managed IO streams for inputs. - * - * @param argumentSource Field into which to inject the value. - * @param value Instance to inject. 
- */ - public void addInput(ArgumentSource argumentSource, Object value) { - inputs.put(argumentSource, value); - } - - /** - * Add additional, externally managed IO streams for output. - * - * @param stub Instance to inject. - */ - public void addOutput(Stub stub) { - outputs.add(stub); - } - - /** - * Returns the tag associated with a given command-line argument. - * @param key Object for which to inspect the tag. - * @return Tags object associated with the given key, or an empty Tag structure if none are present. - */ - public Tags getTags(Object key) { - return parsingEngine.getTags(key); - } - - protected void initializeDataSources() { - logger.info("Strictness is " + argCollection.strictnessLevel); - - validateSuppliedReference(); - setReferenceDataSource(argCollection.referenceFile); - - validateSuppliedReads(); - initializeReadTransformers(walker); - - final Map sampleRenameMap = argCollection.sampleRenameMappingFile != null ? - loadSampleRenameMap(argCollection.sampleRenameMappingFile) : - null; - - readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference(), sampleRenameMap); - - for (ReadFilter filter : filters) - filter.initialize(this); - - // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference - rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(), - genomeLocParser,argCollection.unsafe,sampleRenameMap); - } - - /** - * Purely for testing purposes. 
Do not use unless you absolutely positively know what you are doing (or - * need to absolutely positively kill everyone in the room) - * @param dataSource - */ - public void setReadsDataSource(final SAMDataSource dataSource) { - this.readsDataSource = dataSource; - } - - /** - * Entry-point function to initialize the samples database from input data and pedigree arguments - */ - private void initializeSampleDB() { - SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); - sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); - sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); - sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles); - sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings); - sampleDB = sampleDBBuilder.getFinalSampleDB(); - } - - /** - * Gets a unique identifier for the reader sourcing this read. - * @param read Read to examine. - * @return A unique identifier for the source file of this read. Exception if not found. - */ - public SAMReaderID getReaderIDForRead(final SAMRecord read) { - return getReadsDataSource().getReaderID(read); - } - - /** - * Gets the source file for this read. - * @param id Unique identifier determining which input file to use. - * @return The source filename for this read. - */ - public File getSourceFileForReaderID(final SAMReaderID id) { - return getReadsDataSource().getSAMFile(id); - } - - /** - * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). - * - * @param reads Reads data source. - * @param reference Reference data source. 
- * @param rods a collection of the reference ordered data tracks - */ - private void validateDataSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection rods) { - if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null ) - return; - - // Compile a set of sequence names that exist in the reference file. - SAMSequenceDictionary referenceDictionary = reference.getSequenceDictionary(); - - if (!reads.isEmpty()) { - // Compile a set of sequence names that exist in the BAM files. - SAMSequenceDictionary readsDictionary = reads.getHeader().getSequenceDictionary(); - - if (readsDictionary.size() == 0) { - logger.info("Reads file is unmapped. Skipping validation against reference."); - return; - } - - // compare the reads to the reference - SequenceDictionaryUtils.validateDictionaries(logger, getArguments().unsafe, "reads", readsDictionary, - "reference", referenceDictionary, true, intervals); - } - - for (ReferenceOrderedDataSource rod : rods) - IndexDictionaryUtils.validateTrackSequenceDictionary(rod.getName(), rod.getSequenceDictionary(), referenceDictionary, getArguments().unsafe); - } - - /** - * Gets a data source for the given set of reads. - * - * @param argCollection arguments - * @param genomeLocParser parser - * @param refReader reader - * @return A data source for the given set of reads. - */ - private SAMDataSource createReadsDataSource(final GATKArgumentCollection argCollection, final GenomeLocParser genomeLocParser, - final IndexedFastaSequenceFile refReader, final Map sampleRenameMap) { - DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); - - // Synchronize the method back into the collection so that it shows up when - // interrogating for the downsampling method during command line recreation. 
- setDownsamplingMethod(downsamplingMethod); - - logger.info(downsamplingMethod); - - if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) - throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); - - boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class); - - if (argCollection.keepProgramRecords) - removeProgramRecords = false; - - final boolean keepReadsInLIBS = walker instanceof ActiveRegionWalker; - - return new SAMDataSource( - samReaderIDs, - threadAllocation, - argCollection.numberOfBAMFileHandles, - genomeLocParser, - argCollection.useOriginalBaseQualities, - argCollection.strictnessLevel, - argCollection.readBufferSize, - downsamplingMethod, - new ValidationExclusion(Arrays.asList(argCollection.unsafe)), - filters, - readTransformers, - includeReadsWithDeletionAtLoci(), - argCollection.defaultBaseQualities, - removeProgramRecords, - keepReadsInLIBS, - sampleRenameMap, - argCollection.intervalArguments.intervalMerging); - } - - /** - * Loads a user-provided sample rename map file for use in on-the-fly sample renaming into an in-memory - * HashMap. This file must consist of lines with two whitespace-separated fields, the second of which - * may contain whitespace: - * - * absolute_path_to_file new_sample_name - * - * The engine will verify that each file contains data from only one sample when the on-the-fly sample - * renaming feature is being used. Note that this feature works only with bam and vcf files. - * - * @param sampleRenameMapFile sample rename map file from which to load data - * @return a HashMap containing the contents of the map file, with the keys being the input file paths and - * the values being the new sample names. 
- */ - protected Map loadSampleRenameMap( final File sampleRenameMapFile ) { - logger.info("Renaming samples from input files on-the-fly using mapping file " + sampleRenameMapFile.getAbsolutePath()); - - final Map sampleRenameMap = new HashMap<>((int)sampleRenameMapFile.length() / 50); - - try { - for ( final String line : new XReadLines(sampleRenameMapFile) ) { - final String[] tokens = line.split("\\s+", 2); - - if ( tokens.length != 2 ) { - throw new UserException.MalformedFile(sampleRenameMapFile, - String.format("Encountered a line with %s fields instead of the required 2 fields. Line was: %s", - tokens.length, line)); - } - - final File inputFile = new File(tokens[0]); - final String newSampleName = tokens[1].trim(); - - if (newSampleName.contains(VCFConstants.FIELD_SEPARATOR)) { - throw new UserException.MalformedFile(sampleRenameMapFile, String.format( - "Encountered illegal sample name; sample names may not include the VCF field delimiter (%s). Sample name: %s; line: %s", - VCFConstants.FIELD_SEPARATOR, - newSampleName, - line - )); - } - - if ( ! inputFile.isAbsolute() ) { - throw new UserException.MalformedFile(sampleRenameMapFile, "Input file path not absolute at line: " + line); - } - - final String inputFilePath = inputFile.getAbsolutePath(); - - if ( sampleRenameMap.containsKey(inputFilePath) ) { - throw new UserException.MalformedFile(sampleRenameMapFile, - String.format("Input file %s appears more than once", inputFilePath)); - } - - sampleRenameMap.put(inputFilePath, newSampleName); - } - } - catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(sampleRenameMapFile, e); - } - - return sampleRenameMap; - } - - - /** - * Opens a reference sequence file paired with an index. Only public for testing purposes - * - * @param refFile Handle to a reference sequence file. Non-null. 
- */ - public void setReferenceDataSource(File refFile) { - this.referenceDataSource = new ReferenceDataSource(refFile); - genomeLocParser = new GenomeLocParser(referenceDataSource.getReference()); - } - - /** - * Open the reference-ordered data sources. - * - * @param referenceMetaDataFiles collection of RMD descriptors to load and validate. - * @param sequenceDictionary GATK-wide sequnce dictionary to use for validation. - * @param genomeLocParser to use when creating and validating GenomeLocs. - * @param validationExclusionType potentially indicate which validations to include / exclude. - * @param sampleRenameMap map of file -> new sample name used when doing on-the-fly sample renaming - * - * @return A list of reference-ordered data sources. - */ - private List getReferenceOrderedDataSources(final Collection referenceMetaDataFiles, - final SAMSequenceDictionary sequenceDictionary, - final GenomeLocParser genomeLocParser, - final ValidationExclusion.TYPE validationExclusionType, - final Map sampleRenameMap) { - final RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, validationExclusionType, - getArguments().disableAutoIndexCreationAndLockingWhenReadingRods, - sampleRenameMap); - - final List dataSources = new ArrayList(); - for (RMDTriplet fileDescriptor : referenceMetaDataFiles) - dataSources.add(new ReferenceOrderedDataSource(fileDescriptor, - builder, - sequenceDictionary, - genomeLocParser, - flashbackData())); - - return dataSources; - } - - /** - * Returns the SAM File Header from the input reads' data source file - * @return the SAM File Header from the input reads' data source file - */ - public SAMFileHeader getSAMFileHeader() { - return readsDataSource.getHeader(); - } - - public boolean lenientVCFProcessing() { - return lenientVCFProcessing(argCollection.unsafe); - } - - public static boolean lenientVCFProcessing(final ValidationExclusion.TYPE val) { - return val == ValidationExclusion.TYPE.ALL - || val == 
ValidationExclusion.TYPE.LENIENT_VCF_PROCESSING; - } - - /** - * Returns the unmerged SAM file header for an individual reader. - * @param reader The reader. - * @return Header for that reader or null if not available. - */ - public SAMFileHeader getSAMFileHeader(SAMReaderID reader) { - return readsDataSource == null ? null : readsDataSource.getHeader(reader); - } - - /** - * Returns an ordered list of the unmerged SAM file headers known to this engine. - * @return list of header for each input SAM file, in command line order - */ - public List getSAMFileHeaders() { - final List headers = new ArrayList(); - for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) { - headers.add(getReadsDataSource().getHeader(id)); - } - return headers; - } - - /** - * Gets the master sequence dictionary for this GATK engine instance - * @return a never-null dictionary listing all of the contigs known to this engine instance - */ - public SAMSequenceDictionary getMasterSequenceDictionary() { - return getReferenceDataSource().getReference().getSequenceDictionary(); - } - - /** - * Returns data source object encapsulating all essential info and handlers used to traverse - * reads; header merger, individual file readers etc can be accessed through the returned data source object. - * - * @return the reads data source - */ - public SAMDataSource getReadsDataSource() { - return this.readsDataSource; - } - - /** - * Sets the collection of GATK main application arguments. - * - * @param argCollection the GATK argument collection - */ - public void setArguments(GATKArgumentCollection argCollection) { - this.argCollection = argCollection; - } - - /** - * Gets the collection of GATK main application arguments. - * - * @return the GATK argument collection - */ - public GATKArgumentCollection getArguments() { - return this.argCollection; - } - - /** - * Get the list of intervals passed to the engine. 
- * @return List of intervals, or null if no intervals are in use - */ - public GenomeLocSortedSet getIntervals() { - return this.intervals; - } - - /** - * Get the list of regions of the genome being processed. If the user - * requested specific intervals, return those, otherwise return regions - * corresponding to the entire genome. Never returns null. - * - * @return a non-null set of intervals being processed - */ - @Ensures("result != null") - public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() { - if ( getIntervals() == null ) - // if we don't have any intervals defined, create intervals from the reference itself - return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary()); - else - return getIntervals(); - } - - /** - * Gets the list of filters employed by this engine. - * @return Collection of filters (actual instances) used by this engine. - */ - public Collection getFilters() { - return this.filters; - } - - /** - * Sets the list of filters employed by this engine. - * @param filters Collection of filters (actual instances) used by this engine. - */ - public void setFilters(Collection filters) { - this.filters = filters; - } - - /** - * Gets the filter manager for this engine. - * @return filter manager for this engine. - */ - protected FilterManager getFilterManager() { - return filterManager; - } - - /** - * Gets the input sources for this engine. - * @return input sources for this engine. - */ - protected Map getInputs() { - return inputs; - } - - /** - * Gets the output stubs for this engine. - * @return output stubs for this engine. - */ - protected Collection> getOutputs() { - return outputs; - } - - /** - * Returns data source objects encapsulating all rod data; - * individual rods can be accessed through the returned data source objects. - * - * @return the rods data sources, never {@code null}. 
- */ - public List getRodDataSources() { - return this.rodDataSources; - } - - /** - * Gets cumulative metrics about the entire run to this point. - * Returns a clone of this snapshot in time. - * @return cumulative metrics about the entire run at this point. ReadMetrics object is a unique instance and is - * owned by the caller; the caller can do with the object what they wish. - */ - public ReadMetrics getCumulativeMetrics() { - // todo -- probably shouldn't be lazy - if ( cumulativeMetrics == null ) - cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics(); - return cumulativeMetrics; - } - - /** - * Return the global ThreadEfficiencyMonitor, if there is one - * - * @return the monitor, or null if none is active - */ - public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { - return threadEfficiencyMonitor; - } - - // ------------------------------------------------------------------------------------- - // - // code for working with Samples database - // - // ------------------------------------------------------------------------------------- - - public SampleDB getSampleDB() { - return this.sampleDB; - } - - public Map getApproximateCommandLineArguments(Object... argumentProviders) { - return CommandLineUtils.getApproximateCommandLineArguments(parsingEngine,argumentProviders); - } - - public String createApproximateCommandLineArgumentString(Object... 
argumentProviders) { - return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders); - } - - // ------------------------------------------------------------------------------------- - // - // code for working with progress meter - // - // ------------------------------------------------------------------------------------- - - /** - * Register the global progress meter with this engine - * - * Calling this function more than once will result in an IllegalStateException - * - * @param meter a non-null progress meter - */ - public void registerProgressMeter(final ProgressMeter meter) { - if ( meter == null ) throw new IllegalArgumentException("Meter cannot be null"); - if ( progressMeter != null ) throw new IllegalStateException("Progress meter already set"); - - progressMeter = meter; - } - - /** - * Get the progress meter being used by this engine. May be null if no meter has been registered yet - * @return a potentially null pointer to the progress meter - */ - public ProgressMeter getProgressMeter() { - return progressMeter; - } - - /** - * Does the current runtime in unit exceed the runtime limit, if one has been provided? 
- * - * @return false if not limit was requested or if runtime <= the limit, true otherwise - */ - public boolean exceedsRuntimeLimit() { - if ( progressMeter == null ) - // not yet initialized or not set because of testing - return false; - - if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT ) - return false; - else { - final long runtime = progressMeter.getRuntimeInNanosecondsUpdatedPeriodically(); - if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime); - final long maxRuntimeNano = getRuntimeLimitInNanoseconds(); - return runtime > maxRuntimeNano; - } - } - - /** - * @return the runtime limit in nanoseconds, or -1 if no limit was specified - */ - public long getRuntimeLimitInNanoseconds() { - return runtimeLimitInNanoseconds; - } - - /** - * Setup the runtime limits for this engine, updating the runtimeLimitInNanoseconds - * as appropriate - * - * @param args the GATKArgumentCollection to retrieve our runtime limits from - */ - private void setupRuntimeLimits(final GATKArgumentCollection args) { - if ( args.maxRuntime == NO_RUNTIME_LIMIT ) - runtimeLimitInNanoseconds = -1; - else if (args.maxRuntime < 0 ) - throw new UserException.BadArgumentValue("maxRuntime", "must be >= 0 or == -1 (meaning no limit) but received negative value " + args.maxRuntime); - else { - runtimeLimitInNanoseconds = TimeUnit.NANOSECONDS.convert(args.maxRuntime, args.maxRuntimeUnits); - } - } - - /** - * Returns the sample list including all samples. - * @return never {@code null}. - */ - public SampleList getSampleList() { - return new IndexedSampleList(getSampleDB().getSampleNames()); - } - - /** - * Returns the sample list including samples in read inputs. - * @return never {@code null}. 
- */ - public SampleList getReadSampleList() { - return new IndexedSampleList(SampleUtils.getSAMFileSamples(getSAMFileHeader())); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java deleted file mode 100644 index 6ee9ad3a4..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/ReadProperties.java +++ /dev/null @@ -1,198 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileReader; -import htsjdk.samtools.ValidationStringency; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; - -import java.util.Collection; -import java.util.List; -/** - * User: hanna - * Date: May 14, 2009 - * Time: 4:06:26 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A data structure containing information about the reads data sources as well as - * information about how they should be downsampled, sorted, and filtered. 
- */ -public class ReadProperties { - private final Collection readers; - private final SAMFileHeader header; - private final SAMFileHeader.SortOrder sortOrder; - private final ValidationStringency validationStringency; - private final DownsamplingMethod downsamplingMethod; - private final ValidationExclusion exclusionList; - private final Collection supplementalFilters; - private final List readTransformers; - private final boolean keepUniqueReadListInLIBS; - private final boolean includeReadsWithDeletionAtLoci; - private final boolean useOriginalBaseQualities; - private final byte defaultBaseQualities; - - /** - * Return true if the walker wants to see reads that contain deletions when looking at locus pileups - * - * @return - */ - public boolean includeReadsWithDeletionAtLoci() { - return includeReadsWithDeletionAtLoci; - } - - public boolean keepUniqueReadListInLIBS() { - return keepUniqueReadListInLIBS; - } - - /** - * Gets a list of the files acting as sources of reads. - * @return A list of files storing reads data. - */ - public Collection getSAMReaderIDs() { - return readers; - } - - /** - * Gets the sam file header - * @return the sam file header - */ - public SAMFileHeader getHeader() { - return header; - } - - /** - * Gets the sort order of the reads - * @return the sort order of the reads - */ - public SAMFileHeader.SortOrder getSortOrder() { - return sortOrder; - } - - /** - * How strict should validation be? - * @return Stringency of validation. - */ - public ValidationStringency getValidationStringency() { - return validationStringency; - } - - /** - * Gets the method and parameters used when downsampling reads. - * @return Downsample fraction. - */ - public DownsamplingMethod getDownsamplingMethod() { - return downsamplingMethod; - } - - /** - * Return whether to 'verify' the reads as we pass through them. - * @return Whether to verify the reads. 
- */ - public ValidationExclusion getValidationExclusionList() { - return exclusionList; - } - - public Collection getSupplementalFilters() { - return supplementalFilters; - } - - - public List getReadTransformers() { - return readTransformers; - } - - /** - * Return whether to use original base qualities. - * @return Whether to use original base qualities. - */ - public boolean useOriginalBaseQualities() { - return useOriginalBaseQualities; - } - - /** - * @return Default base quality value to fill reads missing base quality information. - */ - public byte defaultBaseQualities() { - return defaultBaseQualities; - } - - /** - * Extract the command-line arguments having to do with reads input - * files and store them in an easy-to-work-with package. Constructor - * is package protected. - * @param samFiles list of reads files. - * @param header sam file header. - * @param useOriginalBaseQualities True if original base qualities should be used. - * @param strictness Stringency of reads file parsing. - * @param downsamplingMethod Method for downsampling reads at a given locus. - * @param exclusionList what safety checks we're willing to let slide - * @param supplementalFilters additional filters to dynamically apply. - * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method - * will explicitly list reads with deletion over the current reference base; otherwise, only observed - * bases will be seen in the pileups, and the deletions will be skipped silently. - * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. 
- * @param keepUniqueReadListInLIBS If true, we will tell LocusIteratorByState to track the unique reads it sees - * This is really useful for ActiveRegionTraversals - */ - public ReadProperties( Collection samFiles, - SAMFileHeader header, - SAMFileHeader.SortOrder sortOrder, - boolean useOriginalBaseQualities, - ValidationStringency strictness, - DownsamplingMethod downsamplingMethod, - ValidationExclusion exclusionList, - Collection supplementalFilters, - List readTransformers, - boolean includeReadsWithDeletionAtLoci, - byte defaultBaseQualities, - final boolean keepUniqueReadListInLIBS) { - this.readers = samFiles; - this.header = header; - this.sortOrder = sortOrder; - this.validationStringency = strictness; - this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod; - this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList; - this.supplementalFilters = supplementalFilters; - this.readTransformers = readTransformers; - this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci; - this.useOriginalBaseQualities = useOriginalBaseQualities; - this.defaultBaseQualities = defaultBaseQualities; - this.keepUniqueReadListInLIBS = keepUniqueReadListInLIBS; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java deleted file mode 100644 index fb9d48903..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/WalkerManager.java +++ /dev/null @@ -1,431 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or 
sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine; - -import org.broadinstitute.gatk.engine.walkers.*; -import org.broadinstitute.gatk.utils.commandline.Hidden; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.filters.FilterManager; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.help.ResourceBundleExtractorDoclet; -import org.broadinstitute.gatk.utils.text.TextFormattingUtils; - -import java.lang.annotation.Annotation; -import java.util.*; - -/** - * Plugin manager that also provides various utilities for inspecting Walkers. - */ -public class WalkerManager extends PluginManager { - - /** - * A collection of help text for walkers and their enclosing packages. 
- */ - private ResourceBundle helpText; - - public WalkerManager() { - super(Walker.class,"walker",""); - helpText = TextFormattingUtils.loadResourceBundle("GATKText"); - } - - /** - * Get the list of walkers currently available to the GATK, organized - * by package. - * @param visibleWalkersOnly If true, return only the walker names that aren't hidden. - * @return Names of currently available walkers. - */ - public Map>> getWalkerNamesByPackage(boolean visibleWalkersOnly) { - Map>> walkersByPackage = new HashMap>>(); - for(Class walker: getPlugins()) { - if(visibleWalkersOnly && isHidden(walker)) - continue; - - // Extract the name for the package; if the walker is in the unnamed package, use the empty string - String walkerPackage = walker.getPackage() != null ? walker.getPackage().getName() : ""; - if(!walkersByPackage.containsKey(walkerPackage)) - walkersByPackage.put(walkerPackage,new ArrayList>()); - walkersByPackage.get(walkerPackage).add(walker); - } - return Collections.unmodifiableMap(walkersByPackage); - } - - /** - * Gets the display name for a given package. - * @param packageName Fully qualified package name. - * @return A suitable display name for the package. - */ - public String getPackageDisplayName(String packageName) { - // ...try to compute the override from the text of the package name, while accounting for - // unpackaged walkers. - String displayName = packageName.substring(packageName.lastIndexOf('.')+1); - if (displayName.trim().equals("")) displayName = ""; - return displayName; - } - - /** - * Gets the help text associated with a given package name. - * @param packageName Package for which to search for help text. - * @return Package help text, or "" if none exists. 
- */ - public String getPackageSummaryText(String packageName) { - String key = String.format("%s.%s",packageName, ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); - if(!helpText.containsKey(key)) - return ""; - return helpText.getString(key); - } - - /** - * Gets the summary help text associated with a given walker type. - * @param walkerType Type of walker for which to search for help text. - * @return Walker summary description, or "" if none exists. - */ - public String getWalkerSummaryText(Class walkerType) { - String walkerSummary = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.SUMMARY_TAGLET_NAME); - if(!helpText.containsKey(walkerSummary)) - return ""; - return helpText.getString(walkerSummary); - } - - /** - * Gets the summary help text associated with a given walker type. - * @param walker Walker for which to search for help text. - * @return Walker summary description, or "" if none exists. - */ - public String getWalkerSummaryText(Walker walker) { - return getWalkerSummaryText(walker.getClass()); - } - - /** - * Gets the descriptive help text associated with a given walker type. - * @param walkerType Type of walker for which to search for help text. - * @return Walker full description, or "" if none exists. - */ - public String getWalkerDescriptionText(Class walkerType) { - String walkerDescription = String.format("%s.%s",walkerType.getName(), ResourceBundleExtractorDoclet.DESCRIPTION_TAGLET_NAME); - if(!helpText.containsKey(walkerDescription)) - return ""; - return helpText.getString(walkerDescription); - } - - /** - * Gets the descriptive help text associated with a given walker type. - * @param walker Walker for which to search for help text. - * @return Walker full description, or "" if none exists. - */ - public String getWalkerDescriptionText(Walker walker) { - return getWalkerDescriptionText(walker.getClass()); - } - - /** - * Retrieves the walker class given a walker name. - * @param walkerName Name of the walker. 
- * @return Class representing the walker. - */ - public Class getWalkerClassByName(String walkerName) { - return getPluginsByName().get(walkerName); - } - - /** - * Gets the data source for the provided walker. - * @param walkerClass The class of the walker. - * @return Which type of data source to traverse over...reads or reference? - */ - public static DataSource getWalkerDataSource(Class walkerClass) { - By byDataSource = walkerClass.getAnnotation(By.class); - if( byDataSource == null ) - throw new ReviewedGATKException("Unable to find By annotation for walker class " + walkerClass.getName()); - return byDataSource.value(); - } - - /** - * Gets the data source for the provided walker. - * @param walker The walker. - * @return Which type of data source to traverse over...reads or reference? - */ - public static DataSource getWalkerDataSource(Walker walker) { - return getWalkerDataSource(walker.getClass()); - } - - /** - * Get a list of RODs allowed by the walker. - * @param walkerClass Class of the walker to query. - * @return The list of allowed reference meta data. - */ - public static List getAllowsMetaData(Class walkerClass) { - return Collections.emptyList(); - } - - /** - * Determine whether the given walker supports the given data source. - * @param walkerClass Class of the walker to query. - * @param dataSource Source to check for . - * @return True if the walker forbids this data type. False otherwise. - */ - public static boolean isAllowed(Class walkerClass, DataSource dataSource) { - Allows allowsDataSource = getWalkerAllowed(walkerClass); - - // Allows is less restrictive than requires. If an allows - // clause is not specified, any kind of data is allowed. - if( allowsDataSource == null ) - return true; - - return Arrays.asList(allowsDataSource.value()).contains(dataSource); - } - - /** - * Determine whether the given walker supports the given data source. - * @param walker Walker to query. - * @param dataSource Source to check for . 
- * @return True if the walker forbids this data type. False otherwise. - */ - public static boolean isAllowed(Walker walker, DataSource dataSource) { - return isAllowed(walker.getClass(), dataSource); - } - - /** - * Determine whether the given walker supports the given reference ordered data. - * @param walkerClass Class of the walker to query. - * @param rod Source to check. - * @return True if the walker forbids this data type. False otherwise. - */ - public static boolean isAllowed(Class walkerClass, ReferenceOrderedDataSource rod) { - return true; - } - - /** - * Determine whether the given walker supports the given reference ordered data. - * @param walker Walker to query. - * @param rod Source to check. - * @return True if the walker forbids this data type. False otherwise. - */ - public static boolean isAllowed(Walker walker, ReferenceOrderedDataSource rod) { - return isAllowed(walker.getClass(), rod); - } - - /** - * Determine whether the given walker requires the given data source. - * @param walkerClass Class of the walker to query. - * @param dataSource Source to check for. - * @return True if the walker allows this data type. False otherwise. - */ - public static boolean isRequired(Class walkerClass, DataSource dataSource) { - Requires requiresDataSource = getWalkerRequirements(walkerClass); - return Arrays.asList(requiresDataSource.value()).contains(dataSource); - } - - /** - * Determine whether the given walker requires the given data source. - * @param walker Walker to query. - * @param dataSource Source to check for. - * @return True if the walker allows this data type. False otherwise. - */ - public static boolean isRequired(Walker walker, DataSource dataSource) { - return isRequired(walker.getClass(), dataSource); - } - - /** - * Get a list of RODs required by the walker. - * @param walkerClass Class of the walker to query. - * @return The list of required reference meta data. 
- */ - public static List getRequiredMetaData(Class walkerClass) { - return Collections.emptyList(); - } - - /** - * Get a list of RODs required by the walker. - * @param walker Walker to query. - * @return The list of required reference meta data. - */ - public static List getRequiredMetaData(Walker walker) { - return getRequiredMetaData(walker.getClass()); - } - - /** - * Reports whether this walker type is hidden -- in other words, whether it'll appear in the help output. - * @param walkerType Class to test for visibility. - * @return True if the walker should be hidden. False otherwise. - */ - public static boolean isHidden(Class walkerType) { - return walkerType.isAnnotationPresent(Hidden.class); - } - - /** - * Extracts filters that the walker has requested be run on the dataset. - * @param walkerClass Class of the walker to inspect for filtering requests. - * @param filterManager Manages the creation of filters. - * @return A non-empty list of filters to apply to the reads. - */ - public static List getReadFilters(Class walkerClass, FilterManager filterManager) { - List filters = new ArrayList(); - for(Class filterType: getReadFilterTypes(walkerClass)) - filters.add(filterManager.createFilterByType(filterType)); - return filters; - } - - /** - * Extracts filters that the walker has requested be run on the dataset. - * @param walker Walker to inspect for filtering requests. - * @param filterManager Manages the creation of filters. - * @return A non-empty list of filters to apply to the reads. - */ - public static List getReadFilters(Walker walker, FilterManager filterManager) { - return getReadFilters(walker.getClass(), filterManager); - } - - /** - * Gets the type of downsampling method requested by the walker. If an alternative - * downsampling method is specified on the command-line, the command-line version will - * be used instead. - * @param walker The walker to interrogate. - * @return The downsampling method, as specified by the walker. 
Null if none exists. - */ - public static DownsamplingMethod getDownsamplingMethod( Walker walker ) { - return getDownsamplingMethod(walker.getClass()); - } - - /** - * Gets the type of downsampling method requested by the walker. If an alternative - * downsampling method is specified on the command-line, the command-line version will - * be used instead. - * @param walkerClass The class of the walker to interrogate. - * @return The downsampling method, as specified by the walker. Null if none exists. - */ - public static DownsamplingMethod getDownsamplingMethod( Class walkerClass ) { - DownsamplingMethod downsamplingMethod = null; - - if( walkerClass.isAnnotationPresent(Downsample.class) ) { - Downsample downsampleParameters = walkerClass.getAnnotation(Downsample.class); - DownsampleType type = downsampleParameters.by(); - Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; - Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; - downsamplingMethod = new DownsamplingMethod(type, toCoverage, toFraction); - } - - return downsamplingMethod; - } - - public static T getWalkerAnnotation(final Walker walker, final Class clazz) { - return walker.getClass().getAnnotation(clazz); - } - - public static ReadTransformer.ApplicationTime getBAQApplicationTime(Walker walker) { - return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime(); - } - - /** - * Create a name for this type of walker. - * - * @param walkerType The type of walker. - * @return A name for this type of walker. - */ - @Override - public String getName(Class walkerType) { - String walkerName = ""; - - if (walkerType.getAnnotation(WalkerName.class) != null) - walkerName = ((WalkerName)walkerType.getAnnotation(WalkerName.class)).value().trim(); - else - walkerName = super.getName(walkerType); - - return walkerName; - } - - /** - * Utility to get the requires attribute from the walker. 
- * Throws an exception if requirements are missing. - * @param walkerClass Class of the walker to query for required data. - * @return Required data attribute. - */ - private static Requires getWalkerRequirements(Class walkerClass) { - Requires requiresDataSource = walkerClass.getAnnotation(Requires.class); - if( requiresDataSource == null ) - throw new ReviewedGATKException( "Unable to find data types required by walker class " + walkerClass.getName()); - return requiresDataSource; - } - - /** - * Utility to get the requires attribute from the walker. - * Throws an exception if requirements are missing. - * @param walker Walker to query for required data. - * @return Required data attribute. - */ - private static Requires getWalkerRequirements(Walker walker) { - return getWalkerRequirements(walker.getClass()); - } - - /** - * Utility to get the forbidden attribute from the walker. - * @param walkerClass Class of the walker to query for required data. - * @return Required data attribute. Null if forbidden info isn't present. - */ - private static Allows getWalkerAllowed(Class walkerClass) { - Allows allowsDataSource = walkerClass.getAnnotation(Allows.class); - return allowsDataSource; - } - - /** - * Utility to get the forbidden attribute from the walker. - * @param walker Walker to query for required data. - * @return Required data attribute. Null if forbidden info isn't present. - */ - private static Allows getWalkerAllowed(Walker walker) { - return getWalkerAllowed(walker.getClass()); - } - - /** - * Gets the list of filtering classes specified as walker annotations. - * @param walkerClass Class of the walker to inspect. - * @return An array of types extending from SamRecordFilter. Will never be null. 
- */ - public static Collection> getReadFilterTypes(Class walkerClass) { - List> filterTypes = new ArrayList>(); - while(walkerClass != null) { - if(walkerClass.isAnnotationPresent(ReadFilters.class)) { - for ( Class c : walkerClass.getAnnotation(ReadFilters.class).value() ) { - if( !filterTypes.contains(c) ) - filterTypes.add(c); - } - } - walkerClass = walkerClass.getSuperclass(); - } - return filterTypes; - } - - /** - * Gets the list of filtering classes specified as walker annotations. - * @param walker The walker to inspect. - * @return An array of types extending from SamRecordFilter. Will never be null. - */ - public static Collection> getReadFilterTypes(Walker walker) { - return getReadFilterTypes(walker.getClass()); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java deleted file mode 100644 index 05834f71b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java +++ /dev/null @@ -1,628 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.arguments; - -import htsjdk.samtools.ValidationStringency; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; -import org.broadinstitute.gatk.engine.samples.PedigreeValidationType; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.baq.BAQ; -import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; - -import java.io.File; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.TimeUnit; - -/** - * @author aaron - * @version 1.0 - */ -public class GATKArgumentCollection { - - /** the constructor */ - public GATKArgumentCollection() { - } - - // parameters and their defaults - /** - * An input file containing sequence data mapped to a reference, in SAM or BAM format, or a text file containing a - * list of input files (with extension .list). Note that the GATK requires an accompanying index for each SAM or - * BAM file. Please see our online documentation for more details on input formatting requirements. 
- */ - @Input(fullName = "input_file", shortName = "I", doc = "Input file containing sequence data (SAM or BAM)", required = false) - public List samFiles = new ArrayList<>(); - - @Hidden - @Argument(fullName = "showFullBamList",doc="Emit a log entry (level INFO) containing the full list of sequence data files to be included in the analysis (including files inside .bam.list files).") - public Boolean showFullBamList = false; - - @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false, minValue = 0) - public Integer readBufferSize = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // GATKRunReport options - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * By default, GATK generates a run report that is uploaded to a cloud-based service. This report contains basic - * statistics about the run (which tool was used, whether the run was successful etc.) that help us for debugging - * and development. Up to version 3.2-2 the run report contains a record of the username and hostname associated - * with the run, but it does **NOT** contain any information that could be used to identify patient data. - * Nevertheless, if your data is subject to stringent confidentiality clauses (no outside communication) or if your - * run environment is not connected to the internet, you can disable the reporting system by seeting this option to - * "NO_ET". You will also need to request a key using the online request form on our website (se FAQs). 
- */ - @Argument(fullName = "phone_home", shortName = "et", doc="Run reporting mode", required = false) - public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.AWS; - /** - * Please see the "phone_home" argument below and the online documentation FAQs for more details on the key system - * and how to request a key. - */ - @Argument(fullName = "gatk_key", shortName = "K", doc="GATK key file required to run with -et NO_ET", required = false) - public File gatkKeyFile = null; - - /** - * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary tag that can be - * used to group together runs during later analysis. One use of this capability is to tag runs as GATK - * performance tests, so that the performance of the GATK over time can be assessed from the logs directly. - * - * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find - * meaningful. - */ - @Argument(fullName = "tag", shortName = "tag", doc="Tag to identify this GATK run as part of a group of runs", required = false) - public String tag = "NA"; - - // -------------------------------------------------------------------------------------------------------------- - // - // General features - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Reads that fail the specified filters will not be used in the analysis. Multiple filters can be specified separately, - * e.g. you can do -rf MalformedRead -rf BadCigar and so on. Available read filters are listed in the online tool - * documentation. Note that the read name format is e.g. MalformedReadFilter, but at the command line the filter - * name should be given without the Filter suffix; e.g. -rf MalformedRead (NOT -rf MalformedReadFilter, which is not - * recognized by the program). 
Note also that some read filters are applied by default for some analysis tools; this - * is specified in each tool's documentation. The default filters cannot be disabled. - */ - @Argument(fullName = "read_filter", shortName = "rf", doc = "Filters to apply to reads before analysis", required = false) - public final List readFilters = new ArrayList<>(); - - @ArgumentCollection - public IntervalArgumentCollection intervalArguments = new IntervalArgumentCollection(); - /** - * The reference genome against which the sequence data was mapped. The GATK requires an index file and a dictionary - * file accompanying the reference (please see the online documentation FAQs for more details on these files). Although - * this argument is indicated as being optional, almost all GATK tools require a reference in order to run. - * Note also that while GATK can in theory process genomes from any organism with any number of chromosomes or contigs, - * it is not designed to process draft genome assemblies and performance will decrease as the number of contigs in - * the reference increases. We strongly discourage the use of unfinished genome assemblies containing more than a few - * hundred contigs. Contig numbers in the thousands will most probably cause memory-related crashes. - */ - @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) - public File referenceFile = null; - /** - * If this flag is enabled, the random numbers generated will be different in every run, causing GATK to behave non-deterministically. - */ - @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Use a non-deterministic random seed", required = false) - public boolean nonDeterministicRandomSeed = false; - /** - * To be used in the testing framework where dynamic parallelism can result in differing numbers of calls to the random generator. 
- */ - @Hidden - @Argument(fullName = "disableDithering",doc="Completely eliminates randomized dithering from rank sum tests.") - public boolean disableDithering = false; - /** - * This will truncate the run but without exiting with a failure. By default the value is interpreted in minutes, but this can be changed with the maxRuntimeUnits argument. - */ - @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="Stop execution cleanly as soon as maxRuntime has been reached", required = false) - public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT; - - @Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="Unit of time used by maxRuntime", required = false) - public TimeUnit maxRuntimeUnits = TimeUnit.MINUTES; - - // -------------------------------------------------------------------------------------------------------------- - // - // Downsampling Arguments - // - // -------------------------------------------------------------------------------------------------------------- - /** - * There are several ways to downsample reads, i.e. to removed reads from the pile of reads that will be used for analysis. - * See the documentation of the individual downsampling options for details on how they work. Note that Many GATK tools - * specify a default downsampling type and target, but this behavior can be overridden from command line using the - * downsampling arguments. - */ - @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of read downsampling to employ at a given locus", required = false) - public DownsampleType downsamplingType = null; - /** - * Reads will be downsampled so the specified fraction remains; e.g. if you specify -dfrac 0.25, three-quarters of - * the reads will be removed, and the remaining one quarter will be used in the analysis. This method of downsampling - * is truly unbiased and random. 
It is typically used to simulate the effect of generating different amounts of - * sequence data for a given sample. For example, you can use this in a pilot experiment to evaluate how much target - * coverage you need to aim for in order to obtain enough coverage in all loci of interest. - */ - @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction of reads to downsample to", required = false, minValue = 0.0, maxValue = 1.0) - public Double downsampleFraction = null; - - /** - * The principle of this downsampling type is to downsample reads to a given capping threshold coverage. Its purpose is to - * get rid of excessive coverage, because above a certain depth, having additional data is not informative and imposes - * unreasonable computational costs. The downsampling process takes two different forms depending on the type of - * analysis it is used with. - * - * For locus-based traversals (LocusWalkers like UnifiedGenotyper and ActiveRegionWalkers like HaplotypeCaller), - * downsample_to_coverage controls the maximum depth of coverage at each locus. For read-based traversals - * (ReadWalkers like BaseRecalibrator), it controls the maximum number of reads sharing the same alignment start - * position. For ReadWalkers you will typically need to use much lower dcov values than you would with LocusWalkers - * to see an effect. Note that this downsampling option does not produce an unbiased random sampling from all available - * reads at each locus: instead, the primary goal of the to-coverage downsampler is to maintain an even representation - * of reads from all alignment start positions when removing excess coverage. For a truly unbiased random sampling of - * reads, use -dfrac instead. Also note that the coverage target is an approximate goal that is not guaranteed to be - * met exactly: the downsampling algorithm will under some circumstances retain slightly more or less coverage than - * requested. 
- */ - @Argument(fullName = "downsample_to_coverage", shortName = "dcov", - doc = "Target coverage threshold for downsampling to coverage", - required = false, minValue = 0) - public Integer downsampleCoverage = null; - - /** - * Gets the downsampling method explicitly specified by the user. If the user didn't specify - * a default downsampling mechanism, return the default. - * @return The explicitly specified downsampling mechanism, or the default if none exists. - */ - public DownsamplingMethod getDownsamplingMethod() { - if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) - return null; - - return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction); - } - - /** - * Set the downsampling method stored in the argument collection so that it is read back out when interrogating the command line arguments. - * @param method The downsampling mechanism. - */ - public void setDownsamplingMethod(DownsamplingMethod method) { - if (method == null) - throw new IllegalArgumentException("method is null"); - - downsamplingType = method.type; - downsampleCoverage = method.toCoverage; - downsampleFraction = method.toFraction; - } - - // -------------------------------------------------------------------------------------------------------------- - // - // BAQ arguments - // - // -------------------------------------------------------------------------------------------------------------- - @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) - public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF; - /** - * Phred-scaled gap open penalty for BAQ calculation. Although the default value is 40, a value of 30 may be better for whole genome call sets. 
- */ - @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty", required = false, minValue = 0) - public double BAQGOP = BAQ.DEFAULT_GOP; - - // -------------------------------------------------------------------------------------------------------------- - // - // refactor NDN cigar string arguments - // - // -------------------------------------------------------------------------------------------------------------- - /** - * This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in - * a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptoms. - * You should only use this if you know that your reads have that problem. - */ - @Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", required = false) - public boolean REFACTOR_NDN_CIGAR_READS = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // quality encoding checking arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * By default the GATK assumes that base quality scores start at Q0 == ASCII 33 according to the SAM specification. - * However, encoding in some datasets (especially older Illumina ones) starts at Q64. This argument will fix the - * encodings on the fly (as the data is read in) by subtracting 31 from every quality score. Note that this argument should - * NEVER be used by default; you should only use it when you have confirmed that the quality scores in your data are - * not in the correct encoding. 
- */ - @Argument(fullName = "fix_misencoded_quality_scores", shortName="fixMisencodedQuals", doc="Fix mis-encoded base quality scores", required = false) - public boolean FIX_MISENCODED_QUALS = false; - /** - * This flag tells GATK to ignore warnings when encountering base qualities that are too high and that seemingly - * indicate a problem with the base quality encoding of the BAM file. You should only use this if you really know - * what you are doing; otherwise you could seriously mess up your data and ruin your analysis. - */ - @Argument(fullName = "allow_potentially_misencoded_quality_scores", shortName="allowPotentiallyMisencodedQuals", doc="Ignore warnings about base quality score encoding", required = false) - public boolean ALLOW_POTENTIALLY_MISENCODED_QUALS = false; - /** - * This flag tells GATK to use the original base qualities (that were in the data before BQSR/recalibration) which - * are stored in the OQ tag, if they are present, rather than use the post-recalibration quality scores. If no OQ - * tag is present for a read, the standard qual score will be used. - */ - @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "Use the base quality scores from the OQ tag", required=false) - public Boolean useOriginalBaseQualities = false; - /** - * If reads are missing some or all base quality scores, this value will be used for all base quality scores. - * By default this is set to -1 to disable default base quality assignment. 
- */ - @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "Assign a default base quality", required=false, minValue = 0, maxValue = Byte.MAX_VALUE) - public byte defaultBaseQualities = -1; - - // -------------------------------------------------------------------------------------------------------------- - // - // performance log arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * The file name for the GATK performance log output, or null if you don't want to generate the - * detailed performance logging table. This table is suitable for importing into R or any - * other analysis software that can read tsv files. - */ - @Argument(fullName = "performanceLog", shortName="PF", doc="Write GATK runtime performance log to this file", required = false) - public File performanceLog = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // BQSR arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Enables on-the-fly recalibrate of base qualities, intended primarily for use with BaseRecalibrator and PrintReads - * (see Best Practices workflow documentation). The covariates tables are produced by the BaseRecalibrator tool. - * Please be aware that you should only run recalibration with the covariates file created on the same input bam(s). - */ - @Input(fullName="BQSR", shortName="BQSR", required=false, doc="Input covariates table file for on-the-fly base quality score recalibration") - public File BQSR_RECAL_FILE = null; - - /** - * Turns on the base quantization module. It requires a recalibration report (-BQSR). - * - * A value of 0 here means "do not quantize". - * Any value greater than zero will be used to recalculate the quantization using that many levels. 
- * Negative values mean that we should quantize using the recalibration report's quantization level. - */ - @Hidden - @Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false) - public int quantizationLevels = 0; - - /** - * Turns off printing of the base insertion and base deletion tags when using the -BQSR argument. Only the base substitution qualities will be produced. - */ - @Argument(fullName="disable_indel_quals", shortName = "DIQ", doc = "Disable printing of base insertion and deletion tags (with -BQSR)", required=false) - public boolean disableIndelQuals = false; - - /** - * By default, the OQ tag in not emitted when using the -BQSR argument. Use this flag to include OQ tags in the output BAM file. - * Note that this may results in significant file size increase. - */ - @Argument(fullName="emit_original_quals", shortName = "EOQ", doc = "Emit the OQ tag with the original base qualities (with -BQSR)", required=false) - public boolean emitOriginalQuals = false; - - /** - * This flag tells GATK not to modify quality scores less than this value. Instead they will be written out unmodified in the recalibrated BAM file. - * In general it's unsafe to change qualities scores below < 6, since base callers use these values to indicate random or bad bases. - * For example, Illumina writes Q2 bases when the machine has really gone wrong. This would be fine in and of itself, - * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect, - * your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream. 
- */ - @Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Don't recalibrate bases with quality scores less than this threshold (with -BQSR)", required = false, minValue = 0, minRecommendedValue = QualityUtils.MIN_USABLE_Q_SCORE) - public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE; - /** - * If specified, this value will be used as the prior for all mismatch quality scores instead of the actual reported quality score. - */ - @Argument(fullName = "globalQScorePrior", shortName = "globalQScorePrior", doc = "Global Qscore Bayesian prior to use for BQSR", required = false) - public double globalQScorePrior = -1.0; - - - // -------------------------------------------------------------------------------------------------------------- - // - // Other utility arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Keep in mind that if you set this to LENIENT, we may refuse to provide you with support if anything goes wrong. - */ - @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false) - public ValidationStringency strictnessLevel = ValidationStringency.SILENT; - /** - * Some tools keep program records in the SAM header by default. Use this argument to override that behavior and discard program records for the SAM header. - */ - @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Remove program records from the SAM header", required = false) - public boolean removeProgramRecords = false; - /** - * Some tools discard program records from the SAM header by default. Use this argument to override that behavior and keep program records in the SAM header. 
- */ - @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Keep program records in the SAM header", required = false) - public boolean keepProgramRecords = false; - - /** - * On-the-fly sample renaming works only with single-sample BAM and VCF files. Each line of the mapping file must - * contain the absolute path to a BAM or VCF file, followed by whitespace, followed by the new sample name for that - * BAM or VCF file. The sample name may contain non-tab whitespace, but leading or trailing whitespace will be - * ignored. The engine will verify at runtime that each BAM/VCF targeted for sample renaming has only a single - * sample specified in its header (though, in the case of BAM files, there may be multiple read groups for that - * sample). - */ - @Advanced - @Argument(fullName = "sample_rename_mapping_file", shortName = "sample_rename_mapping_file", doc = "Rename sample IDs on-the-fly at runtime using the provided mapping file", required = false) - public File sampleRenameMappingFile = null; - - /** - * For expert users only who know what they are doing. We do not support usage of this argument, so we may refuse to help you if you use it and something goes wrong. The one exception to this rule is ALLOW_N_CIGAR_READS, which is necessary for RNAseq analysis. - */ - @Argument(fullName = "unsafe", shortName = "U", doc = "Enable unsafe operations: nothing will be checked at runtime", required = false) - public ValidationExclusion.TYPE unsafe; - /** - * UNSAFE FOR GENERAL USE (FOR TEST SUITE USE ONLY). Disable both auto-generation of index files and index file locking - * when reading VCFs and other rods and an index isn't present or is out-of-date. 
The file locking necessary for auto index - * generation to work safely is prone to random failures/hangs on certain platforms, which makes it desirable to disable it - * for situations like test suite runs where the indices are already known to exist, however this option is unsafe in general - * because it allows reading from index files without first acquiring a lock. - */ - @Hidden - @Advanced - @Argument(fullName = "disable_auto_index_creation_and_locking_when_reading_rods", shortName = "disable_auto_index_creation_and_locking_when_reading_rods", - doc = "Disable both auto-generation of index files and index file locking", - required = false) - public boolean disableAutoIndexCreationAndLockingWhenReadingRods = false; - - @Hidden - @Argument(fullName = "no_cmdline_in_header", shortName = "no_cmdline_in_header", doc = "Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", - required = false) - public boolean disableCommandLineInVCF = false; - - @Argument(fullName = "sites_only", shortName = "sites_only", doc = "Just output sites without genotypes (i.e. only the first 8 columns of the VCF)", - required = false) - public boolean sitesOnlyVCF = false; - - /** - *

The VCF specification permits missing records to be dropped from the end of FORMAT fields, so long as GT is always output. - * This option prevents GATK from performing that trimming.

- * - *

For example, given a FORMAT of

GT:AD:DP:PL
, GATK will by default emit
./.
for a variant with - * no reads present (ie, the AD, DP, and PL fields are trimmed). If you specify -writeFullFormat, this record - * would be emitted as
./.:.:.:.

- */ - @Argument(fullName = "never_trim_vcf_format_field", shortName = "writeFullFormat", doc = "Always output all the records in VCF FORMAT fields, even if some are missing", - required = false) - public boolean neverTrimVCFFormatField = false; - - @Hidden - @Argument(fullName = "bcf", shortName = "bcf", doc = "Force BCF output, regardless of the file's extension", - required = false) - public boolean forceBCFOutput = false; - - @Advanced - @Argument(fullName = "bam_compression", shortName = "compress", doc = "Compression level to use for writing BAM files (0 - 9, higher is more compressed)", - minValue = 0, maxValue = 9, required = false) - public Integer bamCompression = null; - - @Advanced - @Argument(fullName = "simplifyBAM", shortName = "simplifyBAM", - doc = "If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well stripping all extended tags from the kept reads except the read group identifier", - required = false) - public boolean simplifyBAM = false; - - @Argument(fullName = "disable_bam_indexing", doc = "Turn off on-the-fly creation of indices for output BAM files.", - required = false) - public boolean disableBAMIndexing = false; - - @Argument(fullName = "generate_md5", doc = "Enable on-the-fly creation of md5s for output BAM files.", - required = false) - public boolean enableBAMmd5 = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // Multi-threading arguments - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Data threads contains N cpu threads per data thread, and act as completely data parallel processing, increasing - * the memory usage of GATK by M data threads. Data threads generally scale extremely effectively, up to 24 cores. 
- * See online documentation FAQs for more information. - */ - @Argument(fullName = "num_threads", shortName = "nt", doc = "Number of data threads to allocate to this analysis", required = false, minValue = 1) - public Integer numberOfDataThreads = 1; - - /** - * Each CPU thread operates the map cycle independently, but may run into earlier scaling problems with IO than - * data threads. Has the benefit of not requiring X times as much memory per thread as data threads do, but rather - * only a constant overhead. See online documentation FAQs for more information. - */ - @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="Number of CPU threads to allocate per data thread", required = false, minValue = 1) - public int numberOfCPUThreadsPerDataThread = 1; - - @Argument(fullName="num_io_threads", shortName = "nit", doc="Number of given threads to allocate to IO", required = false, minValue = 0) - @Hidden - public int numberOfIOThreads = 0; - - /** - * Enable GATK to monitor its own threading efficiency, at an itsy-bitsy tiny - * cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for - * debugging purposes. Note that this argument is not compatible with -nt, it only works with -nct. - */ - @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable threading efficiency monitoring", required = false) - public Boolean monitorThreadEfficiency = false; - - @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="Total number of BAM file handles to keep open simultaneously", required=false, minValue = 1) - public Integer numberOfBAMFileHandles = null; - /** - * This will filter out read groups matching : (e.g. SM:sample1) or a .txt file containing the filter strings one per line. 
- */ - @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Exclude read groups based on tags", required = false) - public List readGroupBlackList = null; - - // -------------------------------------------------------------------------------------------------------------- - // - // PED (pedigree) support - // - // -------------------------------------------------------------------------------------------------------------- - - /** - *

Reads PED file-formatted tabular text files describing meta-data about the samples being - * processed in the GATK.

- * - * - * - *

The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

- * - *
    - *
  • Family ID
  • - *
  • Individual ID
  • - *
  • Paternal ID
  • - *
  • Maternal ID
  • - *
  • Sex (1=male; 2=female; other=unknown)
  • - *
  • Phenotype
  • - *
- * - *

The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. - * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a - * quantitative trait or an affection status column: GATK will automatically detect which type - * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

- * - *

If an individual's sex is unknown, then any character other than 1 or 2 can be used.

- * - *

You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that - * line will be ignored. Do not start any family IDs with this character therefore.

- * - *

Affection status should be coded:

- * - *
    - *
  • -9 missing
  • - *
  • 0 missing
  • - *
  • 1 unaffected
  • - *
  • 2 affected
  • - *
- * - *

If any value outside of -9,0,1,2 is detected than the samples are assumed - * to phenotype values are interpreted as string phenotype values. In this case -9 uniquely - * represents the missing value.

- * - *

Genotypes (column 7 onwards) cannot be specified to the GATK.

- * - *

For example, here are two individuals (one row = one person):

- * - *
-     *   FAM001  1  0 0  1  2
-     *   FAM001  2  0 0  1  2
-     * 
- * - *

Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to - * tell the GATK PED parser that the corresponding fields are missing from the ped file.

- * - *

Note that most GATK walkers do not use pedigree information. Walkers that require pedigree - * data should clearly indicate so in their arguments and will throw errors if required pedigree - * information is missing.

- */ - @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) - public List pedigreeFiles = Collections.emptyList(); - - /** - * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more - * valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString - * as -ped supports - */ - @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) - public List pedigreeStrings = Collections.emptyList(); - - /** - * How strict should we be in parsing the PED files? - */ - @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="Validation strictness for pedigree information",required=false) - public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; - - // -------------------------------------------------------------------------------------------------------------- - // - // BAM indexing and sharding arguments - // - // -------------------------------------------------------------------------------------------------------------- - /** - * NO INTEGRATION TESTS are available. Use at your own risk. - */ - @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM",required=false) - @Hidden - public boolean allowIntervalsWithUnindexedBAM = false; - - // -------------------------------------------------------------------------------------------------------------- - // - // testing BCF2 - // - // -------------------------------------------------------------------------------------------------------------- - /** - * If provided, whenever we create a VCFWriter we will also write out a BCF file alongside it, for testing purposes. 
- */ - @Argument(fullName="generateShadowBCF",shortName = "generateShadowBCF",doc="Write a BCF copy of the output VCF",required=false) - @Hidden - public boolean generateShadowBCF = false; - // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - - // -------------------------------------------------------------------------------------------------------------- - // - // VCF/BCF index parameters - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Specify the Tribble indexing strategy to use for VCFs. - * - * LINEAR creates a LinearIndex with bins of equal width, specified by the Bin Width parameter - * INTERVAL creates an IntervalTreeIndex with bins with an equal amount of features, specified by the Features Per Bin parameter - * DYNAMIC_SEEK attempts to optimize for minimal seek time by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) - * DYNAMIC_SIZE attempts to optimize for minimal index size by choosing an appropriate strategy and parameter (user-supplied parameter is ignored) - */ - @Argument(fullName="variant_index_type",shortName = "variant_index_type",doc="Type of IndexCreator to use for VCF/BCF indices",required=false) - @Advanced - public GATKVCFIndexType variant_index_type = GATKVCFUtils.DEFAULT_INDEX_TYPE; - /** - * This is either the bin width or the number of features per bin, depending on the indexing strategy - */ - @Argument(fullName="variant_index_parameter",shortName = "variant_index_parameter",doc="Parameter to pass to the VCF/BCF IndexCreator",required=false) - @Advanced - public int variant_index_parameter = GATKVCFUtils.DEFAULT_INDEX_PARAMETER; -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/ValidationExclusion.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/ValidationExclusion.java deleted file 
mode 100644 index ccd4fdc44..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/arguments/ValidationExclusion.java +++ /dev/null @@ -1,67 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.arguments; - -import org.broadinstitute.gatk.utils.commandline.EnumerationArgumentDefault; - -import java.util.ArrayList; -import java.util.List; - - -public class ValidationExclusion { - - // our validation options - - public enum TYPE { - ALLOW_N_CIGAR_READS, // ignore the presence of N operators in CIGARs: do not blow up and process reads that contain one or more N operators. - // This exclusion does not have effect on reads that get filtered {@see MalformedReadFilter}. 
- ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard - ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set - NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file - ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities - LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc. - @EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL - ALL // do not check for all of the above conditions, DEFAULT - } - - // a storage for the passed in exclusions - List exclusions = new ArrayList(); - - public ValidationExclusion(List exclusionsList) { - exclusions.addAll(exclusionsList); - } - - public ValidationExclusion() {} - - /** - * do we contain the exclusion specified, or were we set to ALL - * @param t the exclusion case to test for - * @return true if we contain the exclusion or if we're set to ALL, false otherwise - */ - public boolean contains(TYPE t) { - return (exclusions.contains(TYPE.ALL) || exclusions.contains(t)); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContext.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContext.java deleted file mode 100644 index 6ac204865..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContext.java +++ /dev/null @@ -1,154 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the 
rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.contexts; - -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.HasGenomeLocation; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.List; - -/** - * Useful class for forwarding on locusContext data from this iterator - * - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 3:01:34 PM - * To change this template use File | Settings | File Templates. - */ -public class AlignmentContext implements HasGenomeLocation { - protected GenomeLoc loc = null; - protected ReadBackedPileup basePileup = null; - protected boolean hasPileupBeenDownsampled; - - /** - * The number of bases we've skipped over in the reference since the last map invocation. - * Only filled in by RodTraversals right now. By default, nothing is being skipped, so skippedBases == 0. 
- */ - private long skippedBases = 0; - - public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup) { - this(loc, basePileup, 0, false); - } - - public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup, boolean hasPileupBeenDownsampled) { - this(loc, basePileup, 0, hasPileupBeenDownsampled); - } - - public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup, long skippedBases) { - this(loc, basePileup, skippedBases, false); - } - - public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup, long skippedBases,boolean hasPileupBeenDownsampled ) { - if ( loc == null ) throw new ReviewedGATKException("BUG: GenomeLoc in Alignment context is null"); - if ( basePileup == null ) throw new ReviewedGATKException("BUG: ReadBackedPileup in Alignment context is null"); - if ( skippedBases < 0 ) throw new ReviewedGATKException("BUG: skippedBases is -1 in Alignment context"); - - this.loc = loc; - this.basePileup = basePileup; - this.skippedBases = skippedBases; - this.hasPileupBeenDownsampled = hasPileupBeenDownsampled; - } - - /** Returns base pileup over the current genomic location. Deprectated. Use getBasePileup() to make your intentions - * clear. - * @return - */ - @Deprecated - public ReadBackedPileup getPileup() { return basePileup; } - - /** Returns base pileup over the current genomic location. May return null if this context keeps only - * extended event (indel) pileup. - * @return - */ - public ReadBackedPileup getBasePileup() { - return basePileup; - } - - /** - * Returns true if any reads have been filtered out of the pileup due to excess DoC. - * @return True if reads have been filtered out. False otherwise. 
- */ - public boolean hasPileupBeenDownsampled() { return hasPileupBeenDownsampled; } - - /** - * get all of the reads within this context - * - * @return - */ - @Deprecated - //todo: unsafe and tailored for current usage only; both pileups can be null or worse, bot can be not null in theory - public List getReads() { return ( basePileup.getReads() ); } - - /** - * Are there any reads associated with this locus? - * - * @return - */ - public boolean hasReads() { - return basePileup != null && basePileup.getNumberOfElements() > 0 ; - } - - /** - * How many reads cover this locus? - * @return - */ - public int size() { - return basePileup.getNumberOfElements(); - } - - /** - * get a list of the equivalent positions within in the reads at Pos - * - * @return - */ - @Deprecated - public List getOffsets() { - return basePileup.getOffsets(); - } - - public String getContig() { return getLocation().getContig(); } - public long getPosition() { return getLocation().getStart(); } - public GenomeLoc getLocation() { return loc; } - - public void downsampleToCoverage(int coverage) { - basePileup = basePileup.getDownsampledPileup(coverage); - hasPileupBeenDownsampled = true; - } - - /** - * Returns the number of bases we've skipped over in the reference since the last map invocation. - * Only filled in by RodTraversals right now. A value of 0 indicates that no bases were skipped. 
- * - * @return the number of skipped bases - */ - public long getSkippedBases() { - return skippedBases; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContextUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContextUtils.java deleted file mode 100644 index afeb1e735..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/AlignmentContextUtils.java +++ /dev/null @@ -1,150 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.contexts; - -import htsjdk.samtools.SAMReadGroupRecord; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.pileup.*; - -import java.util.*; - -/** - * Useful utilities for storing different AlignmentContexts - * User: ebanks - */ -public class AlignmentContextUtils { - - // Definitions: - // COMPLETE = full alignment context - // FORWARD = reads on forward strand - // REVERSE = reads on forward strand - // - public enum ReadOrientation { COMPLETE, FORWARD, REVERSE } - - private AlignmentContextUtils() { - // cannot be instantiated - } - - /** - * Returns a potentially derived subcontext containing only forward, reverse, or in fact all reads - * in alignment context context. - * - * @param context - * @param type - * @return - */ - public static AlignmentContext stratify(AlignmentContext context, ReadOrientation type) { - switch(type) { - case COMPLETE: - return context; - case FORWARD: - return new AlignmentContext(context.getLocation(),context.getPileup().getPositiveStrandPileup()); - case REVERSE: - return new AlignmentContext(context.getLocation(),context.getPileup().getNegativeStrandPileup()); - default: - throw new ReviewedGATKException("Unable to get alignment context for type = " + type); - } - } - - public static Map splitContextBySampleName(AlignmentContext context) { - return splitContextBySampleName(context, null); - } - - /** - * Splits the given AlignmentContext into a StratifiedAlignmentContext per sample, but referencd by sample name instead - * of sample object. 
- * - * @param context the original pileup - * - * @return a Map of sample name to StratifiedAlignmentContext - * - **/ - public static Map splitContextBySampleName(AlignmentContext context, String assumedSingleSample) { - GenomeLoc loc = context.getLocation(); - HashMap contexts = new HashMap(); - - for(String sample: context.getPileup().getSamples()) { - ReadBackedPileup pileupBySample = context.getPileup().getPileupForSample(sample); - - // Don't add empty pileups to the split context. - if(pileupBySample.getNumberOfElements() == 0) - continue; - - if(sample != null) - contexts.put(sample, new AlignmentContext(loc, pileupBySample)); - else { - if(assumedSingleSample == null) { - throw new UserException.ReadMissingReadGroup(pileupBySample.iterator().next().getRead()); - } - contexts.put(assumedSingleSample,new AlignmentContext(loc, pileupBySample)); - } - } - - return contexts; - } - - /** - * Splits the AlignmentContext into one context per read group - * - * @param context the original pileup - * @return a Map of ReadGroup to AlignmentContext, or an empty map if context has no base pileup - * - **/ - public static Map splitContextByReadGroup(AlignmentContext context, Collection readGroups) { - HashMap contexts = new HashMap(); - - for (SAMReadGroupRecord rg : readGroups) { - ReadBackedPileup rgPileup = context.getBasePileup().getPileupForReadGroup(rg.getReadGroupId()); - if ( rgPileup != null ) // there we some reads for RG - contexts.put(rg, new AlignmentContext(context.getLocation(), rgPileup)); - } - - return contexts; - } - - public static Map splitContextBySampleName(ReadBackedPileup pileup) { - return splitContextBySampleName(new AlignmentContext(pileup.getLocation(), pileup)); - } - - - public static AlignmentContext joinContexts(Collection contexts) { - // validation - GenomeLoc loc = contexts.iterator().next().getLocation(); - for(AlignmentContext context: contexts) { - if(!loc.equals(context.getLocation())) - throw new ReviewedGATKException("Illegal 
attempt to join contexts from different genomic locations"); - } - - List pe = new ArrayList(); - for(AlignmentContext context: contexts) { - for(PileupElement pileupElement: context.basePileup) - pe.add(pileupElement); - } - return new AlignmentContext(loc, new ReadBackedPileupImpl(loc,pe)); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/ReferenceContext.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/ReferenceContext.java deleted file mode 100644 index 201ea49fd..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/contexts/ReferenceContext.java +++ /dev/null @@ -1,217 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.contexts; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -/** - * The section of the reference that overlaps with the given - * read / locus. - * - * @author hanna - * @version 0.1 - */ -public class ReferenceContext { - /** - * Facilitates creation of new GenomeLocs. - */ - final private GenomeLocParser genomeLocParser; - - /** - * The locus. - */ - final private GenomeLoc locus; - - /** - * The window of reference information around the current locus. - */ - final private GenomeLoc window; - - /** - * The bases in the window around the current locus. If null, then bases haven't been fetched yet. - * Bases are always upper cased - */ - private byte[] basesCache = null; - - /** - * Lazy loader to fetch reference bases - */ - final private ReferenceContextRefProvider basesProvider; - - /** - * Interface to create byte[] contexts for lazy loading of the reference - */ - public static interface ReferenceContextRefProvider { - /** - * You must provide a routine that gets the byte[] bases that would have been passed into the - * ReferenceContext. The RC will handling caching. The value of this interface and routine is - * that it is only called when the bytes are actually requested by the walker, not up front. So - * if the walker doesn't need the refBases for whatever reason, there's no overhead to - * provide them. 
- * - * @return - */ - @Ensures({"result != null"}) - public byte[] getBases(); - } - - private static class ForwardingProvider implements ReferenceContextRefProvider { - byte[] bases; - - public ForwardingProvider( byte base ) { - this(new byte[] { base }); - } - - public ForwardingProvider( byte[] bases ) { - this.bases = bases; - } - - public byte[] getBases() { return bases; } - } - - /** - * Contructor for a simple, windowless reference context. - * @param locus locus of interest. - * @param base reference base at that locus. - */ - @Requires({ - "genomeLocParser != null", - "locus != null", - "locus.size() > 0"}) - public ReferenceContext( GenomeLocParser genomeLocParser, GenomeLoc locus, byte base ) { - this( genomeLocParser, locus, locus, new ForwardingProvider(base) ); - } - - @Requires({ - "genomeLocParser != null", - "locus != null", - "locus.size() > 0", - "window != null", - "window.size() > 0", - "bases != null && bases.length > 0"}) - public ReferenceContext( GenomeLocParser genomeLocParser, GenomeLoc locus, GenomeLoc window, byte[] bases ) { - this( genomeLocParser, locus, window, new ForwardingProvider(bases) ); - } - - @Requires({ - "genomeLocParser != null", - "locus != null", - "locus.size() > 0", - "window != null", - "window.size() > 0", - "basesProvider != null"}) - public ReferenceContext( GenomeLocParser genomeLocParser, GenomeLoc locus, GenomeLoc window, ReferenceContextRefProvider basesProvider ) { - this.genomeLocParser = genomeLocParser; - this.locus = locus; - this.window = window; - this.basesProvider = basesProvider; - } - - /** - * Utility function to load bases from the provider to the cache, if necessary - */ - @Ensures({ - "basesCache != null", - "old(basesCache) == null || old(basesCache) == basesCache"}) - private void fetchBasesFromProvider() { - if ( basesCache == null ) { - basesCache = basesProvider.getBases(); - - // must be an assertion that only runs when the bases are fetch to run in a reasonable amount of time - assert 
BaseUtils.isUpperCase(basesCache); - } - } - - /** - * @return The genome loc parser associated with this reference context - */ - @Ensures("result != null") - public GenomeLocParser getGenomeLocParser() { - return genomeLocParser; - } - - /** - * The locus currently being examined. - * @return The current locus. - */ - @Ensures("result != null") - public GenomeLoc getLocus() { - return locus; - } - - @Ensures("result != null") - public GenomeLoc getWindow() { - return window; - } - - /** - * Get the base at the given locus. - * @return The base at the given locus from the reference. - */ - public byte getBase() { - return getBases()[(locus.getStart() - window.getStart())]; - } - - /** - * All the bases in the window currently being examined. - * @return All bases available. If the window is of size [0,0], the array will - * contain only the base at the given locus. - */ - @Ensures({"result != null", "result.length > 0"}) - public byte[] getBases() { - fetchBasesFromProvider(); - return basesCache; - } - - /** - * All the bases in the window from the current base forward to the end of the window. - */ - @Ensures({"result != null", "result.length > 0"}) - public byte[] getForwardBases() { - final byte[] bases = getBases(); - final int mid = locus.getStart() - window.getStart(); - // todo -- warning of performance problem, especially if this is called over and over - return new String(bases).substring(mid).getBytes(); - } - - @Deprecated - public char getBaseAsChar() { - return (char)getBase(); - } - - /** - * Get the base at the given locus. - * @return The base at the given locus from the reference. 
- */ - @Deprecated() - public int getBaseIndex() { - return BaseUtils.simpleBaseToBaseIndex(getBase()); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java deleted file mode 100644 index 56ecce2ef..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusView.java +++ /dev/null @@ -1,169 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.iterators.GenomeLocusIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.Collections; -import java.util.List; -import java.util.NoSuchElementException; -/** - * User: hanna - * Date: May 13, 2009 - * Time: 3:32:30 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A LocusView over which the user can iterate. - */ - -public class AllLocusView extends LocusView { - private GenomeLocusIterator locusIterator; - - /** - * Gets the next position in the view: next call to next() will jump there. - * Note that both nextPosition and nextLocus are PRE-read and cached. - */ - private GenomeLoc nextPosition = null; - - /** - * What's the next available context? - */ - private AlignmentContext nextLocus = null; - - /** - * Signal not to advance the iterator because we're currently sitting at the next element. - */ - private boolean atNextElement = false; - - /** - * Create a new queue of locus contexts. - * - * @param provider - */ - public AllLocusView(LocusShardDataProvider provider) { - super(provider); - // Seed the state tracking members with the first possible seek position and the first possible locus context. 
- locusIterator = new GenomeLocusIterator(genomeLocParser, provider.getLocus()); - } - - public boolean hasNext() { - advance(); - return nextPosition != null; - } - - public AlignmentContext next() { - advance(); - - if (nextPosition == null) - throw new NoSuchElementException("No next is available in the all locus view"); - - // Flag to the iterator that no data is waiting in the queue to be processed. - atNextElement = false; - - AlignmentContext currentLocus; - - // If actual data is present, return it. Otherwise, return empty data. - if (nextLocus != null && nextLocus.getLocation().equals(nextPosition)) - currentLocus = nextLocus; - else - currentLocus = createEmptyLocus(nextPosition); - - return currentLocus; - } - - private void advance() { - // Already at the next element? Don't move forward. - if (atNextElement) - return; - - // Out of elements? - if (nextPosition == null && !locusIterator.hasNext()) - return; - - // If nextLocus has been consumed, clear it out to make room for the next incoming locus. - if (nextPosition != null && nextLocus != null && !nextLocus.getLocation().isPast(nextPosition)) { - nextLocus = null; - - // Determine the next locus. The trick is that we may have more than one alignment context at the same - // reference position (regular base pileup, then extended pileup). If next alignment context (that we just pre-read) - // is still at the current position, we do not increment current position and wait for next call to next() to return - // that context. If we know that next context is past the current position, we are done with current - // position - if (hasNextLocus()) { - nextLocus = nextLocus(); - if (nextPosition.equals(nextLocus.getLocation())) { - atNextElement = true; - return; - } - } - } - - // No elements left in queue? Clear out the position state tracker and return. - if (!locusIterator.hasNext()) { - nextPosition = null; - return; - } - - // Actually fill the next position. 
- nextPosition = locusIterator.next(); - atNextElement = true; - - // Crank the iterator to (if possible) or past the next context. Be careful not to hold a reference to nextLocus - // while using the hasNextLocus() / nextLocus() machinery; this will cause us to use more memory than is optimal. - while (nextLocus == null || nextLocus.getLocation().isBefore(nextPosition)) { - nextLocus = null; - if (!hasNextLocus()) - break; - nextLocus = nextLocus(); - } - } - - /** - * Creates a blank locus context at the specified location. - * - * @param site Site at which to create the blank locus context. - * @return empty context. - */ - private final static List EMPTY_PILEUP_READS = Collections.emptyList(); - private final static List EMPTY_PILEUP_OFFSETS = Collections.emptyList(); - private final static List EMPTY_DELETION_STATUS = Collections.emptyList(); - - private AlignmentContext createEmptyLocus(GenomeLoc site) { - return new AlignmentContext(site, new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS)); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java deleted file mode 100644 index 900612a49..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusView.java +++ /dev/null @@ -1,63 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* 
The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -/** - * User: hanna - * Date: May 12, 2009 - * Time: 11:24:42 AM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A queue of locus contexts. Provides unidirectional seek. Stripped down - * implementation of java.util.Queue interface. - */ - -public class CoveredLocusView extends LocusView { - /** - * Create a new queue of locus contexts. 
- * @param provider - */ - public CoveredLocusView(LocusShardDataProvider provider) { - super(provider); - } - - public boolean hasNext() { - return hasNextLocus(); - } - - public AlignmentContext next() { - return nextLocus(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java deleted file mode 100644 index 9100905f3..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalOverlappingRODsFromStream.java +++ /dev/null @@ -1,168 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.samtools.util.PeekableIterator; -import org.broadinstitute.gatk.engine.refdata.RODRecordListImpl; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; - -import java.util.Collection; -import java.util.LinkedList; -import java.util.ListIterator; - -/** - * Key algorithmic helper for ReadBasedReferenceOrderedData - * - * Takes a single iterator of features, and provides a single capability that returns - * the list of RODs that overlap an interval. Allows sequential getOverlapping calls - * from intervals provided that these intervals always have increasing getStart() values. - * - */ -class IntervalOverlappingRODsFromStream { - /** - * Only held for QC purposes - */ - GenomeLoc lastQuery = null; - - private final String name; - private final LinkedList currentFeatures = new LinkedList(); - private final PeekableIterator futureFeatures; - - /** - * Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and - * returns RODRecordLists having name - * - * @param name - * @param futureFeatures - */ - IntervalOverlappingRODsFromStream(final String name, final PeekableIterator futureFeatures) { - if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null"); - - this.name = name; - this.futureFeatures = futureFeatures; - } - - /** - * Get the list of RODs overlapping loc from this stream of RODs. - * - * @param loc the interval to query - * @return a non-null RODRecordList containing the overlapping RODs, which may be empty - */ - @Ensures({"overlaps(loc, result)", - "! 
futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)", - "result != null"}) - public RODRecordList getOverlapping(final GenomeLoc loc) { - if ( lastQuery != null && loc.getStart() < lastQuery.getStart() ) - throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery)); - - readOverlappingFutureFeatures(loc); - return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc); - } - - - /** - * For contract assurance. Checks that all bindings in loc overlap - * - * @param loc - * @param bindings - * @return - */ - @Requires({"loc != null", "bindings != null"}) - private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) { - for ( final GATKFeature feature : bindings ) - if ( ! feature.getLocation().overlapsP(loc) ) - return false; - return true; - } - - /** - * Subset the features in all to those that overlap with loc - * - * The current features list contains everything read that cannot be thrown away yet, but not - * everything in there necessarily overlaps with loc. Subset to just those that do overlap - * - * @param loc the location that features must overlap - * @param all the list of all features - * @return a subset of all that overlaps with loc - */ - @Requires({"loc != null", "all != null"}) - @Ensures("result.size() <= all.size()") - private Collection subsetToOverlapping(final GenomeLoc loc, final Collection all) { - final LinkedList overlapping = new LinkedList(); - for ( final GATKFeature feature : all ) - if ( feature.getLocation().overlapsP(loc) ) - overlapping.add(feature); - return overlapping; - } - - /** - * Update function. Remove all elements of currentFeatures that end before loc - * - * Must be called by clients periodically when they know they they will never ask for data before - * loc, so that the running cache of RODs doesn't grow out of control. 
- * - * @param loc the location to use - */ - @Requires("loc != null") - @Ensures("currentFeatures.size() <= old(currentFeatures.size())") - public void trimCurrentFeaturesToLoc(final GenomeLoc loc) { - final ListIterator it = currentFeatures.listIterator(); - while ( it.hasNext() ) { - final GATKFeature feature = it.next(); - if ( feature.getLocation().isBefore(loc) ) - it.remove(); - } - } - - /** - * Update function: Read all elements from futureFeatures that overlap with loc - * - * Stops at the first element that starts before the end of loc, or the stream empties - * - * @param loc - */ - @Requires("loc != null") - @Ensures("currentFeatures.size() >= old(currentFeatures.size())") - private void readOverlappingFutureFeatures(final GenomeLoc loc) { - while ( futureFeatures.hasNext() ) { - final GenomeLoc nextLoc = futureFeatures.peek().getLocation(); - if ( nextLoc.isBefore(loc) ) { - futureFeatures.next(); // next rod element is before loc, throw it away and keep looking - } else if ( nextLoc.isPast(loc) ) { - break; // next element is past loc, stop looking but don't pop it - } else if ( nextLoc.overlapsP(loc) ) { - // add overlapping elements to our current features, removing from stream - for ( final GATKFeature feature : futureFeatures.next() ) { - currentFeatures.add(feature); - } - } - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java deleted file mode 100644 index 23f4f73e8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedView.java +++ /dev/null @@ -1,184 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files 
(the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.util.PeekableIterator; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.reads.ReadShard; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** - * a ROD view that allows for requests for RODs that overlap intervals on the genome to produce a RefMetaDataTracker - */ -public class IntervalReferenceOrderedView implements ReferenceOrderedView { - /** a list of the RMDDataState (location->iterators) */ - private final List states = new 
ArrayList<>(1); - - /** - * Used to get genome locs for reads - */ - protected final GenomeLocParser genomeLocParser; - - /** - * The total extent of all reads in this span. We create iterators from our RODs - * from the start of this span, to the end. - */ - private final GenomeLoc shardSpan; - - /** - * Create a new IntervalReferenceOrderedView taking data from provider and capable of - * servicing ROD overlap requests within the genomic interval span - * - * @param provider a ShardDataProvider to give us data - * @param span a GenomeLoc span, or null indicating take the entire genome - */ - public IntervalReferenceOrderedView(final ShardDataProvider provider, final GenomeLoc span) { - if ( provider == null ) throw new IllegalArgumentException("provider cannot be null"); - if ( provider.hasReferenceOrderedData() && span == null ) throw new IllegalArgumentException("span cannot be null when provider has reference ordered data"); - - this.genomeLocParser = provider.getGenomeLocParser(); - this.shardSpan = span; - provider.register(this); - - // conditional to optimize the case where we don't have any ROD data - if ( provider.hasReferenceOrderedData() && ! 
shardSpan.isUnmapped() ) { - for (final ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData()) - states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan))); - } - } - - /** - * Testing constructor - */ - protected IntervalReferenceOrderedView(final GenomeLocParser genomeLocParser, - final GenomeLoc shardSpan, - final List names, - final List> featureSources) { - this.genomeLocParser = genomeLocParser; - this.shardSpan = shardSpan; - for ( int i = 0; i < names.size(); i++ ) - states.add(new RMDDataState(names.get(i), featureSources.get(i))); - } - - public Collection> getConflictingViews() { - List> classes = new ArrayList<>(); - classes.add(ManagingReferenceOrderedView.class); - return classes; - } - - /** - * Get a RefMetaDataTracker containing bindings for all RODs overlapping the start position of loc - * @param loc a GenomeLoc of size == 1 - * @return a non-null RefMetaDataTracker - */ - @Override - public RefMetaDataTracker getReferenceOrderedDataAtLocus(GenomeLoc loc) { - if ( loc == null ) throw new IllegalArgumentException("loc cannot be null"); - if ( loc.size() != 1 ) throw new IllegalArgumentException("GenomeLoc must have size == 1 but got " + loc); - return getReferenceOrderedDataForInterval(loc); - } - - /** - * Get a RefMetaDataTracker containing bindings for all RODs overlapping interval - * - * @param interval a non=null interval - * @return a non-null RefMetaDataTracker - */ - public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) { - if ( interval == null ) throw new IllegalArgumentException("Interval cannot be null"); - - if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers) - return RefMetaDataTracker.EMPTY_TRACKER; - else { - final List bindings = new ArrayList<>(states.size()); - for ( final RMDDataState state : states ) - bindings.add(state.stream.getOverlapping(interval)); - return new RefMetaDataTracker(bindings); - } - } - - 
/** - * Trim down all of the ROD managers so that they only hold ROD bindings wit start >= startOfDataToKeep.getStart() - * - * @param startOfDataToKeep a non-null genome loc - */ - public void trimCurrentFeaturesToLoc(final GenomeLoc startOfDataToKeep) { - if ( startOfDataToKeep == null ) throw new IllegalArgumentException("startOfDataToKeep cannot be null"); - - for ( final RMDDataState state : states ) - state.stream.trimCurrentFeaturesToLoc(startOfDataToKeep); - } - - /** - * Closes the current view. - */ - public void close() { - for (final RMDDataState state : states) - state.close(); - - // Clear out the existing data so that post-close() accesses to this data will fail-fast. - states.clear(); - } - - /** - * Models the traversal state of a given ROD lane. - */ - private static class RMDDataState { - public final ReferenceOrderedDataSource dataSource; - public final IntervalOverlappingRODsFromStream stream; - private final LocationAwareSeekableRODIterator iterator; - - public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) { - this.dataSource = dataSource; - this.iterator = iterator; - this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator<>(iterator)); - } - - /** - * For testing - */ - public RMDDataState(final String name, final PeekableIterator iterator) { - this.dataSource = null; - this.iterator = null; - this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator<>(iterator)); - } - - public void close() { - if ( dataSource != null ) - dataSource.close( iterator ); - } - } -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java deleted file mode 100644 index b53505097..000000000 --- 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceView.java +++ /dev/null @@ -1,236 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.reference.ReferenceSequence; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.walkers.Reference; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.engine.walkers.Window; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Provides access to the portion of the reference covering a single locus. - */ -public class LocusReferenceView extends ReferenceView { - /** - * Bound the reference view to make sure all accesses are within the shard. 
- */ - private GenomeLoc bounds; - - /** - * Start of the expanded window for which the reference context should be provided, - * relative to the locus in question. - */ - private final int windowStart; - - - /** - * Start of the expanded window for which the reference context should be provided, - * relative to the locus in question. - */ - private final int windowStop; - - /** - * Track the reference sequence and the last point accessed. Used to - * track state when traversing over the reference. - */ - private ReferenceSequence referenceSequence; - - /** - * Create a LocusReferenceView given no other contextual information about - * the walkers, etc. - * @param provider source for locus data. - */ - public LocusReferenceView( LocusShardDataProvider provider ) { - super(provider); - initializeBounds(provider); - windowStart = windowStop = 0; - initializeReferenceSequence(bounds); - } - - /** - * Create a new locus reference view. - * @param provider source for locus data. - */ - public LocusReferenceView( Walker walker, LocusShardDataProvider provider ) { - super( provider ); - initializeBounds(provider); - - // Retrieve information about the window being accessed. 
- if( walker.getClass().isAnnotationPresent(Reference.class) ) { - Window window = walker.getClass().getAnnotation(Reference.class).window(); - - if( window.start() > 0 ) throw new ReviewedGATKException( "Reference window starts after current locus" ); - if( window.stop() < 0 ) throw new ReviewedGATKException( "Reference window ends before current locus" ); - - windowStart = window.start(); - windowStop = window.stop(); - } - else { - windowStart = 0; - windowStop = 0; - } - - if(bounds != null) { - int expandedStart = getWindowStart( bounds ); - int expandedStop = getWindowStop( bounds ); - initializeReferenceSequence(genomeLocParser.createGenomeLoc(bounds.getContig(), bounds.getContigIndex(), expandedStart, expandedStop)); - } - } - - /** - * Initialize the bounds of this shard, trimming the bounds so that they match the reference. - * @param provider Provider covering the appropriate locus. - */ - private void initializeBounds(LocusShardDataProvider provider) { - if(provider.getLocus() != null) { - int sequenceLength = reference.getSequenceDictionary().getSequence(provider.getLocus().getContig()).getSequenceLength(); - bounds = genomeLocParser.createGenomeLoc(provider.getLocus().getContig(), - Math.max(provider.getLocus().getStart(),1), - Math.min(provider.getLocus().getStop(),sequenceLength)); - } - else - bounds = null; - } - - /** - * Initialize reference sequence data using the given locus. 
- * @param locus - */ - private void initializeReferenceSequence( GenomeLoc locus ) { - this.referenceSequence = reference.getSubsequenceAt( locus.getContig(), locus.getStart(), locus.getStop() ); - } - - protected GenomeLoc trimToBounds(GenomeLoc l) { - int expandedStart = getWindowStart( bounds ); - int expandedStop = getWindowStop( bounds ); - if ( l.getStart() < expandedStart ) l = genomeLocParser.setStart(l, expandedStart); - if ( l.getStop() > expandedStop ) l = genomeLocParser.setStop(l, expandedStop); - return l; - } - - public class Provider implements ReferenceContext.ReferenceContextRefProvider { - int refStart, len; - - public Provider( int refStart, int len ) { - this.refStart = refStart; - this.len = len; - } - - public byte[] getBases() { - //System.out.printf("Getting bases for location%n"); - byte[] bases = new byte[len]; - System.arraycopy(referenceSequence.getBases(), refStart, bases, 0, len); - return bases; - } - } - - /** - * Gets the reference context associated with this particular point or extended interval on the genome. - * @param genomeLoc Region for which to retrieve the base(s). If region spans beyond contig end or beyond current bounds, it will be trimmed down. - * @return The base at the position represented by this genomeLoc. 
- */ - public ReferenceContext getReferenceContext( GenomeLoc genomeLoc ) { - //validateLocation( genomeLoc ); - - GenomeLoc window = genomeLocParser.createGenomeLoc( genomeLoc.getContig(), genomeLoc.getContigIndex(), - getWindowStart(genomeLoc), getWindowStop(genomeLoc) ); - - int refStart = -1; - if (bounds != null) { - window = trimToBounds(window); - refStart = (int)(window.getStart() - getWindowStart(bounds)); - } - else { - if(referenceSequence == null || referenceSequence.getContigIndex() != genomeLoc.getContigIndex()) - referenceSequence = reference.getSequence(genomeLoc.getContig()); - refStart = (int)window.getStart()-1; - } - - int len = (int)window.size(); - return new ReferenceContext( genomeLocParser, genomeLoc, window, new Provider(refStart, len)); - } - - /** - * Allow the user to pull reference info from any arbitrary region of the reference. - * @param genomeLoc The locus. - * @return A list of the bases starting at the start of the locus (inclusive) and ending - * at the end of the locus (inclusive). - */ - public byte[] getReferenceBases( GenomeLoc genomeLoc ) { - return super.getReferenceBases(genomeLoc); - } - - /** - * Gets the start of the expanded window, bounded if necessary by the contig. - * @param locus The locus to expand. - * @return The expanded window. - */ - private int getWindowStart( GenomeLoc locus ) { - // If the locus is not within the bounds of the contig it allegedly maps to, expand only as much as we can. - if(locus.getStart() < 1) return 1; -// if(locus.getStart() < 1) return locus.getStart(); - return Math.max( locus.getStart() + windowStart, 1 ); - } - - /** - * Gets the stop of the expanded window, bounded if necessary by the contig. - * @param locus The locus to expand. - * @return The expanded window. - */ - private int getWindowStop( GenomeLoc locus ) { - // If the locus is not within the bounds of the contig it allegedly maps to, expand only as much as we can. 
- int sequenceLength = reference.getSequenceDictionary().getSequence(locus.getContig()).getSequenceLength(); - if(locus.getStop() > sequenceLength) return sequenceLength; - return Math.min( locus.getStop() + windowStop, sequenceLength ); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java deleted file mode 100644 index 9bc37e549..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/LocusView.java +++ /dev/null @@ -1,220 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.utils.locusiterator.LocusIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; - -import java.util.Arrays; -import java.util.Collection; -import java.util.NoSuchElementException; - -/** - * User: hanna - * Date: May 13, 2009 - * Time: 3:30:16 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * The two goals of the LocusView are as follows: - * 1) To provide a 'trigger track' iteration interface so that TraverseLoci can easily switch - * between iterating over all bases in a region, only covered bases in a region covered by - * reads, only bases in a region covered by RODs, or any other sort of trigger track - * implementation one can think of. - * 2) To manage the copious number of iterators that have to be jointly pulled through the - * genome to make a locus traversal function. - */ -public abstract class LocusView extends LocusIterator implements View { - /** - * The locus bounding this view. - */ - protected GenomeLoc locus; - - /** - * The GenomeLocParser, used to create new genome locs. - */ - protected GenomeLocParser genomeLocParser; - - /** - * Source info for this view. Informs the class about downsampling requirements. - */ - private ReadProperties sourceInfo; - - /** - * The actual locus context iterator. 
- */ - private LocusIterator loci; - - /** - * The next locus context from the iterator. Lazy loaded: if nextLocus is null and advance() doesn't - * populate it, the iterator is exhausted. If populated, this is the value that should be returned by - * next(). - */ - private AlignmentContext nextLocus = null; - - public LocusView(LocusShardDataProvider provider) { - this.locus = provider.getLocus(); - - this.sourceInfo = provider.getSourceInfo(); - this.genomeLocParser = provider.getGenomeLocParser(); - this.loci = provider.getLocusIterator(); - - advance(); - - provider.register(this); - } - - /** - * Only one view of the locus is supported at any given time. - * @return A list consisting of all other locus views. - */ - public Collection> getConflictingViews() { - return Arrays.>asList(LocusView.class,ReadView.class); - } - - /** - * Close this view. - */ - public void close() { - // Set everything to null with the hope of failing fast. - locus = null; - sourceInfo = null; - loci = null; - - super.close(); - } - - /** - * Is there another covered locus context bounded by this view. - * @return True if another covered locus context exists. False otherwise. - */ - public abstract boolean hasNext(); - - /** - * Returns the next covered locus context in the shard. - * @return Next covered locus context in the shard. - * @throw NoSuchElementException if no such element exists. - */ - public abstract AlignmentContext next(); - - /** - * Unsupported. - * @throw UnsupportedOperationException always. - */ - public void remove() { - throw new UnsupportedOperationException("Unable to remove elements from this queue."); - } - - /** - * Is there another locus context bounded by this shard. - * @return True if another locus context is bounded by this shard. - */ - protected boolean hasNextLocus() { - advance(); - return nextLocus != null; - } - - /** - * Get the next locus context bounded by this shard. - * @return Next locus context bounded by this shard. 
- * @throw NoSuchElementException if the next element is missing. - */ - protected AlignmentContext nextLocus() { - advance(); - if(nextLocus == null) - throw new NoSuchElementException("No more elements remain in locus context queue."); - - // Cache the current and apply filtering. - AlignmentContext current = nextLocus; - - // Indicate that the next operation will need to advance. - nextLocus = null; - - return current; - } - - /** - * Seed the nextLocus variable with the contents of the next locus (if one exists). - */ - private void advance() { - // Already an unclaimed locus present - if(nextLocus != null) - return; - - //System.out.printf("loci is %s%n", loci); - if( !loci.hasNext() ) { - nextLocus = null; - return; - } - - nextLocus = loci.next(); - - // If the location of this shard is available, trim the data stream to match the shard. - // TODO: Much of this functionality is being replaced by the WindowMaker. - if(locus != null) { - // Iterate through any elements not contained within this shard. - while( nextLocus != null && !isContainedInShard(nextLocus.getLocation()) && loci.hasNext() ) - nextLocus = loci.next(); - - // If nothing in the shard was found, indicate that by setting nextLocus to null. - if( nextLocus != null && !isContainedInShard(nextLocus.getLocation()) ) - nextLocus = null; - } - } - - /** - * Is this location contained in the given shard. - * @param location Location to check. - * @return True if the given location is contained within the shard. False otherwise. 
- */ - private boolean isContainedInShard(GenomeLoc location) { - return locus.containsP(location); - } - - /** - * {@inheritDoc} - * - * Since this class has an actual LIBS, so this function will never throw an exception - * - * @return the LocusIteratorByState used by this view to get pileups - */ - @Override - public LocusIteratorByState getLIBS() { - return loci.getLIBS(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java deleted file mode 100644 index 2dd42c1cc..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ManagingReferenceOrderedView.java +++ /dev/null @@ -1,117 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -/** - * User: hanna - * Date: May 21, 2009 - * Time: 2:49:17 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A view into the reference-ordered data in the provider. - */ -public class ManagingReferenceOrderedView implements ReferenceOrderedView { - /** - * The data sources along with their current states. - */ - private List states = new ArrayList(); - - /** - * Create a new view of reference-ordered data. - * @param provider - */ - public ManagingReferenceOrderedView( LocusShardDataProvider provider ) { - for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) - states.add(new ReferenceOrderedDataState(dataSource, dataSource.seek(provider.getLocus()))); - - provider.register(this); - } - - public Collection> getConflictingViews() { return Collections.emptyList(); } - - /** - * Gets an object which can track the reference-ordered data at every locus. - * @param loc Locus at which to track. - * @return A tracker containing information about this locus. 
- */ - @Override - public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { - if ( states.isEmpty() ) - return RefMetaDataTracker.EMPTY_TRACKER; - else { - List bindings = new ArrayList(states.size()); - - for ( ReferenceOrderedDataState state: states ) - // todo -- warning, I removed the reference to the name from states - bindings.add( state.iterator.seekForward(loc) ); - - return new RefMetaDataTracker(bindings); - } - } - - /** - * Closes the current view. - */ - public void close() { - for( ReferenceOrderedDataState state: states ) - state.dataSource.close( state.iterator ); - - // Clear out the existing data so that post-close() accesses to this data will fail-fast. - states = null; - } -} - -/** - * Models the traversal state of a given ROD lane. - */ -class ReferenceOrderedDataState { - public final ReferenceOrderedDataSource dataSource; - public final LocationAwareSeekableRODIterator iterator; - - public ReferenceOrderedDataState( ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator ) { - this.dataSource = dataSource; - this.iterator = iterator; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java deleted file mode 100644 index f244e504d..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RODMetaDataContainer.java +++ /dev/null @@ -1,83 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the 
Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.utils.collections.Pair; - -import java.util.*; - - -/** - * - * @author aaron - * - * Class RODMetaDataContainer - * - * stores both the name and the class for each ROD. This class assumes that: - * - * -Names must be unique - * -Classes are allowed to have duplicates - * - * This class encapsulates the ref data associations, and provides lookup by name and by - * class type. 
- * - */ -public class RODMetaDataContainer { - // we only allow non-duplicate ROD names, a HashMap is fine - private final HashMap nameMap = new HashMap(); - - // we do allow duplicate class entries, so we need to store pairs of data - private final List> classMap = new ArrayList>(); - - public void addEntry(GATKFeature data) { - nameMap.put(data.getName(),data); - classMap.add(new Pair(data.getClass(),data)); - } - - public Collection getSet(String name) { - if (name == null) return getSet(); - Set set = new HashSet(); - if (nameMap.containsKey(name)) set.add(nameMap.get(name)); - return set; - } - - /** - * get the feature contents of this container; the unfiltered set without their name association - * @return - */ - public Collection getSet() { - return new ArrayList(nameMap.values()); - } - - // the brute force (n) search ended up being faster than sorting and binary search in all but the most extreme cases (thousands of RODs at a location). - public Collection getSet(Class cls) { - Collection ret = new ArrayList(); - for (Pair pair: classMap) - if (pair.first.equals(cls)) ret.add(pair.second); - return ret; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java deleted file mode 100644 index 1d73501bd..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadBasedReferenceOrderedView.java +++ /dev/null @@ -1,69 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* 
copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.datasources.reads.ReadShard; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** a ROD view for reads. This provides the Read traversals a way of getting a RefMetaDataTracker */ -public class ReadBasedReferenceOrderedView extends IntervalReferenceOrderedView { - public ReadBasedReferenceOrderedView(final ShardDataProvider provider) { - super(provider, provider.hasReferenceOrderedData() ? 
((ReadShard)provider.getShard()).getReadsSpan() : null); - } - - /** - * create a RefMetaDataTracker given the current read - * - * @param rec the read - * - * @return a RefMetaDataTracker for the read, from which you can get ROD -> read alignments - */ - @Requires("rec != null") - @Ensures("result != null") - public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) { - if ( rec.getReadUnmappedFlag() ) - return RefMetaDataTracker.EMPTY_TRACKER; - else { - final GenomeLoc readSpan = genomeLocParser.createGenomeLoc(rec); - trimCurrentFeaturesToLoc(readSpan); - return getReferenceOrderedDataForInterval(readSpan); - } - } -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java deleted file mode 100644 index 14d5827a3..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadReferenceView.java +++ /dev/null @@ -1,102 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.GenomeLoc; -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * User: hanna - * Date: May 22, 2009 - * Time: 12:36:14 PM - * - */ - -/** Provides access to the reference over a single read. */ - -public class ReadReferenceView extends ReferenceView { - /** - * Create a view of the reference with respect to a single read. 
- * - * @param provider - */ - public ReadReferenceView( ShardDataProvider provider ) { - super(provider); - } - - protected ReferenceContext.ReferenceContextRefProvider getReferenceBasesProvider( GenomeLoc genomeLoc ) { - return new Provider(genomeLoc); - } - - public class Provider implements ReferenceContext.ReferenceContextRefProvider { - GenomeLoc loc; - - public Provider( GenomeLoc loc ) { - this.loc = loc; - } - - public byte[] getBases() { - return getReferenceBases(loc); - } - } - - /** - * Return a reference context appropriate for the span of read - * - * @param read the mapped read to test - * @return - */ - public ReferenceContext getReferenceContext( final SAMRecord read ) { - GenomeLoc loc = genomeLocParser.createGenomeLoc(read); - return new ReferenceContext( genomeLocParser, loc, loc, getReferenceBasesProvider(loc) ); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java deleted file mode 100644 index 8acfad0b1..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadShardDataProvider.java +++ /dev/null @@ -1,82 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -import java.util.Collection; - -/** - * Present data sharded by read to a traversal engine. - * - * @author mhanna - * @version 0.1 - */ -public class ReadShardDataProvider extends ShardDataProvider { - /** - * The raw collection of reads. - */ - private final GATKSAMIterator reads; - - /** - * Create a data provider for the shard given the reads and reference. - * @param shard The chunk of data over which traversals happen. - * @param reference A getter for a section of the reference. - */ - public ReadShardDataProvider(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator reads, IndexedFastaSequenceFile reference, Collection rods) { - super(shard,genomeLocParser,reference,rods); - this.reads = reads; - } - - /** - * Can this data source provide reads? - * @return True if reads are available, false otherwise. - */ - public boolean hasReads() { - return reads != null; - } - - /** - * Gets an iterator over all the reads bound by this shard. - * @return An iterator over all reads in this shard. 
- */ - public GATKSAMIterator getReadIterator() { - return reads; - } - - @Override - public void close() { - super.close(); - - if(reads != null) - reads.close(); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java deleted file mode 100644 index 160dbd585..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReadView.java +++ /dev/null @@ -1,88 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; - -import java.util.Arrays; -import java.util.Collection; -/** - * User: hanna - * Date: May 22, 2009 - * Time: 12:06:54 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A view into the reads that a provider can provide. - */ -public class ReadView implements View, Iterable { - /** - * The iterator into the reads supplied by this provider. - */ - private GATKSAMIterator reads; - - /** - * Create a new view of the reads given the current data set. - * @param provider Source for the data. - */ - public ReadView( ReadShardDataProvider provider ) { - reads = provider.getReadIterator(); - } - - /** - * Other reads and loci conflict with this view. - * @return Array of reads and loci. - */ - public Collection> getConflictingViews() { - return Arrays.>asList(ReadView.class, LocusView.class); - } - - /** - * Close the view over these reads. Note that this method closes just - * the view into the reads, not the reads themselves. - */ - public void close() { - // Don't close the reads. The provider is responsible for this. - // Just dispose of the pointer. - reads = null; - } - - /** - * Gets an iterator into the reads supplied by this provider. - * @return Iterator into the reads that this provider covers. 
- */ - public GATKSAMIterator iterator() { - return reads; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java deleted file mode 100644 index 9f3db5143..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedView.java +++ /dev/null @@ -1,33 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.GenomeLoc; - -public interface ReferenceOrderedView extends View { - RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java deleted file mode 100644 index 21cb3efa6..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/providers/RodLocusView.java +++ /dev/null @@ -1,197 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.collections.RODMergingIterator; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; - -import java.util.*; - -/** - * A view into the reference-ordered data in the provider. - */ -public class RodLocusView extends LocusView implements ReferenceOrderedView { - /** - * The data sources along with their current states. - */ - private RODMergingIterator rodQueue = null; - - Collection allTracksHere; - - GenomeLoc lastLoc = null; - RODRecordList interval = null; - - /** - * The data sources along with their current states. - */ - private List states = new ArrayList(); - - /** - * Enable debugging output -- todo remove me - */ - final static boolean DEBUG = false; - - final static String INTERVAL_ROD_NAME = "interval"; - - /** - * Create a new view of reference-ordered data. 
- * - * @param provider - */ - public RodLocusView( LocusShardDataProvider provider ) { - super(provider); - - GenomeLoc loc = provider.getLocus(); - - List< Iterator > iterators = new LinkedList< Iterator >(); - for( ReferenceOrderedDataSource dataSource: provider.getReferenceOrderedData() ) { - if ( DEBUG ) System.out.printf("Shard is %s%n", provider.getLocus()); - - // grab the ROD iterator from the data source, and compute the first location in this shard, forwarding - // the iterator to immediately before it, so that it can be added to the merging iterator primed for - // next() to return the first real ROD in this shard - LocationAwareSeekableRODIterator it = dataSource.seek(provider.getLocus()); - it.seekForward(genomeLocParser.createGenomeLoc(loc.getContig(), loc.getStart()-1)); - - states.add(new ReferenceOrderedDataState(dataSource,it)); - - // we need to special case the interval so we don't always think there's a rod at the first location - if ( dataSource.getName().equals(INTERVAL_ROD_NAME) ) { - if ( interval != null ) - throw new RuntimeException("BUG: interval local variable already assigned " + interval); - interval = it.next(); - } else { - iterators.add( it ); - } - } - - rodQueue = new RODMergingIterator(iterators); - } - - @Override - public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc ) { - // special case the interval again -- add it into the ROD - if ( interval != null ) { allTracksHere.add(interval); } - return new RefMetaDataTracker(allTracksHere); - } - - public boolean hasNext() { - if ( ! rodQueue.hasNext() ) - return false; - else { - return ! rodQueue.peekLocation().isPast(locus); - } - } - - /** - * Returns the next covered locus context in the shard. - * @return Next covered locus context in the shard. - * @throw NoSuchElementException if no such element exists. 
- */ - public AlignmentContext next() { - if ( DEBUG ) System.out.printf("In RodLocusView.next()...%n"); - RODRecordList datum = rodQueue.next(); - if ( DEBUG ) System.out.printf("In RodLocusView.next(); datum = %s...%n", datum.getLocation()); - - if ( DEBUG ) System.out.printf("In RodLocusView.next(): creating tracker...%n"); - - allTracksHere = getSpanningTracks(datum); - GenomeLoc rodSite = datum.getLocation(); - GenomeLoc site = genomeLocParser.createGenomeLoc( rodSite.getContig(), rodSite.getStart(), rodSite.getStart()); - - if ( DEBUG ) System.out.printf("rodLocusView.next() is at %s%n", site); - - // calculate the number of skipped bases, and update lastLoc so we can do that again in the next() - long skippedBases = getSkippedBases( rodSite ); - lastLoc = site; - return new AlignmentContext(site, new ReadBackedPileupImpl(site), skippedBases); - } - - private Collection getSpanningTracks(RODRecordList marker) { - return rodQueue.allElementsLTE(marker); - } - - /** - * Returns the number of reference bases that have been skipped: - * - * 1 -- since the last processed location if we have one - * 2 -- from the beginning of the shard if this is the first loc - * 3 -- from the last location to the current position - * - * @param currentPos - * @return - */ - private long getSkippedBases( GenomeLoc currentPos ) { - // the minus - is because if lastLoc == null, you haven't yet seen anything in this interval, so it should also be counted as skipped - Integer compStop = lastLoc == null ? locus.getStart() - 1 : lastLoc.getStop(); - long skippedBases = currentPos.getStart() - compStop - 1; - - if ( skippedBases < -1 ) { // minus 1 value is ok - throw new RuntimeException(String.format("BUG: skipped bases=%d is < 0: cur=%s vs. 
last=%s, shard=%s", - skippedBases, currentPos, lastLoc, locus)); - } - return Math.max(skippedBases, 0); - } - - /** - * Get the location one after the last position we will traverse through - * @return - */ - public GenomeLoc getLocOneBeyondShard() { - return genomeLocParser.createGenomeLoc(locus.getContig(),locus.getStop()+1); - } - - /** - * How many bases are we skipping from the current location to the end of the interval / shard - * if we have no more elements - * - * @return - */ - public long getLastSkippedBases() { - if ( hasNext() ) - throw new RuntimeException("BUG: getLastSkippedBases called when there are elements remaining."); - - return getSkippedBases(getLocOneBeyondShard()); - } - - /** - * Closes the current view. - */ - public void close() { - for( ReferenceOrderedDataState state: states ) - state.dataSource.close( state.iterator ); - - rodQueue = null; - allTracksHere = null; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java deleted file mode 100644 index 1e30d6c38..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMAccessPlan.java +++ /dev/null @@ -1,170 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of 
the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.GATKBAMFileSpan; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.util.BlockCompressedFilePointerUtil; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.LinkedList; -import java.util.List; - -/** -* Created by IntelliJ IDEA. -* User: mhanna -* Date: 10/14/11 -* Time: 10:47 PM -* To change this template use File | Settings | File Templates. -*/ -class BAMAccessPlan { - private final SAMReaderID reader; - private final BlockInputStream inputStream; - - private final List positions; - private PeekableIterator positionIterator; - - /** - * Stores the next block address to read, or -1 if no such block is available. - */ - private long nextBlockAddress; - - - BAMAccessPlan(final SAMReaderID reader, final BlockInputStream inputStream, GATKBAMFileSpan fileSpan) { - this.reader = reader; - this.inputStream = inputStream; - - this.positions = fileSpan.getGATKChunks(); - initialize(); - } - - public SAMReaderID getReader() { - return reader; - } - - public BlockInputStream getInputStream() { - return inputStream; - } - - /** - * Retrieves the next block address to be read. - * @return Next block address to be read. - */ - public long getBlockAddress() { - return nextBlockAddress; - } - - /** - * Retrieves the first offset of interest in the block returned by getBlockAddress(). 
- * @return First block of interest in this segment. - */ - public int getFirstOffsetInBlock() { - return (nextBlockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0; - } - - /** - * Gets the spans overlapping the given block; used to copy the contents of the block into the circular buffer. - * @param blockAddress Block address for which to search. - * @param filePosition Block address at which to terminate the last chunk if the last chunk goes beyond this span. - * @return list of chunks containing that block. - */ - public List getSpansOverlappingBlock(long blockAddress, long filePosition) { - List spansOverlapping = new LinkedList(); - // While the position iterator overlaps the given block, pull out spans to report. - while(positionIterator.hasNext() && positionIterator.peek().getBlockStart() <= blockAddress) { - // Create a span over as much of the block as is covered by this chunk. - int blockOffsetStart = (blockAddress == positionIterator.peek().getBlockStart()) ? positionIterator.peek().getBlockOffsetStart() : 0; - - // Calculate the end of this span. If the span extends past this block, cap it using the current file position. - long blockEnd; - int blockOffsetEnd; - if(blockAddress < positionIterator.peek().getBlockEnd()) { - blockEnd = filePosition; - blockOffsetEnd = 0; - } - else { - blockEnd = positionIterator.peek().getBlockEnd(); - blockOffsetEnd = positionIterator.peek().getBlockOffsetEnd(); - } - - GATKChunk newChunk = new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd); - - if(newChunk.getChunkStart() <= newChunk.getChunkEnd()) - spansOverlapping.add(new GATKChunk(blockAddress,blockOffsetStart,blockEnd,blockOffsetEnd)); - - // If the value currently stored in the position iterator ends past the current block, we must be done. Abort. 
- if(!positionIterator.hasNext() || positionIterator.peek().getBlockEnd() > blockAddress) - break; - - // If the position iterator ends before the block ends, pull the position iterator forward. - if(positionIterator.peek().getBlockEnd() <= blockAddress) - positionIterator.next(); - } - - return spansOverlapping; - } - - public void reset() { - initialize(); - } - - /** - * Resets the SAM reader position to its original state. - */ - private void initialize() { - this.positionIterator = new PeekableIterator(positions.iterator()); - if(positionIterator.hasNext()) - nextBlockAddress = positionIterator.peek().getBlockStart(); - else - nextBlockAddress = -1; - } - - /** - * Advances the current position to the next block to read, given the current position in the file. - * @param filePosition The current position within the file. - */ - void advancePosition(final long filePosition) { - nextBlockAddress = BlockCompressedFilePointerUtil.getBlockAddress(filePosition); - - // Check the current file position against the iterator; if the iterator is before the current file position, - // draw the iterator forward. Remember when performing the check that coordinates are half-open! - while(positionIterator.hasNext() && isFilePositionPastEndOfChunk(filePosition,positionIterator.peek())) - positionIterator.next(); - - // If the block iterator has shot past the file pointer, bring the file pointer flush with the start of the current block. - if(positionIterator.hasNext() && filePosition < positionIterator.peek().getChunkStart()) - nextBlockAddress = positionIterator.peek().getBlockStart(); - - // If we've shot off the end of the block pointer, notify consumers that iteration is complete. 
- if(!positionIterator.hasNext()) - nextBlockAddress = -1; - } - - private boolean isFilePositionPastEndOfChunk(final long filePosition, final GATKChunk chunk) { - return filePosition >= chunk.getChunkEnd(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java deleted file mode 100644 index a80b0a475..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMSchedule.java +++ /dev/null @@ -1,530 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.Bin; -import htsjdk.samtools.GATKBAMFileSpan; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.GATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; -import java.io.RandomAccessFile; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; -import java.util.*; - -/** - * Writes schedules for a single BAM file to a target output file. - */ -public class BAMSchedule implements CloseableIterator { - /** - * File in which to store schedule data. - */ - private File scheduleFile; - - /** - * File channel for the schedule file. - */ - private FileChannel scheduleFileChannel; - - /** - * The definitive, sorted list of reader IDs. Order is important here: the order - * in which the reader IDs are presented here maps to the order in which they appear in the file. - */ - private final List readerIDs = new ArrayList(); - - /** - * Iterators over the schedule. Stored in the same order as readerIDs, above. - */ - private final List> scheduleIterators = new ArrayList>(); - - /** - * Next schedule entry to be returned. Null if no additional entries are present. - */ - private BAMScheduleEntry nextScheduleEntry; - - /** - * Reference sequence for which to write the schedule. - */ - private final int referenceSequence; - - /** - * Sizes of ints and longs in bytes. - */ - private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; - private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; - - /** - * Create a new BAM schedule based on the given index. - * @param dataSource The SAM data source to use. 
- * @param intervals List of - */ - public BAMSchedule(final SAMDataSource dataSource, final List intervals) { - if(intervals.isEmpty()) - throw new ReviewedGATKException("Tried to write schedule for empty interval list."); - - referenceSequence = dataSource.getHeader().getSequence(intervals.get(0).getContig()).getSequenceIndex(); - - createScheduleFile(); - - readerIDs.addAll(dataSource.getReaderIDs()); - - for(final SAMReaderID reader: readerIDs) { - final GATKBAMIndex index = dataSource.getIndex(reader); - final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence); - - int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1); - Iterator locusIterator = intervals.iterator(); - GenomeLoc currentLocus = locusIterator.next(); - - final long readerStartOffset = position(); - - int maxChunkCount = 0; - - while(currentBinInLowestLevel < GATKBAMIndex.MAX_BINS && currentLocus != null) { - final Bin bin = new Bin(referenceSequence,currentBinInLowestLevel); - final int binStart = index.getFirstLocusInBin(bin); - final int binStop = index.getLastLocusInBin(bin); - - // In required, pull bin iterator ahead to the point of the next GenomeLoc. - if(binStop < currentLocus.getStart()) { - currentBinInLowestLevel++; - continue; - } - - // At this point, the bin stop is guaranteed to be >= the start of the locus. - // If the bins have gone past the current locus, update the current locus if at all possible. - if(binStart > currentLocus.getStop()) { - currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; - continue; - } - - // Code at this point knows that the current bin is neither before nor after the current locus, - // so it must overlap. Add this region to the filesystem. - final GATKBAMFileSpan fileSpan = indexData.getSpanOverlapping(bin); - - if(!fileSpan.isEmpty()) { - // File format is binary in little endian; start of region, end of region, num chunks, then the chunks themselves. 
- ByteBuffer buffer = allocateByteBuffer(2*INT_SIZE_IN_BYTES + INT_SIZE_IN_BYTES + fileSpan.getGATKChunks().size()*LONG_SIZE_IN_BYTES*2); - buffer.putInt(binStart); - buffer.putInt(binStop); - buffer.putInt(fileSpan.getGATKChunks().size()); - for(GATKChunk chunk: fileSpan.getGATKChunks()) { - buffer.putLong(chunk.getChunkStart()); - buffer.putLong(chunk.getChunkEnd()); - } - maxChunkCount = Math.max(maxChunkCount,fileSpan.getGATKChunks().size()); - - // Prepare buffer for writing - buffer.flip(); - - // And write. - write(buffer); - } - - currentBinInLowestLevel++; - } - - final long readerStopOffset = position(); - - scheduleIterators.add(new PeekableIterator(new BAMScheduleIterator(reader,readerStartOffset,readerStopOffset,maxChunkCount))); - - // Iterator initialization might move the file pointer. Make sure it gets reset back to where it was before iterator initialization. - position(readerStopOffset); - } - - advance(); - } - - /** - * Determine whether more ScheduleEntries are present in the iterator. - * @return Next schedule entry to parse. - */ - @Override - public boolean hasNext() { - return nextScheduleEntry != null; - } - - /** - * Retrieve the next schedule entry in the list. - * @return next schedule entry in the queue. - */ - @Override - public BAMScheduleEntry next() { - BAMScheduleEntry currentScheduleEntry = nextScheduleEntry; - advance(); - return currentScheduleEntry; - } - - /** - * Close down and delete the file. 
- */ - @Override - public void close() { - try { - scheduleFileChannel.close(); - } - catch(IOException ex) { - throw makeIOFailureException(true, "Unable to close schedule file.", ex); - } - } - - /** - * Convenience routine for creating UserExceptions - * @param wasWriting - * @param message - * @param e - * @return - */ - private final GATKException makeIOFailureException(final boolean wasWriting, final String message, final Exception e) { - if ( wasWriting ) { - if ( e == null ) - return new UserException.CouldNotCreateOutputFile(scheduleFile, message); - else - return new UserException.CouldNotCreateOutputFile(scheduleFile, message, e); - } else { - if ( e == null ) - return new UserException.CouldNotReadInputFile(scheduleFile, message); - else - return new UserException.CouldNotReadInputFile(scheduleFile, message, e); - } - } - - /** - * Advance to the next schedule entry. - */ - private void advance() { - nextScheduleEntry = null; - - BitSet selectedIterators = new BitSet(readerIDs.size()); - int currentStart = Integer.MAX_VALUE; - int currentStop = Integer.MAX_VALUE; - - // Select every iterator whose next element is the lowest element in the list. - for(int reader = 0; reader < scheduleIterators.size(); reader++) { - PeekableIterator scheduleIterator = scheduleIterators.get(reader); - if(!scheduleIterator.hasNext()) - continue; - - // If the iterator starts after this one, skip over it. - if(scheduleIterator.peek().start > currentStart) - continue; - - // If the iterator starts at the same point as this one, add it to the list. - if(scheduleIterator.peek().start == currentStart) { - selectedIterators.set(reader); - currentStop = Math.min(scheduleIterator.peek().stop,currentStop); - continue; - } - - // If the iterator is less than anything seen before it, purge the selections and make this one current. 
- if(scheduleIterator.peek().start < currentStart) { - selectedIterators.clear(); - selectedIterators.set(reader); - currentStart = scheduleIterator.peek().start; - currentStop = scheduleIterator.peek().stop; - } - } - - // Out of iterators? Abort early. - if(selectedIterators.isEmpty()) - return; - - // Create the target schedule entry - BAMScheduleEntry mergedScheduleEntry = new BAMScheduleEntry(currentStart,currentStop); - - // For each schedule entry with data, load the data into the merged schedule. - for (int reader = selectedIterators.nextSetBit(0); reader >= 0; reader = selectedIterators.nextSetBit(reader+1)) { - PeekableIterator scheduleIterator = scheduleIterators.get(reader); - BAMScheduleEntry individualScheduleEntry = scheduleIterator.peek(); - mergedScheduleEntry.mergeInto(individualScheduleEntry); - - // If the schedule iterator ends after this entry, consume it. - if(individualScheduleEntry.stop <= currentStop) - scheduleIterator.next(); - } - - // For each schedule entry without data, add a blank entry. - for (int reader = selectedIterators.nextClearBit(0); reader < readerIDs.size(); reader = selectedIterators.nextClearBit(reader+1)) { - mergedScheduleEntry.addFileSpan(readerIDs.get(reader),new GATKBAMFileSpan()); - } - - nextScheduleEntry = mergedScheduleEntry; - } - - @Override - public void remove() { throw new UnsupportedOperationException("Unable to remove from a schedule iterator."); } - - /** - * Create a new schedule file, containing schedule information for all BAM files being dynamically merged. - */ - private void createScheduleFile() { - try { - scheduleFile = File.createTempFile("bamschedule."+referenceSequence,null); - scheduleFileChannel = new RandomAccessFile(scheduleFile,"rw").getChannel(); - } - catch(IOException ex) { - throw new UserException("Unable to create a temporary BAM schedule file. 
Please make sure Java can write to the default temp directory or use -Djava.io.tmpdir= to instruct it to use a different temp directory instead.",ex); - } - scheduleFile.deleteOnExit(); - - } - - /** - * Creates a new byte buffer of the given size. - * @param size the size of buffer to allocate. - * @return Newly allocated byte buffer. - */ - private ByteBuffer allocateByteBuffer(final int size) { - ByteBuffer buffer = ByteBuffer.allocate(size); - buffer.order(ByteOrder.LITTLE_ENDIAN); - return buffer; - } - - /** - * Reads the contents at the current position on disk into the given buffer. - * @param buffer buffer to fill. - */ - private int read(final ByteBuffer buffer) { - try { - return scheduleFileChannel.read(buffer); - } - catch(IOException ex) { - throw makeIOFailureException(false, "Unable to read data from BAM schedule file.", ex); - } - } - - private void write(final ByteBuffer buffer) { - try { - scheduleFileChannel.write(buffer); - if(buffer.remaining() > 0) - throw makeIOFailureException(true, "Unable to write entire buffer to file.", null); - } - catch(IOException ex) { - throw makeIOFailureException(true, "Unable to write data to BAM schedule file.", ex); - } - } - - /** - * Reads the current position from the file channel. - * @return Current position within file channel. - */ - private long position() { - try { - return scheduleFileChannel.position(); - } - catch(IOException ex) { - throw makeIOFailureException(false, "Unable to retrieve position of BAM schedule file.", ex); - } - } - - /** - * Reposition the file channel to the specified offset wrt the start of the file. - * @param position The position. - */ - private void position(final long position) { - try { - scheduleFileChannel.position(position); - } - catch(IOException ex) { - throw makeIOFailureException(false, "Unable to position BAM schedule file.",ex); - } - } - - /** - * An iterator over the schedule for a single BAM file. 
- */ - private class BAMScheduleIterator implements Iterator { - /** - * ID of the reader associated with the given schedule. - */ - private final SAMReaderID reader; - - /** - * Current position in the file. - */ - private long currentPosition; - - /** - * Stopping file position of last bin in file for this reader, exclusive. - */ - private final long stopPosition; - - /** - * Byte buffer used to store BAM header info. - */ - private final ByteBuffer binHeader; - - /** - * Byte buffer used to store chunk data. - */ - private final ByteBuffer chunkData; - - public BAMScheduleIterator(final SAMReaderID reader, final long startPosition, final long stopPosition, final int maxChunkCount) { - this.reader = reader; - this.currentPosition = startPosition; - this.stopPosition = stopPosition; - binHeader = allocateByteBuffer(INT_SIZE_IN_BYTES*3); - chunkData = allocateByteBuffer(maxChunkCount*LONG_SIZE_IN_BYTES*2); - } - - @Override - public boolean hasNext() { - return currentPosition < stopPosition; - } - - @Override - public BAMScheduleEntry next() { - position(currentPosition); - - // Read data. - int binHeaderBytesRead = read(binHeader); - - // Make sure we read in a complete bin header: - if ( binHeaderBytesRead < INT_SIZE_IN_BYTES * 3 ) { - throw new ReviewedGATKException(String.format("Unable to read a complete bin header from BAM schedule file %s for BAM file %s. " + - "The BAM schedule file is likely incomplete/corrupt.", - scheduleFile.getAbsolutePath(), reader.getSamFilePath())); - } - - // Decode contents. - binHeader.flip(); - final int start = binHeader.getInt(); - final int stop = binHeader.getInt(); - final int numChunks = binHeader.getInt(); - - // Prepare bin buffer for next read. - binHeader.flip(); - - // Prepare a target buffer for chunks. - GATKChunk[] chunks = new GATKChunk[numChunks]; - - // Read all chunk data. 
- chunkData.limit(numChunks*LONG_SIZE_IN_BYTES*2); - long bytesRead = read(chunkData); - if(bytesRead != numChunks*LONG_SIZE_IN_BYTES*2) - throw new ReviewedGATKException("Unable to read all chunks from file"); - - // Prepare for reading. - chunkData.flip(); - - for(int i = 0; i < numChunks; i++) - chunks[i] = new GATKChunk(chunkData.getLong(),chunkData.getLong()); - - // Prepare chunk buffer for next read. - chunkData.flip(); - - BAMScheduleEntry nextScheduleEntry = new BAMScheduleEntry(start,stop); - nextScheduleEntry.addFileSpan(reader,new GATKBAMFileSpan(chunks)); - - // Reset the position of the iterator at the next contig. - currentPosition = position(); - - return nextScheduleEntry; - } - - /** - * Not supported. - */ - @Override - public void remove() { - throw new UnsupportedOperationException("Unable to remove from a BAMScheduleIterator"); - } - - } -} - -/** - * A single proto-shard to be processed. - */ -class BAMScheduleEntry { - /** - * Starting position for the genomic entry. - */ - public final int start; - - /** - * Ending position for the genomic entry. - */ - public final int stop; - - /** - * The spans representing the given region. - */ - public final Map fileSpans = new HashMap(); - - BAMScheduleEntry(final int start, final int stop) { - this.start = start; - this.stop = stop; - } - - /** - * Add a new file span to this schedule. - * @param reader Reader associated with the span. - * @param fileSpan Blocks to read in the given reader. - */ - public void addFileSpan(final SAMReaderID reader, final GATKBAMFileSpan fileSpan) { - fileSpans.put(reader,fileSpan); - } - - /** - * A naive merge operation. Merge the fileSpans in other into this, blowing up if conflicts are - * detected. Completely ignores merging start and stop. - * @param other Other schedule entry to merging into this one. 
- */ - public void mergeInto(final BAMScheduleEntry other) { - final int thisSize = fileSpans.size(); - final int otherSize = other.fileSpans.size(); - fileSpans.putAll(other.fileSpans); - if(fileSpans.size() != thisSize+otherSize) - throw new ReviewedGATKException("Unable to handle overlaps when merging BAM schedule entries."); - } - - /** - * Returns true if the location of this bin tree is before the given position. - * @param locus Locus to test. - * @return True if this bin sits completely before the given locus; false otherwise. - */ - public boolean isBefore(final GenomeLoc locus) { - return stop < locus.getStart(); - } - - /** - * Checks overlap between this bin tree and other bin trees. - * @param position the position over which to detect overlap. - * @return True if the segment overlaps. False otherwise. - */ - public boolean overlaps(final GenomeLoc position) { - return !(position.getStop() < start || position.getStart() > stop); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java deleted file mode 100644 index 1ea8d39aa..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BAMScheduler.java +++ /dev/null @@ -1,320 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or 
substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.GATKBAMFileSpan; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.SAMSequenceRecord; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.sam.ReadUtils; - -import java.util.*; - -/** - * Assign intervals to the most appropriate blocks, keeping as little as possible in memory at once. - */ -public class BAMScheduler implements Iterator { - private final SAMDataSource dataSource; - - private final Map indexFiles = new HashMap(); - - private FilePointer nextFilePointer = null; - - private GenomeLocSortedSet loci; - private PeekableIterator locusIterator; - private GenomeLoc currentLocus; - private IntervalMergingRule intervalMergingRule; - - /* - * Creates BAMScheduler using contigs from the given BAM data source. 
- * - * @param dataSource BAM source - * @return non-null BAM scheduler - */ - public static BAMScheduler createOverMappedReads(final SAMDataSource dataSource) { - final BAMScheduler scheduler = new BAMScheduler(dataSource, IntervalMergingRule.ALL); - final GenomeLocSortedSet intervals = GenomeLocSortedSet.createSetFromSequenceDictionary(dataSource.getHeader().getSequenceDictionary()); - scheduler.populateFilteredIntervalList(intervals); - return scheduler; - } - - public static BAMScheduler createOverAllReads(final SAMDataSource dataSource, final GenomeLocParser parser) { - BAMScheduler scheduler = new BAMScheduler(dataSource, IntervalMergingRule.ALL); - scheduler.populateUnfilteredIntervalList(parser); - return scheduler; - } - - public static BAMScheduler createOverIntervals(final SAMDataSource dataSource, final IntervalMergingRule mergeRule, final GenomeLocSortedSet loci) { - BAMScheduler scheduler = new BAMScheduler(dataSource, mergeRule); - scheduler.populateFilteredIntervalList(loci); - return scheduler; - } - - - private BAMScheduler(final SAMDataSource dataSource, final IntervalMergingRule mergeRule) { - this.dataSource = dataSource; - this.intervalMergingRule = mergeRule; - for(SAMReaderID reader: dataSource.getReaderIDs()) { - GATKBAMIndex index = dataSource.getIndex(reader); - if(index != null) - indexFiles.put(reader,dataSource.getIndex(reader)); - } - } - - /** - * The consumer has asked for a bounded set of locations. Prepare an iterator over those locations. - * @param loci The list of locations to search and iterate over. - */ - private void populateFilteredIntervalList(final GenomeLocSortedSet loci) { - this.loci = loci; - if(!indexFiles.isEmpty()) { - // If index data is available, start up the iterator. - locusIterator = new PeekableIterator(loci.iterator()); - if(locusIterator.hasNext()) - currentLocus = locusIterator.next(); - advance(); - } - else { - // Otherwise, seed the iterator with a single file pointer over the entire region. 
- nextFilePointer = generatePointerOverEntireFileset(); - for(GenomeLoc locus: loci) - nextFilePointer.addLocation(locus); - locusIterator = new PeekableIterator(Collections.emptyList().iterator()); - } - } - - /** - * The consumer has provided null, meaning to iterate over all available data. Create a file pointer stretching - * from just before the start of the region to the end of the region. - */ - private void populateUnfilteredIntervalList(final GenomeLocParser parser) { - this.loci = new GenomeLocSortedSet(parser); - locusIterator = new PeekableIterator(Collections.emptyList().iterator()); - nextFilePointer = generatePointerOverEntireFileset(); - } - - /** - * Generate a span that runs from the end of the BAM header to the end of the fle. - * @return A file pointer over the specified region. - */ - private FilePointer generatePointerOverEntireFileset() { - FilePointer filePointer = new FilePointer(intervalMergingRule); - - // This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is - // the only FilePointer we will create. 
This allows us to have this FilePointer represent regions from - // multiple contigs - filePointer.setIsMonolithic(true); - - Map currentPosition; - - currentPosition = dataSource.getInitialReaderPositions(); - - for(SAMReaderID reader: dataSource.getReaderIDs()) - filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart())); - return filePointer; - } - - public boolean hasNext() { - return nextFilePointer != null; - } - - public FilePointer next() { - if(!hasNext()) - throw new NoSuchElementException("No next element available in interval sharder"); - FilePointer currentFilePointer = nextFilePointer; - nextFilePointer = null; - advance(); - - return currentFilePointer; - } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove FilePointers from an IntervalSharder"); - } - - private void advance() { - if(loci.isEmpty()) - return; - - while(nextFilePointer == null && currentLocus != null) { - // special case handling of the unmapped shard. - if(currentLocus == GenomeLoc.UNMAPPED) { - nextFilePointer = new FilePointer(intervalMergingRule, GenomeLoc.UNMAPPED); - for(SAMReaderID id: dataSource.getReaderIDs()) - nextFilePointer.addFileSpans(id,createSpanToEndOfFile(indexFiles.get(id).getStartOfLastLinearBin())); - currentLocus = null; - continue; - } - - nextFilePointer = new FilePointer(intervalMergingRule); - - int coveredRegionStart = 1; - int coveredRegionStop = Integer.MAX_VALUE; - GenomeLoc coveredRegion = null; - - BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(currentLocus); - - // No overlapping data at all. 
- if(scheduleEntry != null) { - coveredRegionStart = Math.max(coveredRegionStart,scheduleEntry.start); - coveredRegionStop = Math.min(coveredRegionStop,scheduleEntry.stop); - coveredRegion = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStart,coveredRegionStop); - - nextFilePointer.addFileSpans(scheduleEntry.fileSpans); - } - else { - // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. - for(SAMReaderID reader: indexFiles.keySet()) - nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); - } - - // Early exit if no bins were found. - if(coveredRegion == null) { - // for debugging only: maximum split is 16384. - nextFilePointer.addLocation(currentLocus); - currentLocus = locusIterator.hasNext() ? locusIterator.next() : null; - continue; - } - - // Early exit if only part of the first interval was found. - if(currentLocus.startsBefore(coveredRegion)) { - int splitPoint = Math.min(coveredRegion.getStart()-currentLocus.getStart(),16384)+currentLocus.getStart(); - GenomeLoc[] splitContigs = currentLocus.split(splitPoint); - nextFilePointer.addLocation(splitContigs[0]); - currentLocus = splitContigs[1]; - continue; - } - - // Define the initial range of the file pointer, aka the region where the locus currently being processed intersects the BAM list. - GenomeLoc initialLocation = currentLocus.intersect(coveredRegion); - nextFilePointer.addLocation(initialLocation); - - // See whether the BAM regions discovered overlap the next set of intervals in the interval list. If so, include every overlapping interval. - if(!nextFilePointer.locations.isEmpty()) { - while(locusIterator.hasNext() && locusIterator.peek().overlapsP(coveredRegion)) { - currentLocus = locusIterator.next(); - nextFilePointer.addLocation(currentLocus.intersect(coveredRegion)); - } - - // Chop off the uncovered portion of the locus. 
Since we know that the covered region overlaps the current locus, - // we can simplify the interval creation process to the end of the covered region to the stop of the given interval. - if(coveredRegionStop < currentLocus.getStop()) - currentLocus = loci.getGenomeLocParser().createGenomeLoc(currentLocus.getContig(),coveredRegionStop+1,currentLocus.getStop()); - else if(locusIterator.hasNext()) - currentLocus = locusIterator.next(); - else - currentLocus = null; - } - - } - } - - - /** - * The last reference sequence processed by this iterator. - */ - private Integer lastReferenceSequenceLoaded = null; - - /** - * The stateful iterator used to progress through the genoem. - */ - private PeekableIterator bamScheduleIterator = null; - - /** - * Clean up underlying BAMSchedule file handles. - */ - public void close() { - if(bamScheduleIterator != null) - bamScheduleIterator.close(); - } - - /** - * Get the next overlapping tree of bins associated with the given BAM file. - * @param currentLocus The actual locus for which to check overlap. - * @return The next schedule entry overlapping with the given list of loci. - */ - private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final GenomeLoc currentLocus) { - // Make sure that we consult the BAM header to ensure that we're using the correct contig index for this contig name. - // This will ensure that if the two sets of contigs don't quite match (b36 male vs female ref, hg19 Epstein-Barr), then - // we'll be using the correct contig index for the BAMs. - // TODO: Warning: assumes all BAMs use the same sequence dictionary! Get around this with contig aliasing. 
- SAMSequenceRecord currentContigSequenceRecord = dataSource.getHeader().getSequence(currentLocus.getContig()); - if ( currentContigSequenceRecord == null ) { - throw new UserException(String.format("Contig %s not present in sequence dictionary for merged BAM header: %s", - currentLocus.getContig(), - ReadUtils.prettyPrintSequenceRecords(dataSource.getHeader().getSequenceDictionary()))); - } - - final int currentContigIndex = currentContigSequenceRecord.getSequenceIndex(); - - // Stale reference sequence or first invocation. (Re)create the binTreeIterator. - if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentContigIndex) { - if(bamScheduleIterator != null) - bamScheduleIterator.close(); - lastReferenceSequenceLoaded = currentContigIndex; - - // Naive algorithm: find all elements in current contig for proper schedule creation. - List lociInContig = new LinkedList(); - for(GenomeLoc locus: loci) { - if (!GenomeLoc.isUnmapped(locus) && dataSource.getHeader().getSequence(locus.getContig()) == null) - throw new ReviewedGATKException("BAM file(s) do not have the contig: " + locus.getContig() + ". You are probably using a different reference than the one this file was aligned with"); - - if (!GenomeLoc.isUnmapped(locus) && dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded) - lociInContig.add(locus); - } - - bamScheduleIterator = new PeekableIterator(new BAMSchedule(dataSource,lociInContig)); - } - - if(!bamScheduleIterator.hasNext()) - return null; - - // Peek the iterator along until finding the first binTree at or following the current locus. - BAMScheduleEntry bamScheduleEntry = bamScheduleIterator.peek(); - while(bamScheduleEntry != null && bamScheduleEntry.isBefore(currentLocus)) { - bamScheduleIterator.next(); - bamScheduleEntry = bamScheduleIterator.hasNext() ? bamScheduleIterator.peek() : null; - } - - return (bamScheduleEntry != null && bamScheduleEntry.overlaps(currentLocus)) ? 
bamScheduleEntry : null; - } - - /** - * Create a span from the given start point to the end of the file. - * @param startOfRegion Start of the region, in encoded coordinates (block start << 16 & block offset). - * @return A file span from the given point to the end of the file. - */ - private GATKBAMFileSpan createSpanToEndOfFile(final long startOfRegion) { - return new GATKBAMFileSpan(new GATKChunk(startOfRegion,Long.MAX_VALUE)); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java deleted file mode 100644 index 11fecb661..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/BlockInputStream.java +++ /dev/null @@ -1,450 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.GATKBAMFileSpan; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.util.BlockCompressedInputStream; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.io.IOException; -import java.io.InputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - -/** - * Presents decompressed blocks to the SAMFileReader. - */ -public class BlockInputStream extends InputStream { - /** - * Mechanism for triggering block loads. - */ - private final BGZFBlockLoadingDispatcher dispatcher; - - /** - * The reader whose data is supplied by this input stream. - */ - private final SAMReaderID reader; - - /** - * Length of the input stream. - */ - private final long length; - - /** - * The latest error reported by an asynchronous block load. - */ - private Throwable error; - - /** - * Current accessPlan. - */ - private BAMAccessPlan accessPlan; - - /** - * A stream of compressed data blocks. - */ - private final ByteBuffer buffer; - - /** - * Offsets of the given blocks in the buffer. - */ - private LinkedList blockOffsets = new LinkedList(); - - /** - * Source positions of the given blocks in the buffer. - */ - private LinkedList blockPositions = new LinkedList(); - - /** - * Provides a lock to wait for more data to arrive. - */ - private final Object lock = new Object(); - - /** - * An input stream to use when comparing data back to what it should look like. 
- */ - private final BlockCompressedInputStream validatingInputStream; - - /** - * Create a new block presenting input stream with a dedicated buffer. - * @param dispatcher the block loading messenger. - * @param reader the reader for which to load data. - * @param validate validates the contents read into the buffer against the contents of a Picard BlockCompressedInputStream. - */ - BlockInputStream(final BGZFBlockLoadingDispatcher dispatcher, final SAMReaderID reader, final boolean validate) { - this.reader = reader; - this.length = reader.samFile.length(); - - buffer = ByteBuffer.wrap(new byte[64*1024]); - buffer.order(ByteOrder.LITTLE_ENDIAN); - - // The state of the buffer assumes that the range of data written into the buffer appears in the range - // [position,limit), while extra capacity exists in the range [limit,capacity) - buffer.limit(0); - - this.dispatcher = dispatcher; - // TODO: Kill the region when all we want to do is start at the beginning of the stream and run to the end of the stream. - this.accessPlan = new BAMAccessPlan(reader,this,new GATKBAMFileSpan(new GATKChunk(0,Long.MAX_VALUE))); - - // The block offsets / block positions guarantee that the ending offset/position in the data structure maps to - // the point in the file just following the last read. These two arrays should never be empty; initializing - // to 0 to match the position above. - this.blockOffsets.add(0); - this.blockPositions.add(0L); - - try { - if(validate) { - System.out.printf("BlockInputStream %s: BGZF block validation mode activated%n",this); - validatingInputStream = new BlockCompressedInputStream(reader.samFile); - // A bug in ValidatingInputStream means that calling getFilePointer() immediately after initialization will result in an NPE. - // Poke the stream to start reading data. 
- validatingInputStream.available(); - } - else - validatingInputStream = null; - } - catch(IOException ex) { - throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); - } - } - - public long length() { - return length; - } - - public long getFilePointer() { - long filePointer; - synchronized(lock) { - // Find the current block within the input stream. - int blockIndex; - for(blockIndex = 0; blockIndex+1 < blockOffsets.size() && buffer.position() > blockOffsets.get(blockIndex+1); blockIndex++) - ; - filePointer = blockPositions.get(blockIndex) + (buffer.position()-blockOffsets.get(blockIndex)); - } - -// if(validatingInputStream != null && filePointer != validatingInputStream.getFilePointer()) -// throw new ReviewedGATKException(String.format("Position of input stream is invalid; expected (block address, block offset) = (%d,%d), got (%d,%d)", -// BlockCompressedFilePointerUtil.getBlockAddress(validatingInputStream.getFilePointer()),BlockCompressedFilePointerUtil.getBlockOffset(validatingInputStream.getFilePointer()), -// BlockCompressedFilePointerUtil.getBlockAddress(filePointer),BlockCompressedFilePointerUtil.getBlockOffset(filePointer))); - - return filePointer; - } - - private void clearBuffers() { - this.accessPlan.reset(); - - // Buffer semantics say that outside of a lock, buffer should always be prepared for reading. - // Indicate no data to be read. - buffer.clear(); - buffer.limit(0); - - // Clear everything except the last block offset / position - blockOffsets.clear(); - blockOffsets.add(0); - while(blockPositions.size() > 1) - blockPositions.removeFirst(); - } - - public boolean eof() { - synchronized(lock) { - // TODO: Handle multiple empty BGZF blocks at end of the file. - return accessPlan != null && (accessPlan.getBlockAddress() < 0 || accessPlan.getBlockAddress() >= length); - } - } - - /** - * Submits a new access plan for the given dataset and seeks to the given point. 
- * @param accessPlan The next seek point for BAM data in this reader. - */ - public void submitAccessPlan(final BAMAccessPlan accessPlan) { - //System.out.printf("Thread %s: submitting access plan for block at position: %d%n",Thread.currentThread().getId(),position.getBlockAddress()); - this.accessPlan = accessPlan; - accessPlan.reset(); - - clearBuffers(); - - // Pull the iterator past any oddball chunks at the beginning of the shard (chunkEnd < chunkStart, empty chunks, etc). - // TODO: Don't pass these empty chunks in. - accessPlan.advancePosition(makeFilePointer(accessPlan.getBlockAddress(),0)); - - if(accessPlan.getBlockAddress() >= 0) { - waitForBufferFill(); - } - - if(validatingInputStream != null) { - try { - validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(),0)); - } - catch(IOException ex) { - throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); - } - } - - } - - - private void compactBuffer() { - // Compact buffer to maximize storage space. - int bytesToRemove = 0; - - // Look ahead to see if we can compact away the first blocks in the series. - while(blockOffsets.size() > 1 && buffer.position() >= blockOffsets.get(1)) { - blockOffsets.remove(); - blockPositions.remove(); - bytesToRemove = blockOffsets.peek(); - } - - // If we end up with an empty block at the end of the series, compact this as well. - if(buffer.remaining() == 0 && blockOffsets.size() > 1 && buffer.position() >= blockOffsets.peek()) { - bytesToRemove += buffer.position(); - blockOffsets.remove(); - blockPositions.remove(); - } - - int finalBufferStart = buffer.position() - bytesToRemove; - int finalBufferSize = buffer.remaining(); - - // Position the buffer to remove the unneeded data, and compact it away. - buffer.position(bytesToRemove); - buffer.compact(); - - // Reset the limits for reading. 
- buffer.position(finalBufferStart); - buffer.limit(finalBufferStart+finalBufferSize); - - // Shift everything in the offset buffer down to accommodate the bytes removed from the buffer. - for(int i = 0; i < blockOffsets.size(); i++) - blockOffsets.set(i,blockOffsets.get(i)-bytesToRemove); - } - - /** - * Push contents of incomingBuffer into the end of this buffer. - * MUST be called from a thread that is NOT the reader thread. - * @param incomingBuffer The data being pushed into this input stream. - * @param accessPlan target access plan for the data. - * @param filePosition the current position of the file pointer - */ - public void copyIntoBuffer(final ByteBuffer incomingBuffer, final BAMAccessPlan accessPlan, final long filePosition) { - synchronized(lock) { - try { - if(validatingInputStream != null) { - byte[] validBytes = new byte[incomingBuffer.remaining()]; - - byte[] currentBytes = new byte[incomingBuffer.remaining()]; - int pos = incomingBuffer.position(); - int lim = incomingBuffer.limit(); - incomingBuffer.get(currentBytes); - - incomingBuffer.limit(lim); - incomingBuffer.position(pos); - - long currentFilePointer = validatingInputStream.getFilePointer(); - validatingInputStream.seek(makeFilePointer(accessPlan.getBlockAddress(), 0)); - validatingInputStream.read(validBytes); - validatingInputStream.seek(currentFilePointer); - - if(!Arrays.equals(validBytes,currentBytes)) - throw new ReviewedGATKException(String.format("Bytes being inserted into BlockInputStream %s are incorrect",this)); - } - - compactBuffer(); - // Open up the buffer for more reading. - buffer.limit(buffer.capacity()); - - // Get the spans overlapping this particular block... 
- List spansOverlapping = accessPlan.getSpansOverlappingBlock(accessPlan.getBlockAddress(),filePosition); - - // ...and advance the block - this.accessPlan = accessPlan; - accessPlan.advancePosition(makeFilePointer(filePosition, 0)); - - if(buffer.remaining() < incomingBuffer.remaining()) - lock.wait(); - - final int bytesInIncomingBuffer = incomingBuffer.limit(); - - for(GATKChunk spanOverlapping: spansOverlapping) { - // Clear out the endcap tracking state and add in the starting position for this transfer. - blockOffsets.removeLast(); - blockOffsets.add(buffer.position()); - blockPositions.removeLast(); - blockPositions.add(spanOverlapping.getChunkStart()); - - // Stream the buffer into the data stream. - incomingBuffer.limit((spanOverlapping.getBlockEnd() > spanOverlapping.getBlockStart()) ? bytesInIncomingBuffer : spanOverlapping.getBlockOffsetEnd()); - incomingBuffer.position(spanOverlapping.getBlockOffsetStart()); - buffer.put(incomingBuffer); - - // Add the endcap for this transfer. - blockOffsets.add(buffer.position()); - blockPositions.add(spanOverlapping.getChunkEnd()); - } - - // Set up the buffer for reading. - buffer.flip(); - - lock.notify(); - } - catch(Exception ex) { - reportException(ex); - lock.notify(); - } - } - } - - void reportException(Throwable t) { - synchronized(lock) { - this.error = t; - lock.notify(); - } - } - - private void checkForErrors() { - synchronized(lock) { - if(error != null) { - ReviewedGATKException toThrow = new ReviewedGATKException(String.format("Thread %s, BlockInputStream %s: Unable to retrieve BAM data from disk",Thread.currentThread().getId(),this),error); - toThrow.setStackTrace(error.getStackTrace()); - throw toThrow; - } - } - } - - /** - * Reads the next byte of data from the input stream. - * @return Next byte of data, from 0->255, as an int. 
- */ - @Override - public int read() { - byte[] singleByte = new byte[1]; - read(singleByte); - return singleByte[0]; - } - - /** - * Fills the given byte array to the extent possible. - * @param bytes byte array to be filled. - * @return The number of bytes actually read. - */ - @Override - public int read(byte[] bytes) { - return read(bytes,0,bytes.length); - } - - @Override - public int read(byte[] bytes, final int offset, final int length) { - int remaining = length; - synchronized(lock) { - while(remaining > 0) { - // Check for error conditions during last read. - checkForErrors(); - - // If completely out of space, queue up another buffer fill. - waitForBufferFill(); - - // Couldn't manage to load any data at all; abort and return what's available. - if(buffer.remaining() == 0) - break; - - int numBytesToCopy = Math.min(buffer.remaining(),remaining); - buffer.get(bytes,length-remaining+offset,numBytesToCopy); - remaining -= numBytesToCopy; - - //if(remaining > 0) - // System.out.printf("Thread %s: read the first %d bytes of a %d byte request%n",Thread.currentThread().getId(),length-remaining,length); - // TODO: Assert that we don't copy across a block boundary - } - - // Notify any waiting threads that some of the contents of the buffer were removed. - if(length-remaining > 0) - lock.notify(); - } - -// if(validatingInputStream != null) { -// byte[] validBytes = new byte[length]; -// try { -// validatingInputStream.read(validBytes,offset,length); -// for(int i = offset; i < offset+length; i++) { -// if(bytes[i] != validBytes[i]) -// throw new ReviewedGATKException(String.format("Thread %s: blockInputStream %s attempting to return wrong set of bytes; mismatch at offset %d",Thread.currentThread().getId(),this,i)); -// } -// } -// catch(IOException ex) { -// throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); -// } -// } - - // If any data was copied into the buffer, return the amount of data copied. 
- if(remaining < length) - return length - remaining; - - // Otherwise, return -1. - return -1; - } - - public void close() { - if(validatingInputStream != null) { - try { - validatingInputStream.close(); - } - catch(IOException ex) { - throw new ReviewedGATKException("Unable to validate against Picard input stream",ex); - } - } - } - - public String getSource() { - return reader.getSamFilePath(); - } - - private void waitForBufferFill() { - synchronized(lock) { - if(buffer.remaining() == 0 && !eof()) { - //System.out.printf("Thread %s is waiting for a buffer fill from position %d to buffer %s%n",Thread.currentThread().getId(),position.getBlockAddress(),this); - dispatcher.queueBlockLoad(accessPlan); - try { - lock.wait(); - } - catch(InterruptedException ex) { - throw new ReviewedGATKException("Interrupt occurred waiting for buffer to fill",ex); - } - } - } - } - - /** - * Create an encoded BAM file pointer given the address of a BGZF block and an offset. - * @param blockAddress Physical address on disk of a BGZF block. - * @param blockOffset Offset into the uncompressed data stored in the BGZF block. - * @return 64-bit pointer encoded according to the BAM spec. 
- */ - public static long makeFilePointer(final long blockAddress, final int blockOffset) { - return blockAddress << 16 | blockOffset; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java deleted file mode 100644 index 8d5ab3b03..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FileHandleCache.java +++ /dev/null @@ -1,232 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.GATKException; - -import java.io.FileInputStream; -import java.io.IOException; -import java.util.Collection; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Queue; - -/** - * Caches frequently used file handles. Right now, caches only a single file handle. - * TODO: Generalize to support arbitrary file handle caches. - */ -public class FileHandleCache { - /** - * The underlying data structure storing file handles. - */ - private final FileHandleStorage fileHandleStorage; - - /** - * How many file handles should be kept open at once. - */ - private final int cacheSize; - - /** - * A uniquifier: assign a unique ID to every instance of a file handle. - */ - private final Map keyCounter = new HashMap(); - - /** - * A shared lock, private so that outside users cannot notify it. - */ - private final Object lock = new Object(); - - /** - * Indicates how many file handles are outstanding at this point. - */ - private int numOutstandingFileHandles = 0; - - /** - * Create a new file handle cache of the given cache size. - * @param cacheSize how many readers to hold open at once. - */ - public FileHandleCache(final int cacheSize) { - this.cacheSize = cacheSize; - fileHandleStorage = new FileHandleStorage(); - } - - /** - * Retrieves or opens a file handle for the given reader ID. - * @param key The ke - * @return A file input stream from the cache, if available, or otherwise newly opened. - */ - public FileInputStream claimFileInputStream(final SAMReaderID key) { - synchronized(lock) { - FileInputStream inputStream = findExistingEntry(key); - if(inputStream == null) { - try { - // If the cache is maxed out, wait for another file handle to emerge. 
- if(numOutstandingFileHandles >= cacheSize) - lock.wait(); - } - catch(InterruptedException ex) { - throw new ReviewedGATKException("Interrupted while waiting for a file handle"); - } - inputStream = openInputStream(key); - } - numOutstandingFileHandles++; - - //System.out.printf("Handing input stream %s to thread %s%n",inputStream,Thread.currentThread().getId()); - return inputStream; - } - } - - /** - * Releases the current reader and returns it to the cache. - * @param key The reader. - * @param inputStream The stream being used. - */ - public void releaseFileInputStream(final SAMReaderID key, final FileInputStream inputStream) { - synchronized(lock) { - numOutstandingFileHandles--; - UniqueKey newID = allocateKey(key); - fileHandleStorage.put(newID,inputStream); - // Let any listeners know that another file handle has become available. - lock.notify(); - } - } - - /** - * Finds an existing entry in the storage mechanism. - * @param key Reader. - * @return a cached stream, if available. Otherwise, - */ - private FileInputStream findExistingEntry(final SAMReaderID key) { - int existingHandles = getMostRecentUniquifier(key); - - // See if any of the keys currently exist in the repository. - for(int i = 0; i <= existingHandles; i++) { - UniqueKey uniqueKey = new UniqueKey(key,i); - if(fileHandleStorage.containsKey(uniqueKey)) - return fileHandleStorage.remove(uniqueKey); - } - - return null; - } - - /** - * Gets the most recent uniquifier used for the given reader. - * @param reader Reader for which to determine uniqueness. 
- * @return - */ - private int getMostRecentUniquifier(final SAMReaderID reader) { - if(keyCounter.containsKey(reader)) - return keyCounter.get(reader); - else return -1; - } - - private UniqueKey allocateKey(final SAMReaderID reader) { - int uniquifier = getMostRecentUniquifier(reader)+1; - keyCounter.put(reader,uniquifier); - return new UniqueKey(reader,uniquifier); - } - - private FileInputStream openInputStream(final SAMReaderID reader) { - try { - return new FileInputStream(reader.getSamFilePath()); - } - catch(IOException ex) { - throw new GATKException("Unable to open input file"); - } - } - - private void closeInputStream(final FileInputStream inputStream) { - try { - inputStream.close(); - } - catch(IOException ex) { - throw new GATKException("Unable to open input file"); - } - } - - /** - * Actually contains the file handles, purging them as they get too old. - */ - private class FileHandleStorage extends LinkedHashMap { - /** - * Remove the oldest entry - * @param entry Entry to consider removing. - * @return True if the cache size has been exceeded. False otherwise. - */ - @Override - protected boolean removeEldestEntry(Map.Entry entry) { - synchronized (lock) { - if(size() > cacheSize) { - keyCounter.put(entry.getKey().key,keyCounter.get(entry.getKey().key)-1); - closeInputStream(entry.getValue()); - - return true; - } - } - return false; - } - } - - /** - * Uniquifies a key by adding a numerical uniquifier. - */ - private class UniqueKey { - /** - * The file handle's key. - */ - private final SAMReaderID key; - - /** - * A uniquifier, so that multiple of the same reader can exist in the cache. 
- */ - private final int uniqueID; - - public UniqueKey(final SAMReaderID reader, final int uniqueID) { - this.key = reader; - this.uniqueID = uniqueID; - } - - @Override - public boolean equals(Object other) { - if(!(other instanceof UniqueKey)) - return false; - UniqueKey otherUniqueKey = (UniqueKey)other; - return key.equals(otherUniqueKey.key) && this.uniqueID == otherUniqueKey.uniqueID; - } - - @Override - public int hashCode() { - return key.hashCode(); - } - } - - - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java deleted file mode 100644 index 99d9def5a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointer.java +++ /dev/null @@ -1,436 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.GATKBAMFileSpan; -import htsjdk.samtools.GATKChunk; -import htsjdk.samtools.SAMFileSpan; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.interval.IntervalUtils; - -import java.util.*; - -/** - * Represents a small section of a BAM file, and every associated interval. - */ -public class FilePointer { - protected final SortedMap fileSpans = new TreeMap(); - protected final List locations = new ArrayList(); - protected final IntervalMergingRule intervalMergingRule; - - /** - * Does this file pointer point into an unmapped region? - */ - protected final boolean isRegionUnmapped; - - /** - * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will - * ever visit during this GATK run? If this is set to true, the engine will expect to see only this - * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals - * from more than one contig. - */ - private boolean isMonolithic = false; - - /** - * Index of the contig covered by this FilePointer. 
Only meaningful for non-monolithic, mapped FilePointers - */ - private Integer contigIndex = null; - - - public FilePointer( final IntervalMergingRule mergeRule, final List locations ) { - this.intervalMergingRule = mergeRule; - this.locations.addAll(locations); - this.isRegionUnmapped = checkUnmappedStatus(); - - validateAllLocations(); - if ( locations.size() > 0 ) { - contigIndex = locations.get(0).getContigIndex(); - } - } - - public FilePointer( final IntervalMergingRule mergeRule, final GenomeLoc... locations ) { - this(mergeRule, Arrays.asList(locations)); - } - - public FilePointer( final Map fileSpans, final IntervalMergingRule mergeRule, final List locations ) { - this(mergeRule, locations); - this.fileSpans.putAll(fileSpans); - } - - private boolean checkUnmappedStatus() { - boolean foundMapped = false, foundUnmapped = false; - - for( GenomeLoc location: locations ) { - if ( GenomeLoc.isUnmapped(location) ) - foundUnmapped = true; - else - foundMapped = true; - } - if ( foundMapped && foundUnmapped ) - throw new ReviewedGATKException("BUG: File pointers cannot be mixed mapped/unmapped."); - - return foundUnmapped; - } - - private void validateAllLocations() { - // Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction - if ( isRegionUnmapped || isMonolithic ) { - return; - } - - Integer previousContigIndex = null; - - for ( GenomeLoc location : locations ) { - if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) { - throw new ReviewedGATKException("Non-monolithic file pointers must contain intervals from at most one contig"); - } - - previousContigIndex = location.getContigIndex(); - } - } - - private void validateLocation( GenomeLoc location ) { - if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) { - throw new ReviewedGATKException("BUG: File pointers cannot be mixed mapped/unmapped."); - } - if ( ! isRegionUnmapped && ! 
isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) { - throw new ReviewedGATKException("Non-monolithic file pointers must contain intervals from at most one contig"); - } - } - - /** - * Returns an immutable view of this FilePointer's file spans - * - * @return an immutable view of this FilePointer's file spans - */ - public Map getFileSpans() { - return Collections.unmodifiableMap(fileSpans); - } - - /** - * Returns an immutable variant of the list of locations. - * @return - */ - public List getLocations() { - return Collections.unmodifiableList(locations); - } - - /** - * Returns the index of the contig into which this FilePointer points (a FilePointer can represent - * regions in at most one contig). - * - * @return the index of the contig into which this FilePointer points - */ - public int getContigIndex() { - return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; - } - - /** - * Returns the IntervalMergingRule used by this FilePointer to merge adjacent locations - * - * @return the IntervalMergingRule used by this FilePointer (never null) - */ - public IntervalMergingRule getIntervalMergingRule() { - return intervalMergingRule; - } - - /** - * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will - * ever visit during this GATK run? If this is set to true, the engine will expect to see only this - * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals - * from more than one contig. - * - * @return true if this FP is a monolithic FP representing all regions in all files, otherwise false - */ - public boolean isMonolithic() { - return isMonolithic; - } - - /** - * Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all - * regions in all files that we will ever visit, and is the only FP we will ever create. 
A monolithic - * FP may contain intervals from more than one contig. - * - * @param isMonolithic set this FP's monolithic status to this value - */ - public void setIsMonolithic( boolean isMonolithic ) { - this.isMonolithic = isMonolithic; - } - - @Override - public boolean equals(final Object other) { - if(!(other instanceof FilePointer)) - return false; - FilePointer otherFilePointer = (FilePointer)other; - - // intervals - if(this.locations.size() != otherFilePointer.locations.size()) - return false; - for(int i = 0; i < locations.size(); i++) { - if(!this.locations.get(i).equals(otherFilePointer.locations.get(i))) - return false; - } - - // fileSpans - if(this.fileSpans.size() != otherFilePointer.fileSpans.size()) - return false; - Iterator> thisEntries = this.fileSpans.entrySet().iterator(); - Iterator> otherEntries = otherFilePointer.fileSpans.entrySet().iterator(); - while(thisEntries.hasNext() || otherEntries.hasNext()) { - if(!thisEntries.next().equals(otherEntries.next())) - return false; - } - - return true; - } - - public void addLocation(final GenomeLoc location) { - validateLocation(location); - - this.locations.add(location); - if ( contigIndex == null ) { - contigIndex = location.getContigIndex(); - } - } - - public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) { - this.fileSpans.put(id,fileSpan); - } - - public void addFileSpans(final Map fileSpans) { - this.fileSpans.putAll(fileSpans); - } - - - /** - * Computes the size of this file span, in uncompressed bytes. - * @return Size of the file span. - */ - public long size() { - long size = 0L; - for(SAMFileSpan fileSpan: fileSpans.values()) - size += ((GATKBAMFileSpan)fileSpan).size(); - return size; - } - - /** - * Returns the difference in size between two filespans. - * @param other Other filespan against which to measure. - * @return The difference in size between the two file pointers. 
- */ - public long minus(final FilePointer other) { - long difference = 0; - PeekableIterator> thisIterator = new PeekableIterator>(this.fileSpans.entrySet().iterator()); - PeekableIterator> otherIterator = new PeekableIterator>(other.fileSpans.entrySet().iterator()); - - while(thisIterator.hasNext()) { - // If there are no elements left in the 'other' iterator, spin out this iterator. - if(!otherIterator.hasNext()) { - GATKBAMFileSpan nextSpan = (GATKBAMFileSpan)thisIterator.next().getValue(); - difference += nextSpan.size(); - continue; - } - - // Otherwise, compare the latest value. - int compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey()); - - if(compareValue < 0) { - // This before other. - difference += ((GATKBAMFileSpan)thisIterator.next().getValue()).size(); - } - else if(compareValue > 0) { - // Other before this. - difference += ((GATKBAMFileSpan)otherIterator.next().getValue()).size(); - } - else { - // equality; difference the values. - GATKBAMFileSpan thisRegion = (GATKBAMFileSpan)thisIterator.next().getValue(); - GATKBAMFileSpan otherRegion = (GATKBAMFileSpan)otherIterator.next().getValue(); - difference += Math.abs(thisRegion.minus(otherRegion).size()); - } - } - return difference; - } - - /** - * Combines two file pointers into one. - * @param parser The genomelocparser to use when manipulating intervals. - * @param other File pointer to combine into this one. - * @return A completely new file pointer that is the combination of the two. 
- */ - public FilePointer combine(final GenomeLocParser parser, final FilePointer other) { - FilePointer combined = new FilePointer(intervalMergingRule); - - List intervals = new ArrayList(); - intervals.addAll(locations); - intervals.addAll(other.locations); - for(GenomeLoc interval: IntervalUtils.sortAndMergeIntervals(parser,intervals,intervalMergingRule)) - combined.addLocation(interval); - - PeekableIterator> thisIterator = new PeekableIterator>(this.fileSpans.entrySet().iterator()); - PeekableIterator> otherIterator = new PeekableIterator>(other.fileSpans.entrySet().iterator()); - - while(thisIterator.hasNext() || otherIterator.hasNext()) { - int compareValue; - if(!otherIterator.hasNext()) { - compareValue = -1; - } - else if(!thisIterator.hasNext()) - compareValue = 1; - else - compareValue = thisIterator.peek().getKey().compareTo(otherIterator.peek().getKey()); - - // This before other. - if(compareValue < 0) - mergeElementsInto(combined,thisIterator); - // Other before this. - else if(compareValue > 0) - mergeElementsInto(combined,otherIterator); - // equality; union the values. - else - mergeElementsInto(combined,thisIterator,otherIterator); - } - return combined; - } - - /** - * Roll the next element in the iterator into the combined entry. - * @param combined Entry into which to roll the next element. - * @param iterators Sources of next elements. - */ - private void mergeElementsInto(final FilePointer combined, Iterator>... iterators) { - if(iterators.length == 0) - throw new ReviewedGATKException("Tried to add zero elements to an existing file pointer."); - Map.Entry initialElement = iterators[0].next(); - GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)initialElement.getValue(); - for(int i = 1; i < iterators.length; i++) - fileSpan = fileSpan.union((GATKBAMFileSpan)iterators[i].next().getValue()); - combined.addFileSpans(initialElement.getKey(),fileSpan); - } - - /** - * Efficiently generate the union of the n FilePointers passed in. 
Much more efficient than - * combining two FilePointers at a time using the combine() method above. - * - * IMPORTANT: the FilePointers to be unioned must either all represent regions on the - * same contig, or all be unmapped, since we cannot create FilePointers with a mix of - * contigs or with mixed mapped/unmapped regions. - * - * @param filePointers the FilePointers to union - * @param parser our GenomeLocParser - * @return the union of the FilePointers passed in - */ - public static FilePointer union( List filePointers, GenomeLocParser parser ) { - if ( filePointers == null || filePointers.isEmpty() ) { - return new FilePointer(IntervalMergingRule.ALL); - } - - Map> fileChunks = new HashMap>(); - List locations = new ArrayList(); - IntervalMergingRule mergeRule = filePointers.get(0).getIntervalMergingRule(); - - // First extract all intervals and file chunks from the FilePointers into unsorted, unmerged collections - for ( FilePointer filePointer : filePointers ) { - locations.addAll(filePointer.getLocations()); - if (mergeRule != filePointer.getIntervalMergingRule()) - throw new ReviewedGATKException("All FilePointers in FilePointer.union() must have use the same IntervalMergeRule"); - - for ( Map.Entry fileSpanEntry : filePointer.getFileSpans().entrySet() ) { - GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)fileSpanEntry.getValue(); - - if ( fileChunks.containsKey(fileSpanEntry.getKey()) ) { - fileChunks.get(fileSpanEntry.getKey()).addAll(fileSpan.getGATKChunks()); - } - else { - fileChunks.put(fileSpanEntry.getKey(), fileSpan.getGATKChunks()); - } - } - } - - // Now sort and merge the intervals - List sortedMergedLocations = new ArrayList(); - sortedMergedLocations.addAll(IntervalUtils.sortAndMergeIntervals(parser, locations, mergeRule)); - - // For each BAM file, convert from an unsorted, unmerged list of chunks to a GATKBAMFileSpan containing - // the sorted, merged union of the chunks for that file - Map mergedFileSpans = new HashMap(fileChunks.size()); - 
for ( Map.Entry> fileChunksEntry : fileChunks.entrySet() ) { - List unmergedChunks = fileChunksEntry.getValue(); - mergedFileSpans.put(fileChunksEntry.getKey(), - (new GATKBAMFileSpan(unmergedChunks.toArray(new GATKChunk[unmergedChunks.size()]))).union(new GATKBAMFileSpan())); - } - - return new FilePointer(mergedFileSpans, mergeRule, sortedMergedLocations); - } - - /** - * Returns true if any of the file spans in this FilePointer overlap their counterparts in - * the other FilePointer. "Overlap" is defined as having an overlapping extent (the region - * from the start of the first chunk to the end of the last chunk). - * - * @param other the FilePointer against which to check overlap with this FilePointer - * @return true if any file spans overlap their counterparts in other, otherwise false - */ - public boolean hasFileSpansOverlappingWith( FilePointer other ) { - for ( Map.Entry thisFilePointerEntry : fileSpans.entrySet() ) { - GATKBAMFileSpan thisFileSpan = new GATKBAMFileSpan(thisFilePointerEntry.getValue()); - - SAMFileSpan otherEntry = other.fileSpans.get(thisFilePointerEntry.getKey()); - if ( otherEntry == null ) { - continue; // no counterpart for this file span in other - } - GATKBAMFileSpan otherFileSpan = new GATKBAMFileSpan(otherEntry); - - if ( thisFileSpan.getExtent().overlaps(otherFileSpan.getExtent()) ) { - return true; - } - } - - return false; - } - - @Override - public String toString() { - StringBuilder builder = new StringBuilder(); - builder.append("FilePointer:\n"); - builder.append("\tlocations = {"); - builder.append(Utils.join(";",locations)); - builder.append("}\n\tregions = \n"); - for(Map.Entry entry: fileSpans.entrySet()) { - builder.append(entry.getKey()); - builder.append("= {"); - builder.append(entry.getValue()); - builder.append("}"); - } - return builder.toString(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java deleted file mode 100644 index 28d4faf2c..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/LocusShard.java +++ /dev/null @@ -1,60 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.SAMFileSpan; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.Utils; - -import java.util.List; -import java.util.Map; - -/** - * Handles locus shards of BAM information. - * @author aaron - * @version 1.0 - * @date Apr 7, 2009 - */ -public class LocusShard extends Shard { - /** - * Create a new locus shard, divided by index. - * @param intervals List of intervals to process. 
- * @param fileSpans File spans associated with that interval. - */ - public LocusShard(GenomeLocParser parser, SAMDataSource dataSource, List intervals, Map fileSpans) { - super(parser, ShardType.LOCUS, intervals, dataSource, fileSpans, false); - } - - /** - * String representation of this shard. - * @return A string representation of the boundaries of this shard. - */ - @Override - public String toString() { - return Utils.join(";",getGenomeLocs()); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java deleted file mode 100644 index d4321da3b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShard.java +++ /dev/null @@ -1,271 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.*; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIteratorAdapter; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.*; - -/** - * - * User: aaron - * Date: Apr 10, 2009 - * Time: 5:03:13 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - -/** - * Expresses a shard of read data in block format. - * - * @author mhanna - * @version 0.1 - */ -public class ReadShard extends Shard { - - /** - * Default read shard buffer size - */ - public static final int DEFAULT_MAX_READS = 10000; - - /** - * What is the maximum number of reads per BAM file which should go into a read shard. 
- * - * TODO: this non-final static variable should either be made final or turned into an - * TODO: instance variable somewhere -- as both static and mutable it wreaks havoc - * TODO: with tests that use multiple instances of SAMDataSource (since SAMDataSource - * TODO: changes this value) - */ - public static int MAX_READS = DEFAULT_MAX_READS; - - /** - * The reads making up this shard. - */ - private final Collection reads = new ArrayList(MAX_READS); - - public ReadShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { - super(parser, ShardType.READ, loci, readsDataSource, fileSpans, isUnmapped); - } - - /** - * Sets the maximum number of reads buffered in a read shard. Implemented as a weirdly static interface - * until we know what effect tuning this parameter has. - * - * TODO: this mutable static interface is awful and breaks tests -- need to refactor - * - * @param bufferSize New maximum number - */ - static void setReadBufferSize(final int bufferSize) { - MAX_READS = bufferSize; - } - - /** - * What read buffer size are we using? - * - * @return - */ - public static int getReadBufferSize() { - return MAX_READS; - } - - /** - * Returns true if this shard is meant to buffer reads, rather - * than just holding pointers to their locations. - * @return True if this shard can buffer reads. False otherwise. - */ - public boolean buffersReads() { - return true; - } - - /** - * Returns true if the read buffer is currently full. - * @return True if this shard's buffer is full (and the shard can buffer reads). - */ - public boolean isBufferEmpty() { - return reads.size() == 0; - } - - /** - * Returns true if the read buffer is currently full. - * @return True if this shard's buffer is full (and the shard can buffer reads). - */ - public boolean isBufferFull() { - return reads.size() > ReadShard.MAX_READS; - } - - /** - * Adds a read to the read buffer. - * @param read Add a read to the internal shard buffer. 
- */ - public void addRead(SAMRecord read) { - // DO NOT validate that the buffer is full. Paired read sharding will occasionally have to stuff another - // read or two into the buffer. - reads.add(read); - } - - /** - * Fills this shard's buffer with reads from the iterator passed in - * - * @param readIter Iterator from which to draw the reads to fill the shard - */ - @Override - public void fill( PeekableIterator readIter ) { - if( ! buffersReads() ) - throw new ReviewedGATKException("Attempting to fill a non-buffering shard."); - - SAMFileHeader.SortOrder sortOrder = getReadProperties().getSortOrder(); - SAMRecord read = null; - - while( ! isBufferFull() && readIter.hasNext() ) { - final SAMRecord nextRead = readIter.peek(); - if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) { - // only add reads to the shard if they are on the same contig - read = readIter.next(); - addRead(read); - } else { - break; - } - } - - // If the reads are sorted in coordinate order, ensure that all reads - // having the same alignment start become part of the same shard, to allow - // downsampling to work better across shard boundaries. Note that because our - // read stream has already been fed through the positional downsampler, which - // ensures that at each alignment start position there are no more than dcov - // reads, we're in no danger of accidentally creating a disproportionately huge - // shard - if ( sortOrder == SAMFileHeader.SortOrder.coordinate ) { - while ( readIter.hasNext() ) { - SAMRecord additionalRead = readIter.peek(); - - // Stop filling the shard as soon as we encounter a read having a different - // alignment start or contig from the last read added in the earlier loop - // above, or an unmapped read - if ( read == null || - additionalRead.getReadUnmappedFlag() || - ! 
additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) || - additionalRead.getAlignmentStart() != read.getAlignmentStart() ) { - break; - } - - addRead(readIter.next()); - } - } - - // If the reads are sorted in queryname order, ensure that all reads - // having the same queryname become part of the same shard. - if( sortOrder == SAMFileHeader.SortOrder.queryname ) { - while( readIter.hasNext() ) { - SAMRecord nextRead = readIter.peek(); - if( read == null || ! read.getReadName().equals(nextRead.getReadName()) ) - break; - addRead(readIter.next()); - } - } - } - - /** - * Creates an iterator over reads stored in this shard's read cache. - * @return - */ - public GATKSAMIterator iterator() { - return GATKSAMIteratorAdapter.adapt(reads.iterator()); - } - - /** - * String representation of this shard. - * @return A string representation of the boundaries of this shard. - */ - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - for(Map.Entry entry: getFileSpans().entrySet()) { - sb.append(entry.getKey()); - sb.append(": "); - sb.append(entry.getValue()); - sb.append(' '); - } - return sb.toString(); - } - - /** - * Get the full span from the start of the left most read to the end of the right most one - * - * Note this may be different than the getLocation() of the shard, as this reflects the - * targeted span, not the actual span of reads - * - * @return the genome loc representing the span of these reads on the genome - */ - public GenomeLoc getReadsSpan() { - if ( isUnmapped() || super.getGenomeLocs() == null || reads.isEmpty() ) - return super.getLocation(); - else { - int start = Integer.MAX_VALUE; - int stop = Integer.MIN_VALUE; - String contig = null; - boolean foundMapped = false; - - for ( final SAMRecord read : reads ) { - if ( contig != null && ! read.getReferenceName().equals(contig) ) - throw new ReviewedGATKException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. 
" - + "First contig is " + contig + " next read was " + read.getReferenceName() ); - contig = read.getReferenceName(); - - // Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates - // of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries, - // this shard might consist *only* of unmapped mates! We need to refrain from using the alignment - // starts/stops of these unmapped mates, and detect the case where the shard has been filled *only* - // with unmapped mates. - if ( ! read.getReadUnmappedFlag() ) { - foundMapped = true; - if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart(); - if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd(); - } - } - - assert contig != null; - - if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped - return GenomeLoc.UNMAPPED; - else - return parser.createGenomeLoc(contig, start, stop); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java deleted file mode 100644 index 0fc06fcce..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSource.java +++ /dev/null @@ -1,1179 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all 
copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.MergingSamRecordIterator; -import htsjdk.samtools.SamFileHeaderMerger; -import htsjdk.samtools.*; -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.samtools.util.RuntimeIOException; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.ReadMetrics; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.downsampling.*; -import org.broadinstitute.gatk.engine.filters.CountingFilteringIterator; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.iterators.*; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.SimpleTimer; -import org.broadinstitute.gatk.utils.baq.ReadTransformingIterator; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSamRecordFactory; - -import java.io.File; -import java.lang.reflect.InvocationTargetException; -import 
java.lang.reflect.Method; -import java.util.*; -import java.util.concurrent.Callable; - -/** - * User: aaron - * Date: Mar 26, 2009 - * Time: 2:36:16 PM - *

- * Converts shards to SAM iterators over the specified region - */ -public class SAMDataSource { - final private static GATKSamRecordFactory factory = new GATKSamRecordFactory(); - - /** Backing support for reads. */ - protected final ReadProperties readProperties; - - /** - * Runtime metrics of reads filtered, etc. - */ - private final ReadMetrics readMetrics; - - /** - * Tools for parsing GenomeLocs, for verifying BAM ordering against general ordering. - */ - protected final GenomeLocParser genomeLocParser; - - /** - * Identifiers for the readers driving this data source. - */ - private final Collection readerIDs; - - /** - * How strict are the readers driving this data source. - */ - private final ValidationStringency validationStringency; - - /** - * Do we want to remove the program records from this data source? - */ - private final boolean removeProgramRecords; - - /** - * Store BAM indices for each reader present. - */ - private final Map bamIndices = new HashMap(); - - /** - * The merged header. - */ - private final SAMFileHeader mergedHeader; - - /** - * The constituent headers of the unmerged files. - */ - private final Map headers = new HashMap(); - - /** - * The sort order of the BAM files. Files without a sort order tag are assumed to be - * in coordinate order. - */ - private SAMFileHeader.SortOrder sortOrder = null; - - /** - * Whether the read groups in overlapping files collide. - */ - private final boolean hasReadGroupCollisions; - - /** - * Maps the SAM readers' merged read group ids to their original ids. Since merged read group ids - * are always unique, we can simply use a map here, no need to stratify by reader. - */ - private final ReadGroupMapping mergedToOriginalReadGroupMappings = new ReadGroupMapping(); - - /** - * Maps the SAM readers' original read group ids to their revised ids. 
This mapping must be stratified - * by readers, since there can be readgroup id collision: different bam files (readers) can list the - * same read group id, which will be disambiguated when these input streams are merged. - */ - private final Map originalToMergedReadGroupMappings = new HashMap(); - - /** - * Mapping from input file path to new sample name. Used only when doing on-the-fly sample renaming. - */ - private Map sampleRenameMap = null; - - /** our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(SAMDataSource.class); - - /** - * A collection of readers driving the merging process. - */ - private final SAMResourcePool resourcePool; - - /** - * Asynchronously loads BGZF blocks. - */ - private final BGZFBlockLoadingDispatcher dispatcher; - - /** - * How are threads allocated. - */ - private final ThreadAllocation threadAllocation; - - /** - * How are adjacent intervals merged by the sharder? - */ - private final IntervalMergingRule intervalMergingRule; - - /** - * Static set of unsupported programs that create bam files. - * The key is the PG record ID and the value is the name of the tool that created it - */ - private static Map unsupportedPGs = new HashMap<>(); - static { - unsupportedPGs.put("GATK ReduceReads", "ReduceReads"); - } - - /** - * Create a new SAM data source given the supplied read metadata. - * - * For testing purposes - * - * @param samFiles list of reads files. - */ - public SAMDataSource(Collection samFiles, ThreadAllocation threadAllocation, Integer numFileHandles, GenomeLocParser genomeLocParser) { - this( - samFiles, - threadAllocation, - numFileHandles, - genomeLocParser, - false, - ValidationStringency.STRICT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - false); - } - - /** - * See complete constructor. Does not enable BAQ by default. 
- * - * For testing purposes - */ - public SAMDataSource( - Collection samFiles, - ThreadAllocation threadAllocation, - Integer numFileHandles, - GenomeLocParser genomeLocParser, - boolean useOriginalBaseQualities, - ValidationStringency strictness, - Integer readBufferSize, - DownsamplingMethod downsamplingMethod, - ValidationExclusion exclusionList, - Collection supplementalFilters, - boolean includeReadsWithDeletionAtLoci) { - this( samFiles, - threadAllocation, - numFileHandles, - genomeLocParser, - useOriginalBaseQualities, - strictness, - readBufferSize, - downsamplingMethod, - exclusionList, - supplementalFilters, - Collections.emptyList(), - includeReadsWithDeletionAtLoci, - (byte) -1, - false, - false, - null, - IntervalMergingRule.ALL); - } - - /** - * Create a new SAM data source given the supplied read metadata. - * @param samFiles list of reads files. - * @param useOriginalBaseQualities True if original base qualities should be used. - * @param strictness Stringency of reads file parsing. - * @param readBufferSize Number of reads to hold in memory per BAM. - * @param downsamplingMethod Method for downsampling reads at a given locus. - * @param exclusionList what safety checks we're willing to let slide - * @param supplementalFilters additional filters to dynamically apply. - * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method - * will explicitly list reads with deletion over the current reference base; otherwise, only observed - * bases will be seen in the pileups, and the deletions will be skipped silently. - * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. - * @param keepReadsInLIBS should we keep a unique list of reads in LIBS? - * @param sampleRenameMap Map of BAM file to new sample ID used during on-the-fly runtime sample renaming. - * Will be null if we're not doing sample renaming. 
- * @param intervalMergingRule how are adjacent intervals merged by the sharder - */ - public SAMDataSource( - Collection samFiles, - ThreadAllocation threadAllocation, - Integer numFileHandles, - GenomeLocParser genomeLocParser, - boolean useOriginalBaseQualities, - ValidationStringency strictness, - Integer readBufferSize, - DownsamplingMethod downsamplingMethod, - ValidationExclusion exclusionList, - Collection supplementalFilters, - List readTransformers, - boolean includeReadsWithDeletionAtLoci, - byte defaultBaseQualities, - boolean removeProgramRecords, - final boolean keepReadsInLIBS, - final Map sampleRenameMap, - final IntervalMergingRule intervalMergingRule) { - - this.readMetrics = new ReadMetrics(); - this.genomeLocParser = genomeLocParser; - this.intervalMergingRule = intervalMergingRule; - - readerIDs = samFiles; - - this.threadAllocation = threadAllocation; - // TODO: Consider a borrowed-thread dispatcher implementation. - if(this.threadAllocation.getNumIOThreads() > 0) { - logger.info("Running in asynchronous I/O mode; number of threads = " + this.threadAllocation.getNumIOThreads()); - dispatcher = new BGZFBlockLoadingDispatcher(this.threadAllocation.getNumIOThreads(), numFileHandles != null ? numFileHandles : 1); - } - else - dispatcher = null; - - validationStringency = strictness; - this.removeProgramRecords = removeProgramRecords; - if(readBufferSize != null) - ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests - else { - // Choose a sensible default for the read buffer size. - // Previously we we're picked 100000 reads per BAM per shard with a max cap of 250K reads in memory at once. 
- // Now we are simply setting it to 100K reads - ReadShard.setReadBufferSize(100000); - } - - this.sampleRenameMap = sampleRenameMap; - - resourcePool = new SAMResourcePool(Integer.MAX_VALUE); - SAMReaders readers = resourcePool.getAvailableReaders(); - - // Determine the sort order. - for(SAMReaderID readerID: readerIDs) { - if (! readerID.samFile.canRead() ) - throw new UserException.CouldNotReadInputFile(readerID.samFile,"file is not present or user does not have appropriate permissions. " + - "Please check that the file is present and readable and try again."); - - // Get the sort order, forcing it to coordinate if unsorted. - SAMFileReader reader = readers.getReader(readerID); - SAMFileHeader header = reader.getFileHeader(); - - headers.put(readerID,header); - - if ( header.getReadGroups().isEmpty() ) { - throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile, - "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups"); - } - - SAMFileHeader.SortOrder sortOrder = header.getSortOrder() != SAMFileHeader.SortOrder.unsorted ? header.getSortOrder() : SAMFileHeader.SortOrder.coordinate; - - // Validate that all input files are sorted in the same order. - if(this.sortOrder != null && this.sortOrder != sortOrder) - throw new UserException.MissortedBAM(String.format("Attempted to process mixed of files sorted as %s and %s.",this.sortOrder,sortOrder)); - - // Update the sort order. 
- this.sortOrder = sortOrder; - } - - mergedHeader = readers.getMergedHeader(); - hasReadGroupCollisions = readers.hasReadGroupCollisions(); - - readProperties = new ReadProperties( - samFiles, - mergedHeader, - sortOrder, - useOriginalBaseQualities, - strictness, - downsamplingMethod, - exclusionList, - supplementalFilters, - readTransformers, - includeReadsWithDeletionAtLoci, - defaultBaseQualities, - keepReadsInLIBS); - - // cache the read group id (original) -> read group id (merged) - // and read group id (merged) -> read group id (original) mappings. - for(SAMReaderID id: readerIDs) { - SAMFileReader reader = readers.getReader(id); - - ReadGroupMapping mappingToMerged = new ReadGroupMapping(); - - List readGroups = reader.getFileHeader().getReadGroups(); - for(SAMReadGroupRecord readGroup: readGroups) { - if(hasReadGroupCollisions) { - mappingToMerged.put(readGroup.getReadGroupId(),readers.getReadGroupId(id,readGroup.getReadGroupId())); - mergedToOriginalReadGroupMappings.put(readers.getReadGroupId(id,readGroup.getReadGroupId()),readGroup.getReadGroupId()); - } else { - mappingToMerged.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); - mergedToOriginalReadGroupMappings.put(readGroup.getReadGroupId(),readGroup.getReadGroupId()); - } - } - - originalToMergedReadGroupMappings.put(id,mappingToMerged); - } - - for(SAMReaderID id: readerIDs) { - File indexFile = findIndexFile(id.samFile); - if(indexFile != null) - bamIndices.put(id,new GATKBAMIndex(indexFile)); - } - - resourcePool.releaseReaders(readers); - } - - /** - * Checks whether the provided SAM header if from a reduced bam file. 
- * @param header the SAM header for a given file - * @throws UserException if the header is from a reduced bam - */ - private void checkForUnsupportedBamFile(final SAMFileHeader header) { - for ( final SAMProgramRecord PGrecord : header.getProgramRecords() ) { - if ( unsupportedPGs.containsKey(PGrecord.getId()) ) - throw new UserException("The GATK no longer supports running off of BAMs produced by " + unsupportedPGs.get(PGrecord.getId())); - } - } - - public void close() { - SAMReaders readers = resourcePool.getAvailableReaders(); - for(SAMReaderID readerID: readerIDs) { - SAMFileReader reader = readers.getReader(readerID); - reader.close(); - } - } - - /** - * Returns Reads data structure containing information about the reads data sources placed in this pool as well as - * information about how they are downsampled, sorted, and filtered - * @return - */ - public ReadProperties getReadsInfo() { return readProperties; } - - /** - * Checks to see whether any reads files are supplying data. - * @return True if no reads files are supplying data to the traversal; false otherwise. - */ - public boolean isEmpty() { - return readProperties.getSAMReaderIDs().size() == 0; - } - - /** - * Gets the SAM file associated with a given reader ID. - * @param id The reader for which to retrieve the source file. - * @return the file actually associated with the id. - */ - public File getSAMFile(SAMReaderID id) { - return id.samFile; - } - - /** - * Returns readers used by this data source. - * @return A list of SAM reader IDs. - */ - public Collection getReaderIDs() { - return readerIDs; - } - - /** - * Retrieves the id of the reader which built the given read. - * @param read The read to test. - * @return ID of the reader. - */ - public SAMReaderID getReaderID(SAMRecord read) { - return resourcePool.getReaderID(read.getFileSource().getReader()); - } - - /** - * Gets the merged header from the SAM file. - * @return The merged header. 
- */ - public SAMFileHeader getHeader() { - return mergedHeader; - } - - public SAMFileHeader getHeader(SAMReaderID id) { - return headers.get(id); - } - - /** - * Gets the revised read group id mapped to this 'original' read group id. - * @param reader for which to grab a read group. - * @param originalReadGroupId ID of the original read group. - * @return Merged read group ID. - */ - public String getReadGroupId(final SAMReaderID reader, final String originalReadGroupId) { - return originalToMergedReadGroupMappings.get(reader).get(originalReadGroupId); - } - - /** - * Gets the original read group id (as it was specified in the original input bam file) that maps onto - * this 'merged' read group id. - * @param mergedReadGroupId 'merged' ID of the read group (as it is presented by the read received from merged input stream). - * @return Merged read group ID. - */ - public String getOriginalReadGroupId(final String mergedReadGroupId) { - return mergedToOriginalReadGroupMappings.get(mergedReadGroupId); - } - - /** - * True if all readers have an index. - * @return True if all readers have an index. - */ - public boolean hasIndex() { - return readerIDs.size() == bamIndices.size(); - } - - /** - * Gets the index for a particular reader. Always preloaded. - * @param id Id of the reader. - * @return The index. Will preload the index if necessary. - */ - public GATKBAMIndex getIndex(final SAMReaderID id) { - return bamIndices.get(id); - } - - /** - * Retrieves the sort order of the readers. - * @return Sort order. Can be unsorted, coordinate order, or query name order. - */ - public SAMFileHeader.SortOrder getSortOrder() { - return sortOrder; - } - - /** - * Gets the cumulative read metrics for shards already processed. - * @return Cumulative read metrics. 
- */ - public ReadMetrics getCumulativeReadMetrics() { - // don't return a clone here because the engine uses a pointer to this object - return readMetrics; - } - - /** - * Incorporate the given read metrics into the cumulative read metrics. - * @param readMetrics The 'incremental' read metrics, to be incorporated into the cumulative metrics. - */ - public void incorporateReadMetrics(final ReadMetrics readMetrics) { - this.readMetrics.incrementMetrics(readMetrics); - } - - public GATKSAMIterator seek(Shard shard) { - if(shard.buffersReads()) { - return shard.iterator(); - } - else { - return getIterator(shard); - } - } - - /** - * Gets the reader associated with the given read. - * @param readers Available readers. - * @param read - * @return - */ - private SAMReaderID getReaderID(SAMReaders readers, SAMRecord read) { - for(SAMReaderID id: getReaderIDs()) { - if(readers.getReader(id) == read.getFileSource().getReader()) - return id; - } - throw new ReviewedGATKException("Unable to find id for reader associated with read " + read.getReadName()); - } - - /** - * Get the initial reader positions across all BAM files - * - * @return the start positions of the first chunk of reads for all BAM files - */ - protected Map getInitialReaderPositions() { - Map initialPositions = new HashMap(); - SAMReaders readers = resourcePool.getAvailableReaders(); - - for ( SAMReaderID id: getReaderIDs() ) { - initialPositions.put(id, new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads())); - } - - resourcePool.releaseReaders(readers); - return initialPositions; - } - - /** - * Get an iterator over the data types specified in the shard. - * - * @param shard The shard specifying the data limits. - * @return An iterator over the selected data. - */ - protected GATKSAMIterator getIterator( Shard shard ) { - return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard); - } - - /** - * Get an iterator over the data types specified in the shard. 
- * @param readers Readers from which to load data. - * @param shard The shard specifying the data limits. - * @param enableVerification True to verify. For compatibility with old sharding strategy. - * @return An iterator over the selected data. - */ - private GATKSAMIterator getIterator(SAMReaders readers, Shard shard, boolean enableVerification) { - // Set up merging to dynamically merge together multiple BAMs. - Map> iteratorMap = new HashMap>(); - - for(SAMReaderID id: getReaderIDs()) { - CloseableIterator iterator = null; - - // TODO: null used to be the signal for unmapped, but we've replaced that with a simple index query for the last bin. - // TODO: Kill this check once we've proven that the design elements are gone. - if(shard.getFileSpans().get(id) == null) - throw new ReviewedGATKException("SAMDataSource: received null location for reader " + id + ", but null locations are no longer supported."); - - try { - if(threadAllocation.getNumIOThreads() > 0) { - BlockInputStream inputStream = readers.getInputStream(id); - inputStream.submitAccessPlan(new BAMAccessPlan(id, inputStream, (GATKBAMFileSpan) shard.getFileSpans().get(id))); - BAMRecordCodec codec = new BAMRecordCodec(getHeader(id),factory); - codec.setInputStream(inputStream); - iterator = new BAMCodecIterator(inputStream,readers.getReader(id),codec); - } - else { - iterator = readers.getReader(id).iterator(shard.getFileSpans().get(id)); - } - } catch ( RuntimeException e ) { // we need to catch RuntimeExceptions here because the Picard code is throwing them (among SAMFormatExceptions) sometimes - throw new UserException.MalformedBAM(id.samFile, e.getMessage()); - } - - iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator); - if(shard.getGenomeLocs().size() > 0) - iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); - - iteratorMap.put(readers.getReader(id), iterator); - } - - MergingSamRecordIterator mergingIterator = 
readers.createMergingIterator(iteratorMap); - - // The readMetrics object being passed in should be that of this dataSource and NOT the shard: the dataSource's - // metrics is intended to keep track of the reads seen (and hence passed to the CountingFilteringIterator when - // we apply the decorators), whereas the shard's metrics is used to keep track the "records" seen. - return applyDecoratingIterators(readMetrics, - enableVerification, - readProperties.useOriginalBaseQualities(), - new ReleasingIterator(readers,GATKSAMIteratorAdapter.adapt(mergingIterator)), - readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), - readProperties.getSupplementalFilters(), - readProperties.getReadTransformers(), - readProperties.defaultBaseQualities(), - shard instanceof LocusShard); - } - - private class BAMCodecIterator implements CloseableIterator { - private final BlockInputStream inputStream; - private final SAMFileReader reader; - private final BAMRecordCodec codec; - private SAMRecord nextRead; - - private BAMCodecIterator(final BlockInputStream inputStream, final SAMFileReader reader, final BAMRecordCodec codec) { - this.inputStream = inputStream; - this.reader = reader; - this.codec = codec; - advance(); - } - - public boolean hasNext() { - return nextRead != null; - } - - public SAMRecord next() { - if(!hasNext()) - throw new NoSuchElementException("Unable to retrieve next record from BAMCodecIterator; input stream is empty"); - SAMRecord currentRead = nextRead; - advance(); - return currentRead; - } - - public void close() { - // NO-OP. 
- } - - public void remove() { - throw new UnsupportedOperationException("Unable to remove from BAMCodecIterator"); - } - - private void advance() { - final long startCoordinate = inputStream.getFilePointer(); - nextRead = codec.decode(); - final long stopCoordinate = inputStream.getFilePointer(); - - if(reader != null && nextRead != null) - PicardNamespaceUtils.setFileSource(nextRead, new SAMFileSource(reader, new GATKBAMFileSpan(new GATKChunk(startCoordinate, stopCoordinate)))); - } - } - - /** - * Filter reads based on user-specified criteria. - * - * @param readMetrics metrics to track when using this iterator. - * @param enableVerification Verify the order of reads. - * @param useOriginalBaseQualities True if original base qualities should be used. - * @param wrappedIterator the raw data source. - * @param noValidationOfReadOrder Another trigger for the verifying iterator? TODO: look into this. - * @param supplementalFilters additional filters to apply to the reads. - * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. - * @param isLocusBasedTraversal true if we're dealing with a read stream from a LocusShard - * @return An iterator wrapped with filters reflecting the passed-in parameters. Will not be null. - */ - protected GATKSAMIterator applyDecoratingIterators(ReadMetrics readMetrics, - boolean enableVerification, - boolean useOriginalBaseQualities, - GATKSAMIterator wrappedIterator, - Boolean noValidationOfReadOrder, - Collection supplementalFilters, - List readTransformers, - byte defaultBaseQualities, - boolean isLocusBasedTraversal ) { - - // Always apply the ReadFormattingIterator before both ReadFilters and ReadTransformers. At a minimum, - // this will consolidate the cigar strings into canonical form. This has to be done before the read - // filtering, because not all read filters will behave correctly with things like zero-length cigar - // elements. 
If useOriginalBaseQualities is true or defaultBaseQualities >= 0, this iterator will also - // modify the base qualities. - wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); - - // Read Filters: these are applied BEFORE downsampling, so that we downsample within the set of reads - // that actually survive filtering. Otherwise we could get much less coverage than requested. - wrappedIterator = GATKSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); - - // Downsampling: - - // For locus traversals where we're downsampling to coverage by sample, assume that the downsamplers - // will be invoked downstream from us in LocusIteratorByState. This improves performance by avoiding - // splitting/re-assembly of the read stream at this stage, and also allows for partial downsampling - // of individual reads. - boolean assumeDownstreamLIBSDownsampling = isLocusBasedTraversal && - readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && - readProperties.getDownsamplingMethod().toCoverage != null; - - // Apply downsampling iterators here only in cases where we know that LocusIteratorByState won't be - // doing any downsampling downstream of us - if ( ! assumeDownstreamLIBSDownsampling ) { - wrappedIterator = applyDownsamplingIterator(wrappedIterator); - } - - // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, - // verify the read ordering by applying a sort order iterator - if (!noValidationOfReadOrder && enableVerification) - wrappedIterator = new VerifyingSamIterator(wrappedIterator); - - // Read transformers: these are applied last, so that we don't bother transforming reads that get discarded - // by the read filters or downsampler. 
- for ( final ReadTransformer readTransformer : readTransformers ) { - if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT ) - wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer); - } - - return wrappedIterator; - } - - protected GATKSAMIterator applyDownsamplingIterator( GATKSAMIterator wrappedIterator ) { - if ( readProperties.getDownsamplingMethod() == null || - readProperties.getDownsamplingMethod().type == DownsampleType.NONE ) { - return wrappedIterator; - } - - if ( readProperties.getDownsamplingMethod().toFraction != null ) { - - // If we're downsampling to a fraction of reads, there's no point in paying the cost of - // splitting/re-assembling the read stream by sample to run the FractionalDownsampler on - // reads from each sample separately, since the result would be the same as running the - // FractionalDownsampler on the entire stream. So, ALWAYS use the DownsamplingReadsIterator - // rather than the PerSampleDownsamplingReadsIterator, even if BY_SAMPLE downsampling - // was requested. - - return new DownsamplingReadsIterator(wrappedIterator, - new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction)); - } - else if ( readProperties.getDownsamplingMethod().toCoverage != null ) { - - // If we're downsampling to coverage, we DO need to pay the cost of splitting/re-assembling - // the read stream to run the downsampler on the reads for each individual sample separately if - // BY_SAMPLE downsampling was requested. 
- - if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { - return new PerSampleDownsamplingReadsIterator(wrappedIterator, - new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage)); - } - else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { - return new DownsamplingReadsIterator(wrappedIterator, - new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage)); - } - } - - return wrappedIterator; - } - - - private class SAMResourcePool { - /** - * How many entries can be cached in this resource pool? - */ - private final int maxEntries; - - /** - * All iterators of this reference-ordered data. - */ - private List allResources = new ArrayList(); - - /** - * All iterators that are not currently in service. - */ - private List availableResources = new ArrayList(); - - public SAMResourcePool(final int maxEntries) { - this.maxEntries = maxEntries; - } - - /** - * Choose a set of readers from the pool to use for this query. When complete, - * @return - */ - public synchronized SAMReaders getAvailableReaders() { - if(availableResources.size() == 0) - createNewResource(); - SAMReaders readers = availableResources.get(0); - availableResources.remove(readers); - return readers; - } - - public synchronized void releaseReaders(SAMReaders readers) { - if(!allResources.contains(readers)) - throw new ReviewedGATKException("Tried to return readers from the pool that didn't originate in the pool."); - availableResources.add(readers); - } - - /** - * Gets the reader id for the given reader. - * @param reader Reader for which to determine the id. - * @return id of the given reader. 
- */ - protected synchronized SAMReaderID getReaderID(SamReader reader) { - for(SAMReaders readers: allResources) { - SAMReaderID id = readers.getReaderID(reader); - if(id != null) - return id; - } - throw new ReviewedGATKException("No such reader id is available"); - } - - private synchronized void createNewResource() { - if(allResources.size() > maxEntries) - throw new ReviewedGATKException("Cannot create a new resource pool. All resources are in use."); - SAMReaders readers = new SAMReaders(readerIDs, validationStringency, removeProgramRecords); - allResources.add(readers); - availableResources.add(readers); - } - - } - - /** - * A collection of readers derived from a reads metadata structure. - */ - private class SAMReaders implements Iterable { - /** - * Cached representation of the merged header used to generate a merging iterator. - */ - private final SamFileHeaderMerger headerMerger; - - /** - * Internal storage for a map of id -> reader. - */ - private final Map readers = new LinkedHashMap(); - - /** - * The inptu streams backing - */ - private final Map inputStreams = new LinkedHashMap(); - - /** - * Derive a new set of readers from the Reads metadata. - * @param readerIDs reads to load. - * TODO: validationStringency is not used here - * @param validationStringency validation stringency. 
- * @param removeProgramRecords indicate whether to clear program records from the readers - */ - public SAMReaders(Collection readerIDs, ValidationStringency validationStringency, boolean removeProgramRecords) { - final int totalNumberOfFiles = readerIDs.size(); - int readerNumber = 1; - final SimpleTimer timer = new SimpleTimer().start(); - - if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords in serial"); - final int tickSize = 50; - int nExecutedTotal = 0; - long lastTick = timer.currentTime(); - for(final SAMReaderID readerID: readerIDs) { - final ReaderInitializer init = new ReaderInitializer(readerID).call(); - - checkForUnsupportedBamFile(init.reader.getFileHeader()); - - if (removeProgramRecords) { - init.reader.getFileHeader().setProgramRecords(new ArrayList()); - } - - if (threadAllocation.getNumIOThreads() > 0) { - inputStreams.put(init.readerID, init.blockInputStream); // get from initializer - } - - logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile)); - readers.put(init.readerID,init.reader); - if ( ++nExecutedTotal % tickSize == 0) { - double tickInSec = (timer.currentTime() - lastTick) / 1000.0; - printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec); - lastTick = timer.currentTime(); - } - } - - if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); - - Collection headers = new LinkedList(); - - // Examine the bam headers, perform any requested sample renaming on them, and add - // them to the list of headers to pass to the Picard SamFileHeaderMerger: - for ( final Map.Entry readerEntry : readers.entrySet() ) { - final SAMReaderID readerID = readerEntry.getKey(); - final SAMFileReader reader = readerEntry.getValue(); - final SAMFileHeader header = reader.getFileHeader(); - - // The remappedSampleName will be null if either no on-the-fly sample renaming was 
requested, - // or the user's sample rename map file didn't contain an entry for this bam file: - final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(readerID.getSamFilePath()) : null; - - // If we've been asked to rename the sample for this bam file, do so now. We'll check to - // make sure this bam only contains reads from one sample before proceeding. - // - // IMPORTANT: relies on the fact that the Picard SamFileHeaderMerger makes a copy of - // the existing read group attributes (including sample name) when merging - // headers, regardless of whether there are read group collisions or not. - if ( remappedSampleName != null ) { - remapSampleName(readerID, header, remappedSampleName); - } - - headers.add(header); - } - - headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true); - - // update all read groups to GATKSAMRecordReadGroups - final List gatkReadGroups = new LinkedList(); - for ( final SAMReadGroupRecord rg : headerMerger.getMergedHeader().getReadGroups() ) { - gatkReadGroups.add(new GATKSAMReadGroupRecord(rg)); - } - headerMerger.getMergedHeader().setReadGroups(gatkReadGroups); - } - - /** - * Changes the sample name in the read groups for the provided bam file header to match the - * remappedSampleName. Blows up with a UserException if the header contains more than one - * sample name. - * - * @param readerID ID for the bam file from which the provided header came from - * @param header The bam file header. Will be modified by this call. - * @param remappedSampleName New sample name to replace the existing sample attribute in the - * read groups for the header. 
- */ - private void remapSampleName( final SAMReaderID readerID, final SAMFileHeader header, final String remappedSampleName ) { - String firstEncounteredSample = null; - - for ( final SAMReadGroupRecord readGroup : header.getReadGroups() ) { - final String thisReadGroupSample = readGroup.getSample(); - - if ( thisReadGroupSample == null ) { - throw new UserException(String.format("On-the fly sample renaming was requested for bam file %s, however this " + - "bam file contains a read group (id: %s) with a null sample attribute", - readerID.getSamFilePath(), readGroup.getId())); - } - else if ( firstEncounteredSample == null ) { - firstEncounteredSample = thisReadGroupSample; - } - else if ( ! firstEncounteredSample.equals(thisReadGroupSample) ) { - throw new UserException(String.format("On-the-fly sample renaming was requested for bam file %s, " + - "however this bam file contains reads from more than one sample " + - "(encountered samples %s and %s in the bam header). The GATK requires that " + - "all bams for which on-the-fly sample renaming is requested " + - "contain reads from only a single sample per bam.", - readerID.getSamFilePath(), firstEncounteredSample, thisReadGroupSample)); - } - - readGroup.setSample(remappedSampleName); - } - } - - final private void printReaderPerformance(final int nExecutedTotal, - final int nExecutedInTick, - final int totalNumberOfFiles, - final SimpleTimer timer, - final double tickDurationInSec) { - final int pendingSize = totalNumberOfFiles - nExecutedTotal; - final double totalTimeInSeconds = timer.getElapsedTime(); - final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds); - final int nRemaining = pendingSize; - final double estTimeToComplete = pendingSize / nTasksPerSecond; - logger.info(String.format("Init %d BAMs in last %.2f s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. 
completion in %.2f s / %.2f m", - nExecutedInTick, tickDurationInSec, - nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond, - nRemaining, estTimeToComplete, estTimeToComplete / 60)); - } - - /** - * Return the header derived from the merging of these BAM files. - * @return the merged header. - */ - public SAMFileHeader getMergedHeader() { - return headerMerger.getMergedHeader(); - } - - /** - * Do multiple read groups collide in this dataset? - * @return True if multiple read groups collide; false otherwis. - */ - public boolean hasReadGroupCollisions() { - return headerMerger.hasReadGroupCollisions(); - } - - /** - * Get the newly mapped read group ID for the given read group. - * @param readerID Reader for which to discern the transformed ID. - * @param originalReadGroupID Original read group. - * @return Remapped read group. - */ - public String getReadGroupId(final SAMReaderID readerID, final String originalReadGroupID) { - SAMFileHeader header = readers.get(readerID).getFileHeader(); - return headerMerger.getReadGroupId(header,originalReadGroupID); - } - - /** - * Creates a new merging iterator from the given map, with the given header. - * @param iteratorMap A map of readers to iterators. - * @return An iterator which will merge those individual iterators. - */ - public MergingSamRecordIterator createMergingIterator(final Map> iteratorMap) { - return new MergingSamRecordIterator(headerMerger,iteratorMap,true); - } - - /** - * Retrieve the reader from the data structure. - * @param id The ID of the reader to retrieve. - * @return the reader associated with the given id. - */ - public SAMFileReader getReader(SAMReaderID id) { - if(!readers.containsKey(id)) - throw new NoSuchElementException("No reader is associated with id " + id); - return readers.get(id); - } - - /** - * Retrieve the input stream backing a reader. - * @param id The ID of the reader to retrieve. - * @return the reader associated with the given id. 
- */ - public BlockInputStream getInputStream(final SAMReaderID id) { - return inputStreams.get(id); - } - - /** - * Searches for the reader id of this reader. - * @param reader Reader for which to search. - * @return The id associated the given reader, or null if the reader is not present in this collection. - */ - protected SAMReaderID getReaderID(SamReader reader) { - for(Map.Entry entry: readers.entrySet()) { - if(reader == entry.getValue()) - return entry.getKey(); - } - // Not found? return null. - return null; - } - - /** - * Returns an iterator over all readers in this structure. - * @return An iterator over readers. - */ - public Iterator iterator() { - return readers.values().iterator(); - } - - /** - * Returns whether any readers are present in this structure. - * @return - */ - public boolean isEmpty() { - return readers.isEmpty(); - } - } - - class ReaderInitializer implements Callable { - final SAMReaderID readerID; - BlockInputStream blockInputStream = null; - SAMFileReader reader; - - public ReaderInitializer(final SAMReaderID readerID) { - this.readerID = readerID; - } - - public ReaderInitializer call() { - final File indexFile = findIndexFile(readerID.samFile); - try { - if (threadAllocation.getNumIOThreads() > 0) - blockInputStream = new BlockInputStream(dispatcher,readerID,false); - reader = new SAMFileReader(readerID.samFile,indexFile,false); - } catch ( RuntimeIOException e ) { - throw new UserException.CouldNotReadInputFile(readerID.samFile, e); - } catch ( SAMFormatException e ) { - throw new UserException.MalformedBAM(readerID.samFile, e.getMessage()); - } - // Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files). - // Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case, - // just in case we want to change this behavior later. 
- catch ( RuntimeException e ) { - throw new UserException.MalformedBAM(readerID.samFile, e.getMessage()); - } - reader.setSAMRecordFactory(factory); - reader.enableFileSource(true); - reader.setValidationStringency(validationStringency); - return this; - } - } - - private class ReleasingIterator implements GATKSAMIterator { - /** - * The resource acting as the source of the data. - */ - private final SAMReaders resource; - - /** - * The iterator to wrap. - */ - private final GATKSAMIterator wrappedIterator; - - public ReleasingIterator(SAMReaders resource, GATKSAMIterator wrapped) { - this.resource = resource; - this.wrappedIterator = wrapped; - } - - public ReleasingIterator iterator() { - return this; - } - - public void remove() { - throw new UnsupportedOperationException("Can't remove from a GATKSAMIterator"); - } - - public void close() { - wrappedIterator.close(); - resourcePool.releaseReaders(resource); - } - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - public SAMRecord next() { - return wrappedIterator.next(); - } - } - - /** - * Maps read groups in the original SAMFileReaders to read groups in - */ - private class ReadGroupMapping extends HashMap {} - - /** - * Locates the index file alongside the given BAM, if present. - * @param bamFile The data file to use. - * @return A File object if the index file is present; null otherwise. - */ - private File findIndexFile(File bamFile) { - return SamFiles.findIndex(bamFile); - } - - /** - * Creates a BAM schedule over all reads in the BAM file, both mapped and unmapped. The outgoing stream - * will be as granular as possible given our current knowledge of the best ways to split up BAM files. - * @return An iterator that spans all reads in all BAM files. 
- */ - public Iterable createShardIteratorOverAllReads(final ShardBalancer shardBalancer) { - shardBalancer.initialize(this,IntervalSharder.shardOverAllReads(this,genomeLocParser),genomeLocParser); - return shardBalancer; - } - - /** - * Creates a BAM schedule over all mapped reads in the BAM file, when a 'mapped' read is defined as any - * read that has been assigned - * - * @param shardBalancer shard balancer object - * @return non-null initialized version of the shard balancer - */ - public Iterable createShardIteratorOverMappedReads(final ShardBalancer shardBalancer) { - shardBalancer.initialize(this,IntervalSharder.shardOverMappedReads(this,genomeLocParser),genomeLocParser); - return shardBalancer; - } - - /** - * Create a schedule for processing the initialized BAM file using the given interval list. - * The returned schedule should be as granular as possible. - * @param intervals The list of intervals for which to create the schedule. - * @return A granular iterator over file pointers. 
- */ - public Iterable createShardIteratorOverIntervals(final GenomeLocSortedSet intervals,final ShardBalancer shardBalancer) { - if(intervals == null) - throw new ReviewedGATKException("Unable to create schedule from intervals; no intervals were provided."); - shardBalancer.initialize(this,IntervalSharder.shardOverIntervals(SAMDataSource.this,intervals,intervalMergingRule),genomeLocParser); - return shardBalancer; - } -} - - - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderID.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderID.java deleted file mode 100644 index ef5aaa040..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderID.java +++ /dev/null @@ -1,125 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import org.broadinstitute.gatk.utils.commandline.Tags; - -import java.io.File; - -/** - * Uniquely identifies a SAM file reader. - * - * @author mhanna - * @version 0.1 - */ -public class SAMReaderID implements Comparable { - /** - * The SAM file at the heart of this reader. SAMReaderID - * currently supports only file-based readers. - */ - protected final File samFile; - - /** - * A list of tags associated with this BAM file. - */ - protected final Tags tags; - - /** - * Creates an identifier for a SAM file based on read. - * @param samFile The source file for SAM data. - * @param tags tags to use when creating a reader ID. - */ - public SAMReaderID(File samFile, Tags tags) { - this.samFile = samFile; - this.tags = tags; - } - - /** - * Creates an identifier for a SAM file based on read. - * @param samFileName The source filename for SAM data. - * @param tags tags to use when creating a reader ID. - */ - public SAMReaderID(String samFileName, Tags tags) { - this(new File(samFileName),tags); - } - - /** - * Gets the absolute pathname of this SAM file - * @return The absolute pathname of this reader's SAM file, - * or null if this reader has no associated SAM file - */ - public String getSamFilePath() { - if ( samFile == null ) { - return null; - } - - return samFile.getAbsolutePath(); - } - - /** - * Gets the tags associated with the given BAM file. - * @return A collection of the tags associated with this file. - */ - public Tags getTags() { - return tags; - } - - /** - * Compare two IDs to see whether they're equal. - * @param other The other identifier. - * @return True iff the two readers point to the same file. 
- */ - @Override - public boolean equals(Object other) { - if(other == null) return false; - if(!(other instanceof SAMReaderID)) return false; - - SAMReaderID otherID = (SAMReaderID)other; - return this.getSamFilePath().equals(otherID.getSamFilePath()); - } - - /** - * Generate a hash code for this object. - * @return A hash code, based solely on the file name at this point. - */ - @Override - public int hashCode() { - return samFile.getAbsolutePath().hashCode(); - } - - /** - * Best string representation for a SAM file reader is the path of the source file. - */ - @Override - public String toString() { - return getSamFilePath(); - } - - @Override - public int compareTo(Object other) { - return this.samFile.getAbsolutePath().compareTo(((SAMReaderID)other).samFile.getAbsolutePath()); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java deleted file mode 100644 index cc8944ce3..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/Shard.java +++ /dev/null @@ -1,253 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.SAMFileSpan; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.ReadMetrics; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.HasGenomeLocation; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -/** - * - * User: aaron - * Date: Apr 10, 2009 - * Time: 5:00:27 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - -/** - * @author aaron - * @version 1.0 - * @date Apr 10, 2009 - *

- * Interface Shard - *

- * The base abstract class for shards. - */ -public abstract class Shard implements HasGenomeLocation { - public enum ShardType { - READ, LOCUS - } - - protected final GenomeLocParser parser; // incredibly annoying! - - /** - * What type of shard is this? Read or locus? - */ - protected final ShardType shardType; - - /** - * Locations. - */ - protected final List locs; - - /** - * Whether the current location is unmapped. - */ - private final boolean isUnmapped; - - /** - * Reads data, if applicable. - */ - private final SAMDataSource readsDataSource; - - /** - * The data backing the next chunks to deliver to the traversal engine. - */ - private final Map fileSpans; - - /** - * Lazy-calculated span of all of the genome locs in this shard - */ - private GenomeLoc spanningLocation = null; - - /** - * Statistics about which reads in this shards were used and which were filtered away. - */ - protected final ReadMetrics readMetrics = new ReadMetrics(); - - /** - * Whether this shard points to an unmapped region. - * Some shard types conceptually be unmapped (e.g. LocusShards). In - * this case, isUnmapped should always return false. - * @return True if this shard is unmapped. False otherwise. - */ - public boolean isUnmapped() { - return isUnmapped; - } - - public Shard(GenomeLocParser parser, - ShardType shardType, - List locs, - SAMDataSource readsDataSource, - Map fileSpans, - boolean isUnmapped) { - this.locs = locs; - this.parser = parser; - this.shardType = shardType; - this.readsDataSource = readsDataSource; - this.fileSpans = fileSpans; - this.isUnmapped = isUnmapped; - } - - /** - * If isUnmapped is true, than getGenomeLocs by - * definition will return a singleton list with a GenomeLoc.UNMAPPED - * - * Can return null, indicating that the entire genome is covered. - * - * @return the genome location represented by this shard - */ - public List getGenomeLocs() { - return locs; - } - - /** - * Get the list of chunks delimiting this shard. 
- * @return a list of chunks that contain data for this shard. - */ - public Map getFileSpans() { - return Collections.unmodifiableMap(fileSpans); - } - - /** - * Returns the span of the genomeLocs comprising this shard - * @return a GenomeLoc that starts as the first position in getGenomeLocs() and stops at the stop of the last - * position in getGenomeLocs() - */ - public GenomeLoc getLocation() { - if ( spanningLocation == null ) { - if ( getGenomeLocs() == null ) - spanningLocation = GenomeLoc.WHOLE_GENOME; - else if ( getGenomeLocs().size() == 0 ) { - spanningLocation = getGenomeLocs().get(0); - } else { - int start = Integer.MAX_VALUE; - int stop = Integer.MIN_VALUE; - String contig = null; - - for ( GenomeLoc loc : getGenomeLocs() ) { - if ( GenomeLoc.isUnmapped(loc) ) - // special case the unmapped region marker, just abort out - return loc; - contig = loc.getContig(); - if ( loc.getStart() < start ) start = loc.getStart(); - if ( loc.getStop() > stop ) stop = loc.getStop(); - } - - spanningLocation = parser.createGenomeLoc(contig, start, stop); - } - } - - return spanningLocation; - } - - - /** - * what kind of shard do we return - * @return ShardType, indicating the type - */ - public ShardType getShardType() { - return shardType; - } - - /** - * Does any releasing / aggregation required when the shard is through being processed. - */ - public void close() { - readsDataSource.incorporateReadMetrics(readMetrics); - } - - /** - * Gets key read validation and filtering properties. - * @return set of read properties associated with this shard. - */ - public ReadProperties getReadProperties() { - return readsDataSource.getReadsInfo(); - } - - /** - * Gets the runtime metrics associated with this shard. - * Retrieves a storage space of metrics about number of reads included, filtered, etc. - * @return Storage space for metrics. 
- */ - public ReadMetrics getReadMetrics() { - return readMetrics; - } - - /** - * Returns true if this shard is meant to buffer reads, rather - * than just holding pointers to their locations. - * @return True if this shard can buffer reads. False otherwise. - */ - public boolean buffersReads() { return false; } - - /** - * Returns true if the read buffer is currently full. - * @return True if this shard's buffer is full (and the shard can buffer reads). - */ - public boolean isBufferEmpty() { throw new UnsupportedOperationException("This shard does not buffer reads."); } - - /** - * Returns true if the read buffer is currently full. - * @return True if this shard's buffer is full (and the shard can buffer reads). - */ - public boolean isBufferFull() { throw new UnsupportedOperationException("This shard does not buffer reads."); } - - /** - * Adds a read to the read buffer. - * @param read Add a read to the internal shard buffer. - */ - public void addRead(SAMRecord read) { throw new UnsupportedOperationException("This shard does not buffer reads."); } - - /** - * Fills the shard with reads. Can only do this with shards that buffer reads - * @param readIter Iterator from which to draw the reads to fill the shard - */ - public void fill( PeekableIterator readIter ) { throw new UnsupportedOperationException("This shard does not buffer reads."); } - - /** - * Gets the iterator over the elements cached in the shard. 
- * @return - */ - public GATKSAMIterator iterator() { throw new UnsupportedOperationException("This shard does not buffer reads."); } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java deleted file mode 100644 index 9105b4cf8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reads/utilities/FindLargeShards.java +++ /dev/null @@ -1,192 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.reads.utilities; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; -import org.broadinstitute.gatk.utils.commandline.Input; -import org.broadinstitute.gatk.utils.commandline.Output; -import org.broadinstitute.gatk.engine.datasources.reads.FilePointer; -import org.broadinstitute.gatk.engine.datasources.reads.IntervalSharder; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.interval.IntervalUtils; -import org.broadinstitute.gatk.utils.text.ListFileUtils; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.math.BigInteger; -import java.util.ArrayList; -import java.util.List; - -/** - * Traverses a region in a dataset looking for outliers. - */ -public class FindLargeShards extends CommandLineProgram { - private static Logger logger = Logger.getLogger(FindLargeShards.class); - - @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) - public List samFiles = new ArrayList(); - - @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) - public File referenceFile = null; - - @Input(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. 
Can be explicitly specified on the command line or in a file.",required=false) - public List intervals = null; - - @Output(required=false) - public PrintStream out = System.out; - - /** - * The square of the sum of all uncompressed data. Based on the BAM spec, the size of this could be - * up to (2^64)^2. - */ - private BigInteger sumOfSquares = BigInteger.valueOf(0); - - /** - * The running sum of all uncompressed data. Based on the BAM spec, the BAM must be less than Long.MAX_LONG - * when compressed -- in other words, the sum of the sizes of all BGZF blocks must be < 2^64. - */ - private BigInteger sum = BigInteger.valueOf(0); - - /** - * The number of shards viewed. - */ - private long numberOfShards; - - - @Override - public int execute() throws IOException { - // initialize reference - IndexedFastaSequenceFile refReader = new IndexedFastaSequenceFile(referenceFile); - GenomeLocParser genomeLocParser = new GenomeLocParser(refReader); - - // initialize reads - List bamReaders = ListFileUtils.unpackBAMFileList(samFiles,parser); - SAMDataSource dataSource = new SAMDataSource(bamReaders,new ThreadAllocation(),null,genomeLocParser); - - // intervals - final GenomeLocSortedSet intervalSortedSet; - if ( intervals != null ) - intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL); - else - intervalSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(refReader.getSequenceDictionary()); - - logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize")); - - IntervalSharder sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet,IntervalMergingRule.ALL); - while(sharder.hasNext()) { - FilePointer filePointer = sharder.next(); - - // Size of the file pointer. 
- final long size = filePointer.size(); - - BigInteger bigSize = BigInteger.valueOf(size); - sumOfSquares = sumOfSquares.add(bigSize.pow(2)); - sum = sum.add(bigSize); - numberOfShards++; - - if(numberOfShards % 1000 == 0) { - GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser); - logger.info(String.format("PROGRESS: Calculating mean and variance: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); - } - - } - - // Print out the stddev: (sum(x^2) - (1/N)*sum(x)^2)/N - long mean = sum.divide(BigInteger.valueOf(numberOfShards)).longValue(); - long stddev = (long)(Math.sqrt(sumOfSquares.subtract(sum.pow(2).divide(BigInteger.valueOf(numberOfShards))).divide(BigInteger.valueOf(numberOfShards)).doubleValue())); - logger.info(String.format("Number of shards: %d; mean uncompressed size = %d; stddev uncompressed size = %d%n",numberOfShards,mean,stddev)); - - // Crank through the shards again, this time reporting on the shards significantly larger than the mean. - long threshold = mean + stddev*5; - logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize")); - out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n"); - - sharder = IntervalSharder.shardOverIntervals(dataSource,intervalSortedSet,IntervalMergingRule.ALL); - while(sharder.hasNext()) { - FilePointer filePointer = sharder.next(); - - // Bounding region. - GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser); - - // Size of the file pointer. 
- final long size = filePointer.size(); - - numberOfShards++; - - if(filePointer.size() <= threshold) { - if(numberOfShards % 1000 == 0) - logger.info(String.format("PROGRESS: Searching for large shards: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size)); - continue; - } - - out.printf("%s\t%d\t%d\t%d%n",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size); - } - - return 0; - } - - private GenomeLoc getBoundingRegion(final FilePointer filePointer, final GenomeLocParser genomeLocParser) { - List regions = filePointer.getLocations(); - - // The region contained by this FilePointer. - final String contig = regions.get(0).getContig(); - final int start = regions.get(0).getStart(); - final int stop = regions.get(regions.size()-1).getStop(); - - return genomeLocParser.createGenomeLoc(contig,start,stop); - } - - /** - * Required main method implementation. - * @param argv Command-line argument text. - * @throws Exception on error. 
- */ - public static void main(String[] argv) throws Exception { - int returnCode = 0; - try { - FindLargeShards instance = new FindLargeShards(); - start(instance, argv); - returnCode = 0; - } - catch(Exception ex) { - returnCode = 1; - ex.printStackTrace(); - throw ex; - } - finally { - System.exit(returnCode); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java deleted file mode 100644 index 6fdbea3a0..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSource.java +++ /dev/null @@ -1,199 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.reference; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.SAMSequenceRecord; -import org.broadinstitute.gatk.engine.datasources.reads.LocusShard; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; - -import java.io.File; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Loads reference data from fasta file - * Looks for fai and dict files, and tries to create them if they don't exist - */ -public class ReferenceDataSource { - private IndexedFastaSequenceFile reference; - - /** our log, which we want to capture anything from this class */ - protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); - - /** - * Create reference data source from fasta file - * @param fastaFile Fasta file to be used as reference - */ - public ReferenceDataSource(File fastaFile) { - // does the fasta file exist? check that first... - if (!fastaFile.exists()) - throw new UserException("The fasta file you specified (" + fastaFile.getAbsolutePath() + ") does not exist."); - - final boolean isGzipped = fastaFile.getAbsolutePath().endsWith(".gz"); - if ( isGzipped ) { - throw new UserException.CannotHandleGzippedRef(); - } - - final File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); - - // determine the name for the dict file - final String fastaExt = fastaFile.getAbsolutePath().endsWith("fa") ? 
"\\.fa$" : "\\.fasta$"; - final File dictFile = new File(fastaFile.getAbsolutePath().replaceAll(fastaExt, ".dict")); - - // It's an error if either the fai or dict file does not exist. The user is now responsible - // for creating these files. - if (!indexFile.exists()) { - throw new UserException.MissingReferenceFaiFile(indexFile, fastaFile); - } - if (!dictFile.exists()) { - throw new UserException.MissingReferenceDictFile(dictFile, fastaFile); - } - - // Read reference data by creating an IndexedFastaSequenceFile. - try { - reference = new CachingIndexedFastaSequenceFile(fastaFile); - } - catch (IllegalArgumentException e) { - throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e); - } - catch (Exception e) { - throw new UserException.CouldNotReadInputFile(fastaFile, e); - } - } - - /** - * Get indexed fasta file - * @return IndexedFastaSequenceFile that was created from file - */ - public IndexedFastaSequenceFile getReference() { - return this.reference; - } - - /** - * Creates an iterator for processing the entire reference. - * @param readsDataSource the reads datasource to embed in the locus shard. - * @param parser used to generate/regenerate intervals. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. - * @param maxShardSize The maximum shard size which can be used to create this list. - * @return Creates a schedule for performing a traversal over the entire reference. 
- */ - public Iterable createShardsOverEntireReference(final SAMDataSource readsDataSource, final GenomeLocParser parser, final int maxShardSize) { - List shards = new ArrayList(); - for(SAMSequenceRecord refSequenceRecord: reference.getSequenceDictionary().getSequences()) { - for(int shardStart = 1; shardStart <= refSequenceRecord.getSequenceLength(); shardStart += maxShardSize) { - final int shardStop = Math.min(shardStart+maxShardSize-1, refSequenceRecord.getSequenceLength()); - shards.add(new LocusShard(parser, - readsDataSource, - Collections.singletonList(parser.createGenomeLoc(refSequenceRecord.getSequenceName(),shardStart,shardStop)), - null)); - } - } - return shards; - } - - - public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) { - List shards = new ArrayList(); - - for(GenomeLoc interval: intervals) { - while(interval.size() > maxShardSize) { - shards.add(new LocusShard(intervals.getGenomeLocParser(), - readsDataSource, - Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)), - null)); - interval = intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); - } - shards.add(new LocusShard(intervals.getGenomeLocParser(), - readsDataSource, - Collections.singletonList(interval), - null)); - } - - return shards; - } - - - /** - * Creates an iterator for processing the entire reference. - * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. - * @param intervals the list of intervals to use when processing the reference. 
- * @param targetShardSize the suggested - and maximum - shard size which can be used to create this list; we will merge intervals greedily so that we generate shards up to but not greater than the target size. - * @return Creates a schedule for performing a traversal over the entire reference. - */ -/* - public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int targetShardSize) { - final List shards = new ArrayList(); - final GenomeLocParser parser = intervals.getGenomeLocParser(); - LinkedList currentIntervals = new LinkedList(); - - for(GenomeLoc interval: intervals) { - // if the next interval is too big, we can safely shard currentInterval and then break down this one - if (interval.size() > targetShardSize) { - if (!currentIntervals.isEmpty()) - shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); - while(interval.size() > targetShardSize) { - final GenomeLoc partialInterval = parser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getStart()+targetShardSize-1); - shards.add(createShardFromInterval(Collections.singletonList(partialInterval), readsDataSource, parser)); - interval = parser.createGenomeLoc(interval.getContig(), interval.getStart() + targetShardSize, interval.getStop()); - } - currentIntervals = new LinkedList(); - currentIntervals.add(interval); - } - // otherwise, we need to check whether we can merge this interval with currentInterval (and either shard currentInterval or merge accordingly) - else { - if (currentIntervals.isEmpty()) { - currentIntervals.add(interval); - } - else { - if (currentIntervals.getLast().compareContigs(interval) != 0 || interval.getStop() - currentIntervals.getLast().getStart() + 1 > targetShardSize) { - shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); - currentIntervals = new LinkedList(); - } - currentIntervals.add(interval); - } - } - } - if (!currentIntervals.isEmpty()) - 
shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); - return shards; - } - - private static Shard createShardFromInterval(final List intervals, final SAMDataSource readsDataSource, final GenomeLocParser parser) { - //logger.debug("Adding shard " + interval); - return new LocusShard(parser, - readsDataSource, - intervals, - null); - } -*/ -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java deleted file mode 100644 index 762eb0b44..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPool.java +++ /dev/null @@ -1,153 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.rmd; - -import htsjdk.samtools.SAMSequenceDictionary; -import org.broadinstitute.gatk.engine.refdata.SeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrack; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.gatk.engine.refdata.utils.FlashBackIterator; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.List; - -/** - * A pool of reference-ordered data iterators. - */ -class ReferenceOrderedDataPool extends ResourcePool { - // the reference-ordered data itself. - private final RMDTriplet fileDescriptor; - - // our tribble track builder - private final RMDTrackBuilder builder; - - /** - * The header from this RMD, if present. - */ - private final Object header; - - /** - * The sequence dictionary from this ROD. If no sequence dictionary is present, this dictionary will be the same as the reference's. - */ - private final SAMSequenceDictionary sequenceDictionary; - - boolean flashbackData = false; - public ReferenceOrderedDataPool(RMDTriplet fileDescriptor,RMDTrackBuilder builder,SAMSequenceDictionary sequenceDictionary, GenomeLocParser genomeLocParser,boolean flashbackData) { - super(sequenceDictionary,genomeLocParser); - this.fileDescriptor = fileDescriptor; - this.builder = builder; - this.flashbackData = flashbackData; - - // prepopulate one RMDTrack - LocationAwareSeekableRODIterator iterator = createNewResource(); - this.addNewResource(iterator); - - // Pull the proper header and sequence dictionary from the prepopulated track. 
- this.header = iterator.getHeader(); - this.sequenceDictionary = iterator.getSequenceDictionary(); - } - - /** - * Gets the header used by this resource pool. - * @return Header used by this resource pool. - */ - public Object getHeader() { - return header; - } - - /** - * Gets the sequence dictionary built into the ROD index file. - * @return Sequence dictionary from the index file. - */ - public SAMSequenceDictionary getSequenceDictionary() { - return sequenceDictionary; - } - - /** - * Create a new iterator from the existing reference-ordered data. This new iterator is expected - * to be completely independent of any other iterator. - * @return The newly created resource. - */ - public LocationAwareSeekableRODIterator createNewResource() { - if(numIterators() > 0) - throw new ReviewedGATKException("BUG: Tried to create multiple iterators over streaming ROD interface"); - RMDTrack track = builder.createInstanceOfTrack(fileDescriptor); - LocationAwareSeekableRODIterator iter = new SeekableRODIterator(track.getHeader(),track.getSequenceDictionary(),referenceSequenceDictionary,genomeLocParser,track.getIterator()); - return (flashbackData) ? new FlashBackIterator(iter) : iter; - } - - /** - * Finds the best existing ROD iterator from the pool. In this case, the best existing ROD is defined as - * the first one encountered that is at or before the given position. 
- * @param segment @{inheritedDoc} - * @param resources @{inheritedDoc} - * @return @{inheritedDoc} - */ - public LocationAwareSeekableRODIterator selectBestExistingResource( DataStreamSegment segment, List resources ) { - if(segment instanceof MappedStreamSegment) { - GenomeLoc position = ((MappedStreamSegment)segment).getLocation(); - - for( LocationAwareSeekableRODIterator RODIterator : resources ) { - - if( (RODIterator.position() == null && RODIterator.hasNext()) || - (RODIterator.position() != null && RODIterator.position().isBefore(position)) ) - return RODIterator; - if (RODIterator.position() != null && RODIterator instanceof FlashBackIterator && ((FlashBackIterator)RODIterator).canFlashBackTo(position)) { - ((FlashBackIterator)RODIterator).flashBackTo(position); - return RODIterator; - } - - } - return null; - } - else if(segment instanceof EntireStream) { - // Asking for a segment over the entire stream, so by definition, there is no best existing resource. - // Force the system to create a new one. - return null; - } - else { - throw new ReviewedGATKException("Unable to find a ROD iterator for segments of type " + segment.getClass()); - } - } - - /** - * In this case, the iterator is the resource. Pass it through. 
- */ - public LocationAwareSeekableRODIterator createIteratorFromResource( DataStreamSegment segment, LocationAwareSeekableRODIterator resource ) { - return resource; - } - - /** - * kill the buffers in the iterator - */ - public void closeResource( LocationAwareSeekableRODIterator resource ) { - if (resource instanceof FlashBackIterator) ((FlashBackIterator)resource).close(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java deleted file mode 100644 index 9d9e7c87f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataSource.java +++ /dev/null @@ -1,256 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.rmd; - -import htsjdk.samtools.SAMSequenceDictionary; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.refdata.SeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrack; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.lang.reflect.Type; -import java.util.List; - -/** - * A data source which provides a single type of reference-ordered data. - */ -public class ReferenceOrderedDataSource { - /** - * The reference-ordered data itself. - */ - private final RMDTriplet fileDescriptor; - - /** - * The header associated with this VCF, if any. - */ - private final Object header; - - /** - * The private sequence dictionary associated with this RMD. - */ - private final SAMSequenceDictionary sequenceDictionary; - - /** - * The builder to use when constructing new reference-ordered data readers. - */ - private final RMDTrackBuilder builder; - - /** - * A pool of iterators for navigating through the genome. - */ - private final ResourcePool iteratorPool; - - /** - * Create a new reference-ordered data source. 
- */ - public ReferenceOrderedDataSource(RMDTriplet fileDescriptor, - RMDTrackBuilder builder, - SAMSequenceDictionary referenceSequenceDictionary, - GenomeLocParser genomeLocParser, - boolean flashbackData ) { - this.fileDescriptor = fileDescriptor; - this.builder = builder; - - // TODO: Unify the two blocks of code below by creating a ReferenceOrderedDataPool base class of a coherent type (not RMDTrack for one and SeekableIterator for the other). - if (fileDescriptor.getStorageType() != RMDTriplet.RMDStorageType.STREAM) { - iteratorPool = new ReferenceOrderedQueryDataPool(fileDescriptor, - builder, - referenceSequenceDictionary, - genomeLocParser); - this.header = ((ReferenceOrderedQueryDataPool)iteratorPool).getHeader(); - this.sequenceDictionary = ((ReferenceOrderedQueryDataPool)iteratorPool).getSequenceDictionary(); - } - else { - iteratorPool = new ReferenceOrderedDataPool(fileDescriptor, - builder, - referenceSequenceDictionary, - genomeLocParser, - flashbackData); - this.header = ((ReferenceOrderedDataPool)iteratorPool).getHeader(); - this.sequenceDictionary = ((ReferenceOrderedDataPool)iteratorPool).getSequenceDictionary(); - } - } - - /** - * Return the name of the underlying reference-ordered data. - * @return Name of the underlying rod. - */ - public String getName() { - return fileDescriptor.getName(); - } - - public Class getType() { - return builder.getFeatureManager().getByTriplet(fileDescriptor).getCodecClass(); - } - - public Class getRecordType() { - return builder.getFeatureManager().getByTriplet(fileDescriptor).getFeatureClass(); - } - - public File getFile() { - return new File(fileDescriptor.getFile()); - } - - public Object getHeader() { - return header; - } - - public Tags getTags() { - return fileDescriptor.getTags(); - } - - public String getTagValue( final String key ) { - return fileDescriptor.getTags().getValue( key ); - } - - - /** - * Retrieves the sequence dictionary created by this ROD. 
- * @return - */ - public SAMSequenceDictionary getSequenceDictionary() { - return sequenceDictionary; - } - - /** - * helper function for determining if we are the same track based on name and record type - * - * @param name the name to match - * @param type the type to match - * - * @return true on a match, false if the name or type is different - */ - public boolean matchesNameAndRecordType(String name, Type type) { - return (name.equals(fileDescriptor.getName()) && (type.getClass().isAssignableFrom(getType().getClass()))); - } - - /** - * Seek to the specified position and return an iterator through the data. - * - * @param loc GenomeLoc that points to the selected position. - * - * @return Iterator through the data. - */ - public LocationAwareSeekableRODIterator seek(GenomeLoc loc) { - DataStreamSegment dataStreamSegment = loc != null ? new MappedStreamSegment(loc) : new EntireStream(); - return iteratorPool.iterator(dataStreamSegment); - } - - - /** - * Close the specified iterator, returning it to the pool. - * @param iterator Iterator to close. - */ - public void close( LocationAwareSeekableRODIterator iterator ) { - iteratorPool.release(iterator); - } - -} - -/** - * a data pool for the new query based RODs - */ -class ReferenceOrderedQueryDataPool extends ResourcePool { - // the reference-ordered data itself. - private final RMDTriplet fileDescriptor; - - // our tribble track builder - private final RMDTrackBuilder builder; - - /** - * The header from this RMD, if present. - */ - private final Object header; - - /** - * The sequence dictionary from this ROD. If no sequence dictionary is present, this dictionary will be the same as the reference's. 
- */ - private final SAMSequenceDictionary sequenceDictionary; - - public ReferenceOrderedQueryDataPool(RMDTriplet fileDescriptor, RMDTrackBuilder builder, SAMSequenceDictionary referenceSequenceDictionary, GenomeLocParser genomeLocParser) { - super(referenceSequenceDictionary,genomeLocParser); - this.fileDescriptor = fileDescriptor; - this.builder = builder; - - // prepopulate one RMDTrack - RMDTrack track = builder.createInstanceOfTrack(fileDescriptor); - this.addNewResource(track); - - // Pull the proper header and sequence dictionary from the prepopulated track. - this.header = track.getHeader(); - this.sequenceDictionary = track.getSequenceDictionary(); - } - - public Object getHeader() { - return header; - } - - public SAMSequenceDictionary getSequenceDictionary() { - return sequenceDictionary; - } - - @Override - protected RMDTrack createNewResource() { - return builder.createInstanceOfTrack(fileDescriptor); - } - - @Override - protected RMDTrack selectBestExistingResource(DataStreamSegment segment, List availableResources) { - for (RMDTrack reader : availableResources) - if (reader != null) return reader; - return null; - } - - @Override - protected LocationAwareSeekableRODIterator createIteratorFromResource(DataStreamSegment position, RMDTrack track) { - try { - if (position instanceof MappedStreamSegment) { - GenomeLoc pos = ((MappedStreamSegment) position).locus; - return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.query(pos)); - } else { - return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.getIterator()); - } - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(fileDescriptor.getName(), "it could not be found"); - } catch (IOException e) { - throw new ReviewedGATKException("Unable to create iterator for rod named " + fileDescriptor.getName(),e); - } - } - - @Override - protected void closeResource(RMDTrack 
track) { - track.close(); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtils.java deleted file mode 100644 index 0bcf4ee62..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtils.java +++ /dev/null @@ -1,369 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.collections.DefaultHashMap; -import org.broadinstitute.gatk.utils.exceptions.GATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.pileup.PileupElement; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.text.XReadLines; -import htsjdk.variant.variantcontext.Allele; - -import java.io.File; -import java.io.IOException; -import java.util.*; - -public class AlleleBiasedDownsamplingUtils { - - // define this class so that we can use Java generics below - private final static class PileupElementList extends ArrayList {} - - /** - * Computes an allele biased version of the given pileup - * - * @param pileup the original pileup - * @param downsamplingFraction the fraction of total reads to remove per allele - * @return allele biased pileup - */ - public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { - // special case removal of all or no reads - if ( downsamplingFraction <= 0.0 ) - return pileup; - if ( downsamplingFraction >= 1.0 ) - return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); - - final PileupElementList[] alleleStratifiedElements = new PileupElementList[4]; - for ( int i = 0; i < 4; i++ ) - alleleStratifiedElements[i] = new PileupElementList(); - - // start by stratifying the reads by the alleles they represent at this position - for ( final PileupElement pe : pileup ) { - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); - if ( baseIndex != -1 ) - alleleStratifiedElements[baseIndex].add(pe); 
- } - - // make a listing of allele counts and calculate the total count - final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements); - final int totalAlleleCount = (int)MathUtils.sum(alleleCounts); - - // do smart down-sampling - final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor - final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); - - final HashSet readsToRemove = new HashSet(numReadsToRemove); - for ( int i = 0; i < 4; i++ ) { - final PileupElementList alleleList = alleleStratifiedElements[i]; - // if we don't need to remove any reads, then don't - if ( alleleCounts[i] > targetAlleleCounts[i] ) - readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i])); - } - - // we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise - final List readsToKeep = new ArrayList(totalAlleleCount - numReadsToRemove); - for ( final PileupElement pe : pileup ) { - if ( !readsToRemove.contains(pe) ) { - readsToKeep.add(pe); - } - } - - return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(readsToKeep)); - } - - /** - * Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present) - * - * @param alleleStratifiedElements pileup elements stratified by allele - * @return non-null int array representing allele counts - */ - private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements) { - final int[] alleleCounts = new int[alleleStratifiedElements.length]; - for ( int i = 0; i < alleleStratifiedElements.length; i++ ) { - alleleCounts[i] = alleleStratifiedElements[i].size(); - } - return alleleCounts; - } - - private static int scoreAlleleCounts(final int[] alleleCounts) { - if ( alleleCounts.length < 2 ) - return 0; - - // sort the counts (in ascending order) - final int[] 
alleleCountsCopy = alleleCounts.clone(); - Arrays.sort(alleleCountsCopy); - - final int maxCount = alleleCountsCopy[alleleCounts.length - 1]; - final int nextBestCount = alleleCountsCopy[alleleCounts.length - 2]; - - int remainderCount = 0; - for ( int i = 0; i < alleleCounts.length - 2; i++ ) - remainderCount += alleleCountsCopy[i]; - - // try to get the best score: - // - in the het case the counts should be equal with nothing else - // - in the hom case the non-max should be zero - return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount)); - } - - /** - * Computes an allele biased version of the allele counts for a given pileup - * - * @param alleleCounts the allele counts for the original pileup - * @param numReadsToRemove number of total reads to remove per allele - * @return non-null array of new counts needed per allele - */ - protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) { - final int numAlleles = alleleCounts.length; - - int maxScore = scoreAlleleCounts(alleleCounts); - int[] alleleCountsOfMax = alleleCounts; - - final int numReadsToRemovePerAllele = numReadsToRemove / 2; - - for ( int i = 0; i < numAlleles; i++ ) { - for ( int j = i; j < numAlleles; j++ ) { - final int[] newCounts = alleleCounts.clone(); - - // split these cases so we don't lose on the floor (since we divided by 2) - if ( i == j ) { - newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove); - } else { - newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele); - newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele); - } - - final int score = scoreAlleleCounts(newCounts); - - if ( score < maxScore ) { - maxScore = score; - alleleCountsOfMax = newCounts; - } - } - } - - return alleleCountsOfMax; - } - - /** - * Performs allele biased down-sampling on a pileup and computes the list of elements to remove - * - * @param elements original list of pileup elements - * 
@param originalElementCount original count of elements (taking reduced reads into account) - * @param numElementsToRemove the number of records to remove - * @return the list of pileup elements TO REMOVE - */ - protected static List downsampleElements(final List elements, final int originalElementCount, final int numElementsToRemove) { - // are there no elements to remove? - if ( numElementsToRemove == 0 ) - return Collections.emptyList(); - - final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); - - // should we remove all of the elements? - if ( numElementsToRemove >= originalElementCount ) { - elementsToRemove.addAll(elements); - return elementsToRemove; - } - - // create a bitset describing which elements to remove - final BitSet itemsToRemove = new BitSet(originalElementCount); - for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { - itemsToRemove.set(selectedIndex); - } - - int currentBitSetIndex = 0; - for ( final PileupElement element : elements ) { - if ( itemsToRemove.get(currentBitSetIndex++) ) { - elementsToRemove.add(element); - } - } - - return elementsToRemove; - } - - /** - * Computes reads to remove based on an allele biased down-sampling - * - * @param alleleReadMap original list of records per allele - * @param downsamplingFraction the fraction of total reads to remove per allele - * @return list of reads TO REMOVE from allele biased down-sampling - */ - public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction) { - int totalReads = 0; - for ( final List reads : alleleReadMap.values() ) - totalReads += reads.size(); - - int numReadsToRemove = (int)(totalReads * downsamplingFraction); - - // make a listing of allele counts - final List alleles = new ArrayList(alleleReadMap.keySet()); - alleles.remove(Allele.NO_CALL); // ignore the no-call bin - final int numAlleles = alleles.size(); - - final int[] alleleCounts = 
new int[numAlleles]; - for ( int i = 0; i < numAlleles; i++ ) - alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size(); - - // do smart down-sampling - final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); - - final List readsToRemove = new ArrayList(numReadsToRemove); - for ( int i = 0; i < numAlleles; i++ ) { - if ( alleleCounts[i] > targetAlleleCounts[i] ) { - readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i])); - } - } - - return readsToRemove; - } - - /** - * Performs allele biased down-sampling on a pileup and computes the list of elements to remove - * - * @param reads original list of records - * @param numElementsToRemove the number of records to remove - * @return the list of pileup elements TO REMOVE - */ - protected static List downsampleElements(final List reads, final int numElementsToRemove) { - // are there no elements to remove? - if ( numElementsToRemove == 0 ) - return Collections.emptyList(); - - final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); - final int originalElementCount = reads.size(); - - // should we remove all of the elements? 
- if ( numElementsToRemove >= originalElementCount ) { - elementsToRemove.addAll(reads); - return elementsToRemove; - } - - // create a bitset describing which elements to remove - final BitSet itemsToRemove = new BitSet(originalElementCount); - for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { - itemsToRemove.set(selectedIndex); - } - - int currentBitSetIndex = 0; - for ( final GATKSAMRecord read : reads ) { - if ( itemsToRemove.get(currentBitSetIndex++) ) - elementsToRemove.add(read); - } - - return elementsToRemove; - } - - /** - * Create sample-contamination maps from file - * - * @param ContaminationFractionFile Filename containing two columns: SampleID and Contamination - * @param AvailableSampleIDs Set of Samples of interest (no reason to include every sample in file) or null to turn off checking - * @param logger for logging output - * @return sample-contamination Map - */ - - public static DefaultHashMap loadContaminationFile(File ContaminationFractionFile, final Double defaultContaminationFraction, final Set AvailableSampleIDs, Logger logger) throws GATKException { - DefaultHashMap sampleContamination = new DefaultHashMap(defaultContaminationFraction); - Set nonSamplesInContaminationFile = new HashSet(sampleContamination.keySet()); - try { - - XReadLines reader = new XReadLines(ContaminationFractionFile, true); - for (String line : reader) { - - if (line.length() == 0) { - continue; - } - - StringTokenizer st = new StringTokenizer(line,"\t"); - - String fields[] = new String[2]; - try { - fields[0] = st.nextToken(); - fields[1] = st.nextToken(); - } catch(NoSuchElementException e){ - throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line); - } - if(st.hasMoreTokens()) { - throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. 
Offending line:\n" + line); - } - - if (fields[0].length() == 0 || fields[1].length() == 0) { - throw new UserException.MalformedFile("Contamination file can not have empty strings in either column. Offending line:\n" + line); - } - - if (sampleContamination.containsKey(fields[0])) { - throw new UserException.MalformedFile("Contamination file contains duplicate entries for input name " + fields[0]); - } - - try { - final Double contamination = Double.valueOf(fields[1]); - if (contamination < 0 || contamination > 1){ - throw new UserException.MalformedFile("Contamination file contains unacceptable contamination value (must be 0<=x<=1): " + line); - } - if (AvailableSampleIDs==null || AvailableSampleIDs.contains(fields[0])) {// only add samples if they are in the sampleSet (or if it is null) - sampleContamination.put(fields[0], contamination); - } - else { - nonSamplesInContaminationFile.add(fields[0]); - } - } catch (NumberFormatException e) { - throw new UserException.MalformedFile("Contamination file contains unparsable double in the second field. 
Offending line: " + line); - } - } - - - //output to the user info lines telling which samples are in the Contamination File - if (sampleContamination.size() > 0) { - logger.info(String.format("The following samples were found in the Contamination file and will be processed at the contamination level therein: %s", sampleContamination.keySet().toString())); - - //output to the user info lines telling which samples are NOT in the Contamination File - if(AvailableSampleIDs!=null){ - Set samplesNotInContaminationFile = new HashSet(AvailableSampleIDs); - samplesNotInContaminationFile.removeAll(sampleContamination.keySet()); - if (samplesNotInContaminationFile.size() > 0) - logger.info(String.format("The following samples were NOT found in the Contamination file and will be processed at the default contamination level: %s", samplesNotInContaminationFile.toString())); - } - } - - //output to the user Samples that do not have lines in the Contamination File - if (nonSamplesInContaminationFile.size() > 0) { - logger.info(String.format("The following entries were found in the Contamination file but were not SAMPLEIDs. 
They will be ignored: %s", nonSamplesInContaminationFile.toString())); - } - - return sampleContamination; - - } catch (IOException e) { - throw new GATKException("I/O Error while reading sample-contamination file " + ContaminationFractionFile.getName() + ": " + e.getMessage()); - } - - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsampleType.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsampleType.java deleted file mode 100644 index 715ef6eed..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsampleType.java +++ /dev/null @@ -1,39 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -/** - * Type of downsampling method to invoke. 
- * - * @author hanna - * @version 0.1 - */ - -public enum DownsampleType { - NONE, - ALL_READS, - BY_SAMPLE -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/Downsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/Downsampler.java deleted file mode 100644 index 8ab0198b1..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/Downsampler.java +++ /dev/null @@ -1,161 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import java.util.Collection; -import java.util.List; - -/** - * The basic downsampler API, with no reads-specific operations. 
- * - * Downsamplers that extend this class rather than the ReadsDownsampler class can handle - * any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a - * PerSampleDownsamplingReadsIterator. - * - * @author David Roazen - */ -public abstract class Downsampler { - - /** - * Number of items discarded by this downsampler since the last call to resetStats() - */ - protected int numDiscardedItems = 0; - - /** - * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine - * immediately whether the item survives the downsampling process, while others will need to see - * more items before making that determination. - * - * @param item the individual item to submit to the downsampler for consideration - */ - public abstract void submit( final T item ); - - /** - * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling - * submit() on each individual item in the collection. - * - * @param items the collection of items to submit to the downsampler for consideration - */ - public void submit( final Collection items ) { - if ( items == null ) { - throw new IllegalArgumentException("submitted items must not be null"); - } - - for ( final T item : items ) { - submit(item); - } - } - - /** - * Are there items that have survived the downsampling process waiting to be retrieved? - * - * @return true if this downsampler has > 0 finalized items, otherwise false - */ - public abstract boolean hasFinalizedItems(); - - /** - * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved. - * - * @return a list of all finalized items this downsampler contains, or an empty list if there are none - */ - public abstract List consumeFinalizedItems(); - - /** - * Are there items stored in this downsampler that it doesn't yet know whether they will - * ultimately survive the downsampling process? 
- * - * @return true if this downsampler has > 0 pending items, otherwise false - */ - public abstract boolean hasPendingItems(); - - /** - * Peek at the first finalized item stored in this downsampler (or null if there are no finalized items) - * - * @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call), - * or null if there are none - */ - public abstract T peekFinalized(); - - /** - * Peek at the first pending item stored in this downsampler (or null if there are no pending items) - * - * @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call), - * or null if there are none - */ - public abstract T peekPending(); - - /** - * Get the current number of items in this downsampler - * - * This should be the best estimate of the total number of elements that will come out of the downsampler - * were consumeFinalizedItems() to be called immediately after this call. In other words it should - * be number of finalized items + estimate of number of pending items that will ultimately be included as well. - * - * @return a positive integer - */ - public abstract int size(); - - /** - * Returns the number of items discarded (so far) during the downsampling process - * - * @return the number of items that have been submitted to this downsampler and discarded in the process of - * downsampling - */ - public int getNumberOfDiscardedItems() { - return numDiscardedItems; - } - - /** - * Used to tell the downsampler that no more items will be submitted to it, and that it should - * finalize any pending items. 
- */ - public abstract void signalEndOfInput(); - - /** - * Empty the downsampler of all finalized/pending items - */ - public abstract void clearItems(); - - /** - * Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items - */ - public void resetStats() { - numDiscardedItems = 0; - } - - /** - * Indicates whether an item should be excluded from elimination during downsampling. By default, - * all items representing reduced reads are excluded from downsampling, but individual downsamplers - * may override if they are able to handle reduced reads correctly. Downsamplers should check - * the return value of this method before discarding an item. - * - * @param item The item to test - * @return true if the item should not be subject to elimination during downsampling, otherwise false - */ - protected boolean doNotDiscardItem( final Object item ) { - return false; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingMethod.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingMethod.java deleted file mode 100644 index 94a3cc74b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingMethod.java +++ /dev/null @@ -1,142 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the 
Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; -import org.broadinstitute.gatk.engine.walkers.LocusWalker; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -/** - * Describes the method for downsampling reads at a given locus. - */ - -public class DownsamplingMethod { - /** - * Type of downsampling to perform. - */ - public final DownsampleType type; - - /** - * Actual downsampling target is specified as an integer number of reads. - */ - public final Integer toCoverage; - - /** - * Actual downsampling target is specified as a fraction of total available reads. - */ - public final Double toFraction; - - /** - * Expresses no downsampling applied at all. - */ - public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE, null, null); - - /** - * Default type to use if no type is specified - */ - public static final DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; - - /** - * Don't allow dcov values below this threshold for locus-based traversals (ie., Locus - * and ActiveRegion walkers), as they can result in problematic downsampling artifacts - */ - public static final int MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS = 200; - - - public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction ) { - this.type = type != null ? 
type : DEFAULT_DOWNSAMPLING_TYPE; - - if ( type == DownsampleType.NONE ) { - this.toCoverage = null; - this.toFraction = null; - } - else { - this.toCoverage = toCoverage; - this.toFraction = toFraction; - } - - validate(); - } - - private void validate() { - // Can't leave toFraction and toCoverage null unless type is NONE - if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null ) - throw new UserException("Must specify either toFraction or toCoverage when downsampling."); - - // Fraction and coverage cannot both be specified. - if ( toFraction != null && toCoverage != null ) - throw new UserException("Downsampling coverage and fraction are both specified. Please choose only one."); - - // toCoverage must be > 0 when specified - if ( toCoverage != null && toCoverage <= 0 ) { - throw new UserException("toCoverage must be > 0 when downsampling to coverage"); - } - - // toFraction must be >= 0.0 and <= 1.0 when specified - if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { - throw new UserException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); - } - } - - public void checkCompatibilityWithWalker( Walker walker ) { - boolean isLocusTraversal = walker instanceof LocusWalker || walker instanceof ActiveRegionWalker; - - if ( isLocusTraversal && type == DownsampleType.ALL_READS && toCoverage != null ) { - throw new UserException("Downsampling to coverage with the ALL_READS method for locus-based traversals (eg., LocusWalkers) is not currently supported (though it is supported for ReadWalkers)."); - } - - // For locus traversals, ensure that the dcov value (if present) is not problematically low - if ( isLocusTraversal && type != DownsampleType.NONE && toCoverage != null && - toCoverage < MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS ) { - throw new UserException(String.format("Locus-based traversals (ie., Locus and ActiveRegion walkers) require " + - "a minimum -dcov value of %d when 
downsampling to coverage. Values less " + - "than this can produce problematic downsampling artifacts while providing " + - "only insignificant improvements in memory usage in most cases.", - MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS)); - } - } - - public String toString() { - StringBuilder builder = new StringBuilder("Downsampling Settings: "); - - if ( type == DownsampleType.NONE ) { - builder.append("No downsampling"); - } - else { - builder.append(String.format("Method: %s, ", type)); - - if ( toCoverage != null ) { - builder.append(String.format("Target Coverage: %d", toCoverage)); - } - else { - builder.append(String.format("Target Fraction: %.2f", toFraction)); - } - } - - return builder.toString(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIterator.java deleted file mode 100644 index 6b398aba2..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIterator.java +++ /dev/null @@ -1,116 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; - -import java.util.Collection; -import java.util.Iterator; -import java.util.NoSuchElementException; - - -/** - * GATKSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style - * downsampler interface to a pull model. - * - * @author David Roazen - */ -public class DownsamplingReadsIterator implements GATKSAMIterator { - - private GATKSAMIterator nestedSAMIterator; - private ReadsDownsampler downsampler; - private Collection downsampledReadsCache; - private SAMRecord nextRead = null; - private Iterator downsampledReadsCacheIterator = null; - - /** - * @param iter wrapped iterator from which this iterator will pull reads - * @param downsampler downsampler through which the reads will be fed - */ - public DownsamplingReadsIterator( GATKSAMIterator iter, ReadsDownsampler downsampler ) { - nestedSAMIterator = iter; - this.downsampler = downsampler; - - advanceToNextRead(); - } - - public boolean hasNext() { - return nextRead != null; - } - - public SAMRecord next() { - if ( nextRead == null ) { - throw new NoSuchElementException("next() called when there are no more items"); - } - - SAMRecord toReturn = nextRead; - advanceToNextRead(); - - return toReturn; - } - - private void advanceToNextRead() { - if ( ! readyToReleaseReads() && ! 
fillDownsampledReadsCache() ) { - nextRead = null; - } - else { - nextRead = downsampledReadsCacheIterator.next(); - } - } - - private boolean readyToReleaseReads() { - return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext(); - } - - private boolean fillDownsampledReadsCache() { - while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) { - downsampler.submit(nestedSAMIterator.next()); - } - - if ( ! nestedSAMIterator.hasNext() ) { - downsampler.signalEndOfInput(); - } - - // use returned collection directly rather than make a copy, for speed - downsampledReadsCache = downsampler.consumeFinalizedItems(); - downsampledReadsCacheIterator = downsampledReadsCache.iterator(); - - return downsampledReadsCacheIterator.hasNext(); - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } - - public void close() { - nestedSAMIterator.close(); - } - - public Iterator iterator() { - return this; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingUtils.java deleted file mode 100644 index bd236c0bc..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingUtils.java +++ /dev/null @@ -1,107 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The 
above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; - -import java.util.*; - -/** - * Utilities for using the downsamplers for common tasks - * - * User: depristo - * Date: 3/6/13 - * Time: 4:26 PM - */ -public class DownsamplingUtils { - private DownsamplingUtils() { } - - /** - * Level the coverage of the reads in each sample to no more than downsampleTo reads, no reducing - * coverage at any read start to less than minReadsPerAlignmentStart - * - * This algorithm can be used to handle the situation where you have lots of coverage in some interval, and - * want to reduce the coverage of the big peak down without removing the many reads at the edge of this - * interval that are in fact good - * - * This algorithm separately operates on the reads for each sample independently. - * - * @param reads a sorted list of reads - * @param downsampleTo the targeted number of reads we want from reads per sample - * @param minReadsPerAlignmentStart don't reduce the number of reads starting at a specific alignment start - * to below this. 
That is, if this value is 2, we'll never reduce the number - * of reads starting at a specific start site to less than 2 - * @return a sorted list of reads - */ - public static List levelCoverageByPosition(final List reads, final int downsampleTo, final int minReadsPerAlignmentStart) { - if ( reads == null ) throw new IllegalArgumentException("reads must not be null"); - - final List downsampled = new ArrayList(reads.size()); - - final Map>> readsBySampleByStart = partitionReadsBySampleAndStart(reads); - for ( final Map> readsByPosMap : readsBySampleByStart.values() ) { - final LevelingDownsampler, GATKSAMRecord> downsampler = new LevelingDownsampler, GATKSAMRecord>(downsampleTo, minReadsPerAlignmentStart); - downsampler.submit(readsByPosMap.values()); - downsampler.signalEndOfInput(); - for ( final List downsampledReads : downsampler.consumeFinalizedItems()) - downsampled.addAll(downsampledReads); - } - - return ReadUtils.sortReadsByCoordinate(downsampled); - } - - /** - * Build the data structure mapping for each sample -> (position -> reads at position) - * - * Note that the map position -> reads isn't ordered in any meaningful way - * - * @param reads a list of sorted reads - * @return a map containing the list of reads at each start location, for each sample independently - */ - private static Map>> partitionReadsBySampleAndStart(final List reads) { - final Map>> readsBySampleByStart = new LinkedHashMap>>(); - - for ( final GATKSAMRecord read : reads ) { - Map> readsByStart = readsBySampleByStart.get(read.getReadGroup().getSample()); - - if ( readsByStart == null ) { - readsByStart = new LinkedHashMap>(); - readsBySampleByStart.put(read.getReadGroup().getSample(), readsByStart); - } - - List readsAtStart = readsByStart.get(read.getAlignmentStart()); - if ( readsAtStart == null ) { - readsAtStart = new LinkedList(); - readsByStart.put(read.getAlignmentStart(), readsAtStart); - } - - readsAtStart.add(read); - } - - return readsBySampleByStart; - } -} diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsampler.java deleted file mode 100644 index a2d613c5f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsampler.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.ArrayList; -import java.util.List; - -/** - * Fractional Downsampler: selects a specified fraction of the reads for inclusion. 
- * - * Since the selection is done randomly, the actual fraction of reads retained may be slightly - * more or less than the requested fraction, depending on the total number of reads submitted. - * - * @author David Roazen - */ -public class FractionalDownsampler extends ReadsDownsampler { - - private ArrayList selectedReads; - - private final int cutoffForInclusion; - - private static final int RANDOM_POOL_SIZE = 10000; - - /** - * Construct a FractionalDownsampler - * - * @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive). - * Actual number of reads preserved may differ randomly. - */ - public FractionalDownsampler( final double fraction ) { - if ( fraction < 0.0 || fraction > 1.0 ) { - throw new ReviewedGATKException("Fraction of reads to include must be between 0.0 and 1.0, inclusive"); - } - - cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE); - clearItems(); - resetStats(); - } - - @Override - public void submit( final T newRead ) { - if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion || doNotDiscardItem(newRead) ) { - selectedReads.add(newRead); - } - else { - numDiscardedItems++; - } - } - - @Override - public boolean hasFinalizedItems() { - return selectedReads.size() > 0; - } - - @Override - public List consumeFinalizedItems() { - // pass by reference rather than make a copy, for speed - List downsampledItems = selectedReads; - clearItems(); - return downsampledItems; - } - - @Override - public boolean hasPendingItems() { - return false; - } - - @Override - public T peekFinalized() { - return selectedReads.isEmpty() ? 
null : selectedReads.get(0); - } - - @Override - public T peekPending() { - return null; - } - - @Override - public int size() { - return selectedReads.size(); - } - - @Override - public void signalEndOfInput() { - // NO-OP - } - - @Override - public void clearItems() { - selectedReads = new ArrayList(); - } - - @Override - public boolean requiresCoordinateSortOrder() { - return false; - } - - @Override - public void signalNoMoreReadsBefore( final T read ) { - // NO-OP - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerFactory.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerFactory.java deleted file mode 100644 index 4ddf8dd87..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerFactory.java +++ /dev/null @@ -1,46 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -/** - * Factory for creating FractionalDownsamplers on demand - * - * @author David Roazen - */ -public class FractionalDownsamplerFactory implements ReadsDownsamplerFactory { - - private double fraction; - - public FractionalDownsamplerFactory( double fraction ) { - this.fraction = fraction; - } - - public ReadsDownsampler newInstance() { - return new FractionalDownsampler(fraction); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsampler.java deleted file mode 100644 index 4ae7bc581..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsampler.java +++ /dev/null @@ -1,242 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import org.broadinstitute.gatk.utils.MathUtils; - -import java.util.*; - -/** - * Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from - * the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling - * does not occur until all Lists have been submitted and signalEndOfInput() is called. - * - * The Lists should be LinkedLists for maximum efficiency during item removal, however other - * kinds of Lists are also accepted (albeit at a slight performance penalty). - * - * Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface, - * the Lists need not contain reads. 
However this downsampler may not be wrapped within one of the - * DownsamplingReadsIterators - * - * @param the List type representing the stacks to be leveled - * @param the type of the elements of each List - * - * @author David Roazen - */ -public class LevelingDownsampler, E> extends Downsampler { - private final int minElementsPerStack; - - private final int targetSize; - - private List groups; - - private boolean groupsAreFinalized; - - /** - * Construct a LevelingDownsampler - * - * Uses the default minElementsPerStack of 1 - * - * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed - * this value -- if it does, items are removed from Lists evenly until the total size - * is <= this value - */ - public LevelingDownsampler( final int targetSize ) { - this(targetSize, 1); - } - - /** - * Construct a LevelingDownsampler - * - * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed - * this value -- if it does, items are removed from Lists evenly until the total size - * is <= this value - * @param minElementsPerStack no stack will be reduced below this size during downsampling. That is, - * if a stack has only 3 elements and minElementsPerStack is 3, no matter what - * we'll not reduce this stack below 3. 
- */ - public LevelingDownsampler( final int targetSize, final int minElementsPerStack ) { - if ( targetSize < 0 ) throw new IllegalArgumentException("targetSize must be >= 0 but got " + targetSize); - if ( minElementsPerStack < 0 ) throw new IllegalArgumentException("minElementsPerStack must be >= 0 but got " + minElementsPerStack); - - this.targetSize = targetSize; - this.minElementsPerStack = minElementsPerStack; - clearItems(); - resetStats(); - } - - @Override - public void submit( final T item ) { - groups.add(item); - } - - @Override - public void submit( final Collection items ){ - groups.addAll(items); - } - - @Override - public boolean hasFinalizedItems() { - return groupsAreFinalized && groups.size() > 0; - } - - @Override - public List consumeFinalizedItems() { - if ( ! hasFinalizedItems() ) { - return new ArrayList(); - } - - // pass by reference rather than make a copy, for speed - final List toReturn = groups; - clearItems(); - return toReturn; - } - - @Override - public boolean hasPendingItems() { - return ! groupsAreFinalized && groups.size() > 0; - } - - @Override - public T peekFinalized() { - return hasFinalizedItems() ? groups.get(0) : null; - } - - @Override - public T peekPending() { - return hasPendingItems() ? 
groups.get(0) : null; - } - - @Override - public int size() { - int s = 0; - for ( final List l : groups ) { - s += l.size(); - } - return s; - } - - @Override - public void signalEndOfInput() { - levelGroups(); - groupsAreFinalized = true; - } - - @Override - public void clearItems() { - groups = new ArrayList(); - groupsAreFinalized = false; - } - - private void levelGroups() { - final int[] groupSizes = new int[groups.size()]; - int totalSize = 0; - int currentGroupIndex = 0; - - for ( final T group : groups ) { - groupSizes[currentGroupIndex] = group.size(); - totalSize += groupSizes[currentGroupIndex]; - currentGroupIndex++; - } - - if ( totalSize <= targetSize ) { - return; // no need to eliminate any items - } - - // We will try to remove exactly this many items, however we will refuse to allow any - // one group to fall below size 1, and so might end up removing fewer items than this - int numItemsToRemove = totalSize - targetSize; - - currentGroupIndex = 0; - int numConsecutiveUmodifiableGroups = 0; - - // Continue until we've either removed all the items we wanted to, or we can't - // remove any more items without violating the constraint that all groups must - // be left with at least one item - while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { - if ( groupSizes[currentGroupIndex] > minElementsPerStack ) { - groupSizes[currentGroupIndex]--; - numItemsToRemove--; - numConsecutiveUmodifiableGroups = 0; - } - else { - numConsecutiveUmodifiableGroups++; - } - - currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length; - } - - // Now we actually go through and reduce each group to its new count as specified in groupSizes - currentGroupIndex = 0; - for ( final T group : groups ) { - downsampleOneGroup(group, groupSizes[currentGroupIndex]); - currentGroupIndex++; - } - } - - private void downsampleOneGroup( final T group, final int numItemsToKeep ) { - if ( numItemsToKeep >= group.size() ) { - return; - } - - final 
BitSet itemsToKeep = new BitSet(group.size()); - for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) { - itemsToKeep.set(selectedIndex); - } - - int currentIndex = 0; - - // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator - if ( group instanceof LinkedList ) { - final Iterator iter = group.iterator(); - while ( iter.hasNext() ) { - final E item = iter.next(); - - if ( ! itemsToKeep.get(currentIndex) && ! doNotDiscardItem(item) ) { - iter.remove(); - numDiscardedItems++; - } - - currentIndex++; - } - } - // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather - // than suffer O(n^2) of item shifting - else { - final List keptItems = new ArrayList(group.size()); - - for ( final E item : group ) { - if ( itemsToKeep.get(currentIndex) || doNotDiscardItem(item) ) { - keptItems.add(item); - } - currentIndex++; - } - numDiscardedItems += group.size() - keptItems.size(); - group.clear(); - group.addAll(keptItems); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PassThroughDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PassThroughDownsampler.java deleted file mode 100644 index a5fdf24a9..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PassThroughDownsampler.java +++ /dev/null @@ -1,111 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, 
subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -import java.util.LinkedList; -import java.util.List; - -/** - * Pass-Through Downsampler: Implementation of the ReadsDownsampler interface that does no - * downsampling whatsoever, and instead simply "passes-through" all the reads it's given. - * Useful for situations where you want to disable downsampling, but still need to use - * the downsampler interface. - * - * @author David Roazen - */ -public class PassThroughDownsampler extends ReadsDownsampler { - - private LinkedList selectedReads; - - public PassThroughDownsampler() { - clearItems(); - } - - @Override - public void submit( T newRead ) { - // All reads pass-through, no reads get downsampled - selectedReads.add(newRead); - } - - @Override - public boolean hasFinalizedItems() { - return ! 
selectedReads.isEmpty(); - } - - /** - * Note that this list is a linked list and so doesn't support fast random access - * @return - */ - @Override - public List consumeFinalizedItems() { - // pass by reference rather than make a copy, for speed - final List downsampledItems = selectedReads; - clearItems(); - return downsampledItems; - } - - @Override - public boolean hasPendingItems() { - return false; - } - - @Override - public T peekFinalized() { - return selectedReads.isEmpty() ? null : selectedReads.getFirst(); - } - - @Override - public T peekPending() { - return null; - } - - @Override - public int size() { - return selectedReads.size(); - } - - @Override - public void signalEndOfInput() { - // NO-OP - } - - @Override - public void clearItems() { - selectedReads = new LinkedList(); - } - - @Override - public boolean requiresCoordinateSortOrder() { - return false; - } - - @Override - public void signalNoMoreReadsBefore( T read ) { - // NO-OP - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIterator.java deleted file mode 100644 index 118bbbbeb..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIterator.java +++ /dev/null @@ -1,207 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this 
permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMRecordComparator; -import htsjdk.samtools.SAMRecordCoordinateComparator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; - -import java.util.*; - - -/** - * GATKSAMIterator wrapper around our generic reads downsampler interface - * that downsamples reads for each sample independently, and then re-assembles - * the reads back into a single merged stream. 
- * - * @author David Roazen - */ -public class PerSampleDownsamplingReadsIterator implements GATKSAMIterator { - - private GATKSAMIterator nestedSAMIterator; - private ReadsDownsamplerFactory downsamplerFactory; - private Map> perSampleDownsamplers; - private PriorityQueue orderedDownsampledReadsCache; - private SAMRecord nextRead = null; - private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator(); - private SAMRecord earliestPendingRead = null; - private ReadsDownsampler earliestPendingDownsampler = null; - - // Initial size of our cache of finalized reads - private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096; - - // The number of positional changes that can occur in the read stream before all downsamplers - // should be informed of the current position (guards against samples with relatively sparse reads - // getting stuck in a pending state): - private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3; // TODO: experiment with this value - - /** - * @param iter wrapped iterator from which this iterator will pull reads - * @param downsamplerFactory factory used to create new downsamplers as needed - */ - public PerSampleDownsamplingReadsIterator( GATKSAMIterator iter, ReadsDownsamplerFactory downsamplerFactory ) { - nestedSAMIterator = iter; - this.downsamplerFactory = downsamplerFactory; - perSampleDownsamplers = new HashMap>(); - orderedDownsampledReadsCache = new PriorityQueue(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator); - - advanceToNextRead(); - } - - public boolean hasNext() { - return nextRead != null; - } - - public SAMRecord next() { - if ( nextRead == null ) { - throw new NoSuchElementException("next() called when there are no more items"); - } - - SAMRecord toReturn = nextRead; - advanceToNextRead(); - - return toReturn; - } - - private void advanceToNextRead() { - if ( ! readyToReleaseReads() && ! 
fillDownsampledReadsCache() ) { - nextRead = null; - } - else { - nextRead = orderedDownsampledReadsCache.poll(); - } - } - - private boolean readyToReleaseReads() { - if ( orderedDownsampledReadsCache.isEmpty() ) { - return false; - } - - return earliestPendingRead == null || - readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0; - } - - private boolean fillDownsampledReadsCache() { - SAMRecord prevRead = null; - int numPositionalChanges = 0; - - // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue - // can be released without violating global sort order - while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) { - SAMRecord read = nestedSAMIterator.next(); - String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; - - ReadsDownsampler thisSampleDownsampler = perSampleDownsamplers.get(sampleName); - if ( thisSampleDownsampler == null ) { - thisSampleDownsampler = downsamplerFactory.newInstance(); - perSampleDownsamplers.put(sampleName, thisSampleDownsampler); - } - - thisSampleDownsampler.submit(read); - processFinalizedAndPendingItems(thisSampleDownsampler); - - if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) { - numPositionalChanges++; - } - - // Periodically inform all downsamplers of the current position in the read stream. This is - // to prevent downsamplers for samples with sparser reads than others from getting stuck too - // long in a pending state. - if ( numPositionalChanges > 0 && numPositionalChanges % DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL == 0 ) { - for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { - perSampleDownsampler.signalNoMoreReadsBefore(read); - processFinalizedAndPendingItems(perSampleDownsampler); - } - } - - prevRead = read; - } - - if ( ! 
nestedSAMIterator.hasNext() ) { - for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { - perSampleDownsampler.signalEndOfInput(); - if ( perSampleDownsampler.hasFinalizedItems() ) { - orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems()); - } - } - earliestPendingRead = null; - earliestPendingDownsampler = null; - } - - return readyToReleaseReads(); - } - - private void updateEarliestPendingRead( ReadsDownsampler currentDownsampler ) { - // If there is no recorded earliest pending read and this downsampler has pending items, - // then this downsampler's first pending item becomes the new earliest pending read: - if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) { - earliestPendingRead = currentDownsampler.peekPending(); - earliestPendingDownsampler = currentDownsampler; - } - // In all other cases, we only need to update the earliest pending read when the downsampler - // associated with it experiences a change in its pending reads, since by assuming a sorted - // read stream we're assured that each downsampler's earliest pending read will only increase - // in genomic position over time. - // - // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers - // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))), - // TODO: but need to verify this empirically. - else if ( currentDownsampler == earliestPendingDownsampler && - (! 
currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) { - - earliestPendingRead = null; - earliestPendingDownsampler = null; - for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { - if ( perSampleDownsampler.hasPendingItems() && - (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) { - - earliestPendingRead = perSampleDownsampler.peekPending(); - earliestPendingDownsampler = perSampleDownsampler; - } - } - } - } - - private void processFinalizedAndPendingItems( ReadsDownsampler currentDownsampler ) { - if ( currentDownsampler.hasFinalizedItems() ) { - orderedDownsampledReadsCache.addAll(currentDownsampler.consumeFinalizedItems()); - } - updateEarliestPendingRead(currentDownsampler); - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } - - public void close() { - nestedSAMIterator.close(); - } - - public Iterator iterator() { - return this; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsampler.java deleted file mode 100644 index 9263920f9..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsampler.java +++ /dev/null @@ -1,56 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, 
subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -/** - * An extension of the basic downsampler API with reads-specific operations - * - * @author David Roazen - */ -public abstract class ReadsDownsampler extends Downsampler { - - /** - * Does this downsampler require that reads be fed to it in coordinate order? - * - * @return true if reads must be submitted to this downsampler in coordinate order, otherwise false - */ - public abstract boolean requiresCoordinateSortOrder(); - - /** - * Tell this downsampler that no more reads located before the provided read (according to - * the sort order of the read stream) will be fed to it. - * - * Allows position-aware downsamplers to finalize pending reads earlier than they would - * otherwise be able to, particularly when doing per-sample downsampling and reads for - * certain samples are sparser than average. 
- * - * @param read the downsampler will assume that no reads located before this read will ever - * be submitted to it in the future - */ - public abstract void signalNoMoreReadsBefore( final T read ); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsamplerFactory.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsamplerFactory.java deleted file mode 100644 index 9ef847e67..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReadsDownsamplerFactory.java +++ /dev/null @@ -1,38 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -/** - * A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular - * downsampler, all sharing the same construction parameters. - * - * @author David Roazen - */ -public interface ReadsDownsamplerFactory { - public ReadsDownsampler newInstance(); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsampler.java deleted file mode 100644 index 99a0bbd7a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsampler.java +++ /dev/null @@ -1,219 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.*; - -/** - * Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with - * every read in the stream having an equal chance of being selected for inclusion. - * - * An implementation of "Algorithm R" from the paper "Random Sampling with a Reservoir" (Jeffrey Scott Vitter, 1985) - * - * @author David Roazen - */ -public class ReservoirDownsampler extends ReadsDownsampler { - - /** - * size of our reservoir -- ie., the maximum number of reads from the stream that will be retained - * (not including any undiscardable items) - */ - private final int targetSampleSize; - - /** - * if true, this downsampler will be optimized for the case - * where most of the time we won't fill up anything like the - * targetSampleSize elements. If this is false, we will allocate - * internal buffers to targetSampleSize initially, which minimizes - * the cost of allocation if we often use targetSampleSize or more - * elements. - */ - private final boolean expectFewOverflows; - - /** - * At times this can be a linked list or an array list, depending on how we're accessing the - * data and whether or not we're expecting few overflows - */ - private List reservoir; - - /** - * Certain items (eg., reduced reads) cannot be discarded at all during downsampling. We store - * these items separately so as not to impact the fair selection of items for inclusion in the - * reservoir. These items are returned (and cleared) along with any items in the reservoir in - * calls to consumeFinalizedItems(). - */ - private List undiscardableItems; - - /** - * Are we currently using a linked list for the reservoir? 
- */ - private boolean isLinkedList; - - /** - * Count of the number of reads seen that were actually eligible for discarding. Used by the reservoir downsampling - * algorithm to ensure that all discardable reads have an equal chance of making it into the reservoir. - */ - private int totalDiscardableReadsSeen; - - - /** - * Construct a ReservoirDownsampler - * - * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained - * after downsampling will be min(totalDiscardableReads, targetSampleSize) + any - * undiscardable reads (eg., reduced reads). - * - * @param expectFewOverflows if true, this downsampler will be optimized for the case - * where most of the time we won't fill up anything like the - * targetSampleSize elements. If this is false, we will allocate - * internal buffers to targetSampleSize initially, which minimizes - * the cost of allocation if we often use targetSampleSize or more - * elements. - */ - public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows ) { - if ( targetSampleSize <= 0 ) { - throw new ReviewedGATKException("Cannot do reservoir downsampling with a sample size <= 0"); - } - - this.targetSampleSize = targetSampleSize; - this.expectFewOverflows = expectFewOverflows; - clearItems(); - resetStats(); - } - - /** - * Construct a ReservoirDownsampler - * - * @param targetSampleSize Size of the reservoir used by this downsampler. 
Number of items retained - * after downsampling will be min(totalReads, targetSampleSize) - */ - public ReservoirDownsampler ( final int targetSampleSize ) { - this(targetSampleSize, false); - } - - @Override - public void submit ( final T newRead ) { - if ( doNotDiscardItem(newRead) ) { - undiscardableItems.add(newRead); - return; - } - - // Only count reads that are actually eligible for discarding for the purposes of the reservoir downsampling algorithm - totalDiscardableReadsSeen++; - - if ( totalDiscardableReadsSeen <= targetSampleSize ) { - reservoir.add(newRead); - } - else { - if ( isLinkedList ) { - reservoir = new ArrayList(reservoir); - isLinkedList = false; - } - - final int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalDiscardableReadsSeen); - if ( randomSlot < targetSampleSize ) { - reservoir.set(randomSlot, newRead); - } - numDiscardedItems++; - } - } - - @Override - public boolean hasFinalizedItems() { - return ! reservoir.isEmpty() || ! undiscardableItems.isEmpty(); - } - - @Override - public List consumeFinalizedItems() { - if ( ! hasFinalizedItems() ) { - // if there's nothing here, don't bother allocating a new list - return Collections.emptyList(); - } else { - // pass reservoir by reference rather than make a copy, for speed - final List downsampledItems = reservoir; - downsampledItems.addAll(undiscardableItems); - clearItems(); - return downsampledItems; - } - } - - @Override - public boolean hasPendingItems() { - return false; - } - - @Override - public T peekFinalized() { - return ! reservoir.isEmpty() ? reservoir.get(0) : (! undiscardableItems.isEmpty() ? 
undiscardableItems.get(0) : null); - } - - @Override - public T peekPending() { - return null; - } - - @Override - public int size() { - return reservoir.size() + undiscardableItems.size(); - } - - @Override - public void signalEndOfInput() { - // NO-OP - } - - /** - * Clear the data structures used to hold information - */ - @Override - public void clearItems() { - // if we aren't expecting many overflows, allocate a linked list not an arraylist - reservoir = expectFewOverflows ? new LinkedList() : new ArrayList(targetSampleSize); - - // there's no possibility of overflow with the undiscardable items, so we always use a linked list for them - undiscardableItems = new LinkedList<>(); - - // it's a linked list if we allocate one - isLinkedList = expectFewOverflows; - - // an internal stat used by the downsampling process, so not cleared by resetStats() below - totalDiscardableReadsSeen = 0; - } - - @Override - public boolean requiresCoordinateSortOrder() { - return false; - } - - @Override - public void signalNoMoreReadsBefore( T read ) { - // NO-OP - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerFactory.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerFactory.java deleted file mode 100644 index c825bae1f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerFactory.java +++ /dev/null @@ -1,46 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject 
to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -/** - * Factory for creating ReservoirDownsamplers on demand - * - * @author David Roazen - */ -public class ReservoirDownsamplerFactory implements ReadsDownsamplerFactory { - - private int targetSampleSize; - - public ReservoirDownsamplerFactory( int targetSampleSize ) { - this.targetSampleSize = targetSampleSize; - } - - public ReadsDownsampler newInstance() { - return new ReservoirDownsampler(targetSampleSize); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsampler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsampler.java deleted file mode 100644 index af0aa54c0..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsampler.java +++ /dev/null @@ -1,171 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or 
sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -import java.util.*; - -/** - * Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage - * using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time. 
- * - * @author David Roazen - */ -public class SimplePositionalDownsampler extends ReadsDownsampler { - - private final int targetCoverage; - - private final ReservoirDownsampler reservoir; - - private int currentContigIndex; - - private int currentAlignmentStart; - - private boolean positionEstablished; - - private boolean unmappedReadsReached; - - private ArrayList finalizedReads; - - - /** - * Construct a SimplePositionalDownsampler - * - * @param targetCoverage Maximum number of reads that may share any given alignment start position - */ - public SimplePositionalDownsampler( final int targetCoverage ) { - this.targetCoverage = targetCoverage; - reservoir = new ReservoirDownsampler(targetCoverage); - finalizedReads = new ArrayList(); - clearItems(); - resetStats(); - } - - @Override - public void submit( final T newRead ) { - updatePositionalState(newRead); - - if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream - finalizedReads.add(newRead); - } - else { - final int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems(); - // our reservoir downsampler will call doNotDiscardItem() for us to exclude items from elimination as appropriate - reservoir.submit(newRead); - numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems; - } - } - - @Override - public boolean hasFinalizedItems() { - return finalizedReads.size() > 0; - } - - @Override - public List consumeFinalizedItems() { - // pass by reference rather than make a copy, for speed - final List toReturn = finalizedReads; - finalizedReads = new ArrayList(); - return toReturn; - } - - @Override - public boolean hasPendingItems() { - return reservoir.hasFinalizedItems(); - } - - @Override - public T peekFinalized() { - return finalizedReads.isEmpty() ? 
null : finalizedReads.get(0); - } - - @Override - public T peekPending() { - return reservoir.peekFinalized(); - } - - @Override - public int size() { - return finalizedReads.size() + reservoir.size(); - } - - @Override - public void signalEndOfInput() { - finalizeReservoir(); - } - - @Override - public void clearItems() { - reservoir.clearItems(); - reservoir.resetStats(); - finalizedReads.clear(); - positionEstablished = false; - unmappedReadsReached = false; - } - - @Override - public boolean requiresCoordinateSortOrder() { - return true; - } - - @Override - public void signalNoMoreReadsBefore( final T read ) { - updatePositionalState(read); - } - - private void updatePositionalState( final T newRead ) { - if ( readIsPastCurrentPosition(newRead) ) { - if ( reservoir.hasFinalizedItems() ) { - finalizeReservoir(); - } - - setCurrentPosition(newRead); - - if ( newRead.getReadUnmappedFlag() ) { - unmappedReadsReached = true; - } - } - } - - private void setCurrentPosition( final T read ) { - currentContigIndex = read.getReferenceIndex(); - currentAlignmentStart = read.getAlignmentStart(); - positionEstablished = true; - } - - private boolean readIsPastCurrentPosition( final T read ) { - return ! positionEstablished || - read.getReferenceIndex() > currentContigIndex || - read.getAlignmentStart() > currentAlignmentStart || - (read.getReadUnmappedFlag() && ! 
unmappedReadsReached); - } - - private void finalizeReservoir() { - finalizedReads.addAll(reservoir.consumeFinalizedItems()); - reservoir.resetStats(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerFactory.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerFactory.java deleted file mode 100644 index 3fc66cafe..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerFactory.java +++ /dev/null @@ -1,46 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMRecord; - -/** - * Factory for creating SimplePositionalDownsamplers on demand - * - * @author David Roazen - */ -public class SimplePositionalDownsamplerFactory implements ReadsDownsamplerFactory { - - private int targetCoverage; - - public SimplePositionalDownsamplerFactory( int targetCoverage ) { - this.targetCoverage = targetCoverage; - } - - public ReadsDownsampler newInstance() { - return new SimplePositionalDownsampler(targetCoverage); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java deleted file mode 100644 index 293bb1ce5..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/LinearMicroScheduler.java +++ /dev/null @@ -1,130 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.executive; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.providers.ShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.io.DirectOutputTracker; -import org.broadinstitute.gatk.engine.io.OutputTracker; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.engine.traversals.TraversalEngine; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.SampleUtils; -import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; - -import java.util.Collection; - - -/** A micro-scheduling manager for single-threaded execution of a traversal. */ -public class LinearMicroScheduler extends MicroScheduler { - - /** - * A direct output tracker for directly managing output. - */ - private DirectOutputTracker outputTracker = new DirectOutputTracker(); - - /** - * Create a new linear microscheduler to process the given reads and reference. - * - * @param walker Walker for the traversal. - * @param reads Reads file(s) to process. - * @param reference Reference for driving the traversal. - * @param rods Reference-ordered data. 
- */ - protected LinearMicroScheduler(final GenomeAnalysisEngine engine, - final Walker walker, - final SAMDataSource reads, - final IndexedFastaSequenceFile reference, - final Collection rods, - final ThreadAllocation threadAllocation) { - super(engine, walker, reads, reference, rods, threadAllocation); - - if ( threadAllocation.monitorThreadEfficiency() ) - setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor()); - } - - /** - * Run this traversal over the specified subsection of the dataset. - * - * @param walker Computation to perform over dataset. - * @param shardStrategy A strategy for sharding the data. - */ - public Object execute(Walker walker, Iterable shardStrategy) { - super.startingExecution(); - walker.initialize(); - Accumulator accumulator = Accumulator.create(engine,walker); - - boolean done = walker.isDone(); - int counter = 0; - - final TraversalEngine traversalEngine = borrowTraversalEngine(this); - for (Shard shard : shardStrategy ) { - if ( abortExecution() || done || shard == null ) // we ran out of shards that aren't owned - break; - - if(shard.getShardType() == Shard.ShardType.LOCUS) { - WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), - getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); - for(WindowMaker.WindowMakerIterator iterator: windowMaker) { - ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods); - Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); - accumulator.accumulate(dataProvider,result); - dataProvider.close(); - if ( walker.isDone() ) break; - } - windowMaker.close(); - } - else { - ShardDataProvider dataProvider = new ReadShardDataProvider(shard,engine.getGenomeLocParser(),getReadIterator(shard),reference,rods); - Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); - 
accumulator.accumulate(dataProvider,result); - dataProvider.close(); - } - - done = walker.isDone(); - } - - Object result = accumulator.finishTraversal(); - - outputTracker.close(); - returnTraversalEngine(this, traversalEngine); - cleanup(); - executionIsDone(); - - return accumulator; - } - - /** - * @{inheritDoc} - */ - public OutputTracker getOutputTracker() { return outputTracker; } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java deleted file mode 100644 index e192b9a72..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/MicroScheduler.java +++ /dev/null @@ -1,463 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.executive; - -import com.google.java.contract.Ensures; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.ReadMetrics; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.io.OutputTracker; -import org.broadinstitute.gatk.engine.iterators.NullSAMIterator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.engine.traversals.*; -import org.broadinstitute.gatk.engine.walkers.*; -import org.broadinstitute.gatk.utils.AutoFormattingTime; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; -import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; - -import javax.management.JMException; -import javax.management.MBeanServer; -import javax.management.ObjectName; -import java.io.File; -import java.lang.management.ManagementFactory; -import java.util.*; - - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Apr 26, 2009 - * Time: 12:37:23 PM - * - * General base class for all scheduling algorithms - * Shards and schedules data in manageable chunks. - * - * Creates N TraversalEngines for each data thread for the MicroScheduler. This is necessary - * because in the HMS case you have multiple threads executing a traversal engine independently, and - * these engines may need to create separate resources for efficiency or implementation reasons. 
For example, - * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific. - * So each HMS thread needs to have it's own distinct copy of the traversal engine if it wants to have - * N data threads x M nano threads => N * M threads total. These are borrowed from this microscheduler - * and returned when done. Also allows us to tracks all created traversal engines so this microscheduler - * can properly shut them all down when the scheduling is done. - * - */ -public abstract class MicroScheduler implements MicroSchedulerMBean { - protected static final Logger logger = Logger.getLogger(MicroScheduler.class); - - /** - * The list of all Traversal engines we've created in this micro scheduler - */ - final List allCreatedTraversalEngines = new LinkedList(); - - /** - * All available engines. Engines are borrowed and returned when a subclass is actually - * going to execute the engine on some data. This allows us to have N copies for - * N data parallel executions, but without the dangerous code of having local - * ThreadLocal variables. - */ - final LinkedList availableTraversalEngines = new LinkedList(); - - /** - * Engines that have been allocated to a key already. - */ - final HashMap allocatedTraversalEngines = new HashMap(); - - /** - * Counts the number of instances of the class that are currently alive. - */ - private static int instanceNumber = 0; - - /** - * The engine invoking this scheduler. - */ - protected final GenomeAnalysisEngine engine; - - protected final IndexedFastaSequenceFile reference; - - private final SAMDataSource reads; - protected final Collection rods; - - private final MBeanServer mBeanServer; - private final ObjectName mBeanName; - - /** - * Threading efficiency monitor for tracking the resource utilization of the GATK - * - * may be null - */ - ThreadEfficiencyMonitor threadEfficiencyMonitor = null; - - /** - * MicroScheduler factory function. 
Create a microscheduler appropriate for reducing the - * selected walker. - * - * @param walker Which walker to use. - * @param reads the informations associated with the reads - * @param reference the reference file - * @param rods the rods to include in the traversal - * @param threadAllocation Number of threads to utilize. - * - * @return The best-fit microscheduler. - */ - public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if ( threadAllocation.isRunningInParallelMode() ) { - logger.info(String.format("Running the GATK in parallel mode with %d total threads, " + - "%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine", - threadAllocation.getTotalNumThreads(), - threadAllocation.getNumCPUThreadsPerDataThread(), - threadAllocation.getNumDataThreads(), - Runtime.getRuntime().availableProcessors())); - if ( threadAllocation.getTotalNumThreads() > Runtime.getRuntime().availableProcessors() ) - logger.warn(String.format("Number of requested GATK threads %d is more than the number of " + - "available processors on this machine %d", threadAllocation.getTotalNumThreads(), - Runtime.getRuntime().availableProcessors())); - } - - if ( threadAllocation.getNumDataThreads() > 1 ) { - if (walker.isReduceByInterval()) - throw new UserException.BadArgumentValue("nt", String.format("This run of %s is set up to aggregate results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option or check if this tool has an option to disable per-interval calculations.", engine.getWalkerName(walker.getClass()))); - - if ( ! (walker instanceof TreeReducible) ) { - throw badNT("nt", engine, walker); - } - } - - if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! 
(walker instanceof NanoSchedulable) ) { - throw badNT("nct", engine, walker); - } - - if ( threadAllocation.getNumDataThreads() > 1 ) { - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); - } else { - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); - } - } - - private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { - throw new UserException.BadArgumentValue(parallelArg, - String.format("The analysis %s currently does not support parallel execution with %s. " + - "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); - } - - /** - * Create a microscheduler given the reads and reference. - * - * @param walker the walker to execute with - * @param reads The reads. - * @param reference The reference. - * @param rods the rods to include in the traversal - * @param threadAllocation the allocation of threads to use in the underlying traversal - */ - protected MicroScheduler(final GenomeAnalysisEngine engine, - final Walker walker, - final SAMDataSource reads, - final IndexedFastaSequenceFile reference, - final Collection rods, - final ThreadAllocation threadAllocation) { - this.engine = engine; - this.reads = reads; - this.reference = reference; - this.rods = rods; - - final File progressLogFile = engine.getArguments() == null ? null : engine.getArguments().performanceLog; - - // Creates uninitialized TraversalEngines appropriate for walker and threadAllocation, - // and adds it to the list of created engines for later shutdown. 
- for ( int i = 0; i < threadAllocation.getNumDataThreads(); i++ ) { - final TraversalEngine traversalEngine = createTraversalEngine(walker, threadAllocation); - allCreatedTraversalEngines.add(traversalEngine); - availableTraversalEngines.add(traversalEngine); - } - - // Create the progress meter, and register it with the analysis engine - engine.registerProgressMeter(new ProgressMeter(progressLogFile, - availableTraversalEngines.peek().getTraversalUnits(), - engine.getRegionsOfGenomeBeingProcessed())); - - // Now that we have a progress meter, go through and initialize the traversal engines - for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines ) - traversalEngine.initialize(engine, walker, engine.getProgressMeter()); - - // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean. - // To get around this limitation and since we have no job identifier at this point, register a simple counter that - // will count the number of instances of this object that have been created in this JVM. 
- int thisInstance = instanceNumber++; - mBeanServer = ManagementFactory.getPlatformMBeanServer(); - try { - mBeanName = new ObjectName("org.broadinstitute.gatk.engine.executive:type=MicroScheduler,instanceNumber="+thisInstance); - mBeanServer.registerMBean(this, mBeanName); - } - catch (JMException ex) { - throw new ReviewedGATKException("Unable to register microscheduler with JMX", ex); - } - } - - /** - * Really make us a traversal engine of the appropriate type for walker and thread allocation - * - * @return a non-null uninitialized traversal engine - */ - @Ensures("result != null") - private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) { - if (walker instanceof ReadWalker) { - return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()); - } else if (walker instanceof LocusWalker) { - return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()); - } else if (walker instanceof DuplicateWalker) { - return new TraverseDuplicates(); - } else if (walker instanceof ReadPairWalker) { - return new TraverseReadPairs(); - } else if (walker instanceof ActiveRegionWalker) { - return new TraverseActiveRegions(threadAllocation.getNumCPUThreadsPerDataThread()); - } else { - throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); - } - } - - - /** - * Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one - * - * @return the monitor, or null if none is active - */ - public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() { - return threadEfficiencyMonitor; - } - - /** - * Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses - * - * @param threadEfficiencyMonitor - */ - public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) { - this.threadEfficiencyMonitor = threadEfficiencyMonitor; - } - - /** - * Should we 
stop all execution work and exit gracefully? - * - * Returns true in the case where some external signal or time limit has been received, indicating - * that this GATK shouldn't continue executing. This isn't a kill signal, it is really a "shutdown - * gracefully at the next opportunity" signal. Concrete implementations of the MicroScheduler - * examine this value as often as reasonable and, if it returns true, stop what they are doing - * at the next available opportunity, shutdown their resources, call notify done, and return. - * - * @return true if we should abort execution, or false otherwise - */ - protected boolean abortExecution() { - final boolean abort = engine.exceedsRuntimeLimit(); - if ( abort ) { - final AutoFormattingTime aft = new AutoFormattingTime(engine.getRuntimeLimitInNanoseconds(), -1, 4); - logger.info("Aborting execution (cleanly) because the runtime has exceeded the requested maximum " + aft); - } - return abort; - } - - /** - * Walks a walker over the given list of intervals. - * - * @param walker Computation to perform over dataset. - * @param shardStrategy A strategy for sharding the data. - * - * @return the return type of the walker - */ - public abstract Object execute(Walker walker, Iterable shardStrategy); - - /** - * Tells this MicroScheduler that the execution of one of the subclass of this object as started - * - * Must be called when the implementation of execute actually starts up - * - * Currently only starts the progress meter timer running, but other start up activities could be incorporated - */ - protected void startingExecution() { - engine.getProgressMeter().start(); - } - - /** - * Retrieves the object responsible for tracking and managing output. - * @return An output tracker, for loading data in and extracting results. Will not be null. - */ - public abstract OutputTracker getOutputTracker(); - - /** - * Gets the an iterator over the given reads, which will iterate over the reads in the given shard. 
- * @param shard the shard to use when querying reads. - * @return an iterator over the reads specified in the shard. - */ - protected GATKSAMIterator getReadIterator(Shard shard) { - return (!reads.isEmpty()) ? reads.seek(shard) : new NullSAMIterator(); - } - - /** - * Must be called by subclasses when execute is done - */ - protected void executionIsDone() { - engine.getProgressMeter().notifyDone(engine.getCumulativeMetrics().getNumIterations()); - printReadFilteringStats(); - shutdownTraversalEngines(); - - // Print out the threading efficiency of this HMS, if state monitoring is enabled - if ( threadEfficiencyMonitor != null ) { - // include the master thread information - threadEfficiencyMonitor.threadIsDone(Thread.currentThread()); - threadEfficiencyMonitor.printUsageInformation(logger); - } - } - - /** - * Shutdown all of the created engines, and clear the list of created engines, dropping - * pointers to the traversal engines - */ - public synchronized void shutdownTraversalEngines() { - for ( final TraversalEngine te : allCreatedTraversalEngines) - te.shutdown(); - - allCreatedTraversalEngines.clear(); - availableTraversalEngines.clear(); - } - - /** - * Prints out information about number of reads observed and filtering, if any reads were used in the traversal - * - * Looks like: - * - * INFO 10:40:47,370 MicroScheduler - 22 reads were filtered out during traversal out of 101 total (21.78%) - * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing BadMateFilter - * INFO 10:40:47,370 MicroScheduler - -> 20 reads (19.80% of total) failing DuplicateReadFilter - * INFO 10:40:47,370 MicroScheduler - -> 1 reads (0.99% of total) failing FailsVendorQualityCheckFilter - */ - private void printReadFilteringStats() { - final ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); - if ( cumulativeMetrics.getNumReadsSeen() > 0 ) { - // count up the number of skipped reads by summing over all filters - long nSkippedReads = 0L; - for ( final 
long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) - nSkippedReads += countsByFilter; - - logger.info(String.format("%d reads were filtered out during the traversal out of approximately %d total reads (%.2f%%)", - nSkippedReads, - cumulativeMetrics.getNumReadsSeen(), - 100.0 * MathUtils.ratio(nSkippedReads, cumulativeMetrics.getNumReadsSeen()))); - - for ( final Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { - long count = filterCounts.getValue(); - logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", - count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); - } - } - } - - /** - * Gets the engine that created this microscheduler. - * @return The engine owning this microscheduler. - */ - public GenomeAnalysisEngine getEngine() { return engine; } - - /** - * Returns data source maintained by this scheduler - * @return - */ - public SAMDataSource getSAMDataSource() { return reads; } - - /** - * Returns the reference maintained by this scheduler. - * @return The reference maintained by this scheduler. - */ - public IndexedFastaSequenceFile getReference() { return reference; } - - protected void cleanup() { - try { - mBeanServer.unregisterMBean(mBeanName); - } - catch (JMException ex) { - throw new ReviewedGATKException("Unable to unregister microscheduler with JMX", ex); - } - } - - /** - * Returns a traversal engine suitable for use, associated with key - * - * Key is an arbitrary object that is used to retrieve the same traversal - * engine over and over. This can be important in the case where the - * traversal engine has data associated with it in some other context, - * and we need to ensure that the context always sees the same traversal - * engine. This happens in the HierarchicalMicroScheduler, where you want - * the a thread executing traversals to retrieve the same engine each time, - * as outputs are tracked w.r.t. that engine. 
- * - * If no engine is associated with key yet, pops the next available engine - * from the available ones maintained by this - * microscheduler. Note that it's a runtime error to pop a traversal engine - * from this scheduler if there are none available. Callers that - * once pop'd an engine for use must return it with returnTraversalEngine - * - * @param key the key to associate with this engine - * @return a non-null TraversalEngine suitable for execution in this scheduler - */ - @Ensures("result != null") - protected synchronized TraversalEngine borrowTraversalEngine(final Object key) { - if ( key == null ) throw new IllegalArgumentException("key cannot be null"); - - final TraversalEngine engine = allocatedTraversalEngines.get(key); - if ( engine == null ) { - if ( availableTraversalEngines.isEmpty() ) - throw new IllegalStateException("no traversal engines were available"); - allocatedTraversalEngines.put(key, availableTraversalEngines.pop()); - return allocatedTraversalEngines.get(key); - } else { - return engine; - } - } - - /** - * Return a borrowed traversal engine to this MicroScheduler, for later use - * in another traversal execution - * - * @param key the key used to id the engine, provided to the borrowTraversalEngine function - * @param traversalEngine the borrowed traversal engine. Must have been previously borrowed. - */ - protected synchronized void returnTraversalEngine(final Object key, final TraversalEngine traversalEngine) { - if ( traversalEngine == null ) - throw new IllegalArgumentException("Attempting to push a null traversal engine"); - if ( ! allCreatedTraversalEngines.contains(traversalEngine) ) - throw new IllegalArgumentException("Attempting to push a traversal engine not created by this MicroScheduler" + engine); - if ( ! 
allocatedTraversalEngines.containsKey(key) ) - throw new IllegalArgumentException("No traversal engine was never checked out with key " + key); - - // note there's nothing to actually do here, but a function implementation - // might want to do something - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java deleted file mode 100644 index c8483298b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/executive/WindowMaker.java +++ /dev/null @@ -1,217 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.executive; - -import htsjdk.samtools.util.PeekableIterator; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.engine.iterators.GATKSAMRecordIterator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.locusiterator.LocusIterator; -import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.Collection; -import java.util.Iterator; -import java.util.List; -import java.util.NoSuchElementException; - -/** - * Transforms an iterator of reads which overlap the given interval list into an iterator of covered single-base loci - * completely contained within the interval list. To do this, it creates a LocusIteratorByState which will emit a single-bp - * locus for every base covered by the read iterator, then uses the WindowMakerIterator.advance() to filter down that stream of - * loci to only those covered by the given interval list. 
- * - * Example: - * Incoming stream of reads: A:chr20:1-5, B:chr20:2-6, C:chr20:2-7, D:chr20:3-8, E:chr20:5-10 - * Incoming intervals: chr20:3-7 - * - * Locus iterator by state will produce the following stream of data: - * chr1:1 {A}, chr1:2 {A,B,C}, chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, - * chr1:6 {B,C,D,E}, chr1:7 {C,D,E}, chr1:8 {D,E}, chr1:9 {E}, chr1:10 {E} - * - * WindowMakerIterator will then filter the incoming stream, emitting the following stream: - * chr1:3 {A,B,C,D}, chr1:4 {A,B,C,D}, chr1:5 {A,B,C,D,E}, chr1:6 {B,C,D,E}, chr1:7 {C,D,E} - * - * @author mhanna - * @version 0.1 - */ -public class WindowMaker implements Iterable, Iterator { - /** - * Source information for iteration. - */ - private final ReadProperties sourceInfo; - - /** - * Hold the read iterator so that it can be closed later. - */ - private final GATKSAMRecordIterator readIterator; - - /** - * The data source for reads. Will probably come directly from the BAM file. - */ - private final PeekableIterator sourceIterator; - - /** - * Stores the sequence of intervals that the windowmaker should be tracking. - */ - private final PeekableIterator intervalIterator; - - /** - * In the case of monolithic sharding, this case returns whether the only shard has been generated. - */ - private boolean shardGenerated = false; - - /** - * The alignment context to return from this shard's iterator. Lazy implementation: the iterator will not find the - * currentAlignmentContext until absolutely required to do so. If currentAlignmentContext is null and advance() - * doesn't populate it, no more elements are available. If currentAlignmentContext is non-null, currentAlignmentContext - * should be returned by next(). - */ - private AlignmentContext currentAlignmentContext; - - /** - * Create a new window maker with the given iterator as a data source, covering - * the given intervals. - * @param iterator The data source for this window. 
- * @param intervals The set of intervals over which to traverse. - * @param sampleNames The complete set of sample names in the reads in shard - */ - - private final LocusIteratorByState libs; - - public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator iterator, List intervals, Collection sampleNames) { - this.sourceInfo = shard.getReadProperties(); - this.readIterator = new GATKSAMRecordIterator(iterator); - - this.libs = new LocusIteratorByState(readIterator,sourceInfo,genomeLocParser,sampleNames); - this.sourceIterator = new PeekableIterator(libs); - - this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; - } - - public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, GATKSAMIterator iterator, List intervals ) { - this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); - } - - public Iterator iterator() { - return this; - } - - public boolean hasNext() { - return (intervalIterator != null && intervalIterator.hasNext()) || !shardGenerated; - } - - public WindowMakerIterator next() { - shardGenerated = true; - return new WindowMakerIterator(intervalIterator != null ? intervalIterator.next() : null); - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a window maker."); - } - - public void close() { - this.readIterator.close(); - } - - public class WindowMakerIterator extends LocusIterator { - /** - * The locus for which this iterator is currently returning reads. 
- */ - private final GenomeLoc locus; - - public WindowMakerIterator(GenomeLoc locus) { - this.locus = locus; - advance(); - } - - public ReadProperties getSourceInfo() { - return sourceInfo; - } - - public GenomeLoc getLocus() { - return locus; - } - - public WindowMakerIterator iterator() { - return this; - } - - public boolean hasNext() { - advance(); - return currentAlignmentContext != null; - } - - public AlignmentContext next() { - if(!hasNext()) throw new NoSuchElementException("WindowMakerIterator is out of elements for this interval."); - - // Consume this alignment context. - AlignmentContext toReturn = currentAlignmentContext; - currentAlignmentContext = null; - - // Return the current element. - return toReturn; - } - - private void advance() { - // Need to find the next element that is not past shard boundaries. If we travel past the edge of - // shard boundaries, stop and let the next interval pick it up. - while(currentAlignmentContext == null && sourceIterator.hasNext()) { - // Advance the iterator and try again. - AlignmentContext candidateAlignmentContext = sourceIterator.peek(); - - if(locus == null) { - // No filter present. Return everything that LocusIteratorByState provides us. - currentAlignmentContext = sourceIterator.next(); - } - else if(locus.isPast(candidateAlignmentContext.getLocation())) - // Found a locus before the current window; claim this alignment context and throw it away. - sourceIterator.next(); - else if(locus.containsP(candidateAlignmentContext.getLocation())) { - // Found a locus within the current window; claim this alignment context and call it the next entry. - currentAlignmentContext = sourceIterator.next(); - } - else if(locus.isBefore(candidateAlignmentContext.getLocation())) { - // Whoops. Skipped passed the end of the region. Iteration for this window is complete. Do - // not claim this alignment context in case it is part of the next shard. 
- break; - } - else - throw new ReviewedGATKException("BUG: filtering locus does not contain, is not before, and is not past the given alignment context"); - } - } - - @Override - public LocusIteratorByState getLIBS() { - return libs; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java deleted file mode 100644 index 59c3f151b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/FilterManager.java +++ /dev/null @@ -1,95 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.filters; - -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.help.GATKDocUtils; -import org.broadinstitute.gatk.utils.help.HelpConstants; - -import java.util.Collection; -import java.util.List; - -/** - * Manage filters and filter options. Any requests for basic filtering classes - * should ultimately be made through this class. - * - * @author mhanna - * @version 0.1 - */ -public class FilterManager extends PluginManager { - public FilterManager() { - super(ReadFilter.class,"filter","Filter"); - } - - /** - * Instantiate a filter of the given type. Along the way, scream bloody murder if - * the filter is not available. - * @param filterType The type of the filter - * @return The filter - */ - public ReadFilter createFilterByType(Class filterType) { - return this.createByName(getName(filterType)); - } - - public Collection> getValues() { - return this.getPlugins(); - } - - /** - * Rather than use the default error message, print out a list of read filters as well. - * @param pluginCategory - string, the category of the plugin (e.g. read filter) - * @param pluginName - string, what we were trying to match (but failed to) - * @return - A wall of text with the default message, followed by a listing of available read filters - */ - @Override - protected String formatErrorMessage(String pluginCategory, String pluginName) { - List> availableFilters = this.getPluginsImplementing(ReadFilter.class); - - - return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName, - userFriendlyListofReadFilters(availableFilters), - "Please consult the GATK Documentation (" + HelpConstants.GATK_DOCS_URL + ") for more information."); - } - - private String userFriendlyListofReadFilters(List> filters) { - final String headName = "FilterName", headDoc = "Documentation"; - int longestNameLength = -1; - for ( Class < ? 
extends ReadFilter> filter : filters ) { - longestNameLength = Math.max(longestNameLength,this.getName(filter).length()); - } - String format = " %"+longestNameLength+"s %s%n"; - - StringBuilder listBuilder = new StringBuilder(); - listBuilder.append(String.format(format,headName,headDoc)); - for ( Class filter : filters ) { - String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter); - String filterName = this.getName(filter); - listBuilder.append(String.format(format,filterName,helpLink)); - } - - return listBuilder.toString(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java deleted file mode 100644 index 1b59a06d8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilter.java +++ /dev/null @@ -1,260 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.*; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.Collections; - -/** - * Filter out malformed reads. - * - * @author mhanna - * @version 0.1 - */ -public class MalformedReadFilter extends ReadFilter { - - - private static final String FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME = "filter_reads_with_N_cigar" ; - - private SAMFileHeader header; - - @Argument(fullName = FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME, shortName = "filterRNC", doc = "filter out reads with CIGAR containing the N operator, instead of stop processing and report an error.", required = false) - boolean filterReadsWithNCigar = false; - - - @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false) - boolean filterMismatchingBaseAndQuals = false; - - @Argument(fullName = "filter_bases_not_stored", shortName = "filterNoBases", doc = "if a read has no stored bases (i.e. 
a '*'), filter out the read instead of blowing up.", required = false) - boolean filterBasesNotStored = false; - - /** - * Indicates the applicable validation exclusions - */ - private boolean allowNCigars; - - @Override - public void initialize(final GenomeAnalysisEngine engine) { - header = engine.getSAMFileHeader(); - ValidationExclusion validationExclusions = null; - final SAMDataSource rds = engine.getReadsDataSource(); - if (rds != null) { - final ReadProperties rps = rds.getReadsInfo(); - if (rps != null) { - validationExclusions = rps.getValidationExclusionList(); - } - } - if (validationExclusions == null) { - allowNCigars = false; - } else { - allowNCigars = validationExclusions.contains(ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS); - } - } - - public boolean filterOut(final SAMRecord read) { - // slowly changing the behavior to blow up first and filtering out if a parameter is explicitly provided - return !checkInvalidAlignmentStart(read) || - !checkInvalidAlignmentEnd(read) || - !checkAlignmentDisagreesWithHeader(this.header,read) || - !checkHasReadGroup(read) || - !checkMismatchingBasesAndQuals(read, filterMismatchingBaseAndQuals) || - !checkCigarDisagreesWithAlignment(read) || - !checkSeqStored(read, filterBasesNotStored) || - !checkCigarIsSupported(read,filterReadsWithNCigar,allowNCigars); - } - - private static boolean checkHasReadGroup(final SAMRecord read) { - if ( read.getReadGroup() == null ) { - // there are 2 possibilities: either the RG tag is missing or it is not defined in the header - final String rgID = (String)read.getAttribute(SAMTagUtil.getSingleton().RG); - if ( rgID == null ) - throw new UserException.ReadMissingReadGroup(read); - throw new UserException.ReadHasUndefinedReadGroup(read, rgID); - } - return true; - } - - /** - * Check for the case in which the alignment start is inconsistent with the read unmapped flag. - * @param read The read to validate. - * @return true if read start is valid, false otherwise. 
- */ - private static boolean checkInvalidAlignmentStart(final SAMRecord read ) { - // read is not flagged as 'unmapped', but alignment start is NO_ALIGNMENT_START - if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START ) - return false; - // Read is not flagged as 'unmapped', but alignment start is -1 - if( !read.getReadUnmappedFlag() && read.getAlignmentStart() == -1 ) - return false; - return true; - } - - /** - * Check for invalid end of alignments. - * @param read The read to validate. - * @return true if read end is valid, false otherwise. - */ - private static boolean checkInvalidAlignmentEnd(final SAMRecord read ) { - // Alignment aligns to negative number of bases in the reference. - if( !read.getReadUnmappedFlag() && read.getAlignmentEnd() != -1 && (read.getAlignmentEnd()-read.getAlignmentStart()+1)<0 ) - return false; - return true; - } - - /** - * Check to ensure that the alignment makes sense based on the contents of the header. - * @param header The SAM file header. - * @param read The read to verify. - * @return true if alignment agrees with header, false othrewise. - */ - private static boolean checkAlignmentDisagreesWithHeader(final SAMFileHeader header, final SAMRecord read ) { - // Read is aligned to nonexistent contig - if( read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START ) - return false; - final SAMSequenceRecord contigHeader = header.getSequence( read.getReferenceIndex() ); - // Read is aligned to a point after the end of the contig - if( !read.getReadUnmappedFlag() && read.getAlignmentStart() > contigHeader.getSequenceLength() ) - return false; - return true; - } - - /** - * Check for inconsistencies between the cigar string and the - * @param read The read to validate. - * @return true if cigar agrees with alignment, false otherwise. 
- */ - private static boolean checkCigarDisagreesWithAlignment(final SAMRecord read) { - // Read has a valid alignment start, but the CIGAR string is empty - if( !read.getReadUnmappedFlag() && - read.getAlignmentStart() != -1 && - read.getAlignmentStart() != SAMRecord.NO_ALIGNMENT_START && - read.getAlignmentBlocks().size() < 0 ) - return false; - return true; - } - - /** - * Check for unsupported CIGAR operators. - * Currently the N operator is not supported. - * @param read The read to validate. - * @param filterReadsWithNCigar whether the offending read should just - * be silently filtered or not. - * @param allowNCigars whether reads that contain N operators in their CIGARs - * can be processed or an exception should be thrown instead. - * @throws UserException.UnsupportedCigarOperatorException - * if {@link #filterReadsWithNCigar} is false and - * the input read has some unsupported operation. - * @return true if the read CIGAR operations are - * fully supported, otherwise false, as long as - * no exception has been thrown. - */ - private static boolean checkCigarIsSupported(final SAMRecord read, final boolean filterReadsWithNCigar, final boolean allowNCigars) { - if( containsNOperator(read)) { - if (! filterReadsWithNCigar && !allowNCigars) { - throw new UserException.UnsupportedCigarOperatorException( - CigarOperator.N,read, - "Perhaps you are" - + " trying to use RNA-Seq data?" - + " While we are currently actively working to" - + " support this data type unfortunately the" - + " GATK cannot be used with this data in its" - + " current form. 
You have the option of either" - + " filtering out all reads with operator " - + CigarOperator.N + " in their CIGAR string" - + " (please add --" - + FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME - + " to your command line) or" - + " assume the risk of processing those reads as they" - + " are including the pertinent unsafe flag (please add -U" - + ' ' + ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS - + " to your command line). Notice however that if you were" - + " to choose the latter, an unspecified subset of the" - + " analytical outputs of an unspecified subset of the tools" - + " will become unpredictable. Consequently the GATK team" - + " might well not be able to provide you with the usual support" - + " with any issue regarding any output"); - } - return ! filterReadsWithNCigar; - } - return true; - } - - private static boolean containsNOperator(final SAMRecord read) { - final Cigar cigar = read.getCigar(); - if (cigar == null) { - return false; - } - for (final CigarElement ce : cigar.getCigarElements()) { - if (ce.getOperator() == CigarOperator.N) { - return true; - } - } - return false; - } - - /** - * Check if the read has the same number of bases and base qualities - * @param read the read to validate - * @return true if they have the same number. False otherwise. - */ - private static boolean checkMismatchingBasesAndQuals(final SAMRecord read, final boolean filterMismatchingBaseAndQuals) { - final boolean result; - if (read.getReadLength() == read.getBaseQualities().length) - result = true; - else if (filterMismatchingBaseAndQuals) - result = false; - else - throw new UserException.MalformedBAM(read, - String.format("BAM file has a read with mismatching number of bases and base qualities. Offender: %s [%d bases] [%d quals].%s", - read.getReadName(), read.getReadLength(), read.getBaseQualities().length, - read.getBaseQualities().length == 0 ? 
" You can use --defaultBaseQualities to assign a default base quality for all reads, but this can be dangerous in you don't know what you are doing." : "")); - - return result; - } - - /** - * Check if the read has its base sequence stored - * @param read the read to validate - * @return true if the sequence is stored and false otherwise ("*" in the SEQ field). - */ - protected static boolean checkSeqStored(final SAMRecord read, final boolean filterBasesNotStored) { - - if ( read.getReadBases() != SAMRecord.NULL_SEQUENCE ) - return true; - - if ( filterBasesNotStored ) - return false; - - throw new UserException.MalformedBAM(read, String.format("the BAM file has a read with no stored bases (i.e. it uses '*') which is not supported in the GATK; see the --filter_bases_not_stored argument. Offender: %s", read.getReadName())); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/GATKSAMFileWriter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/GATKSAMFileWriter.java deleted file mode 100644 index c60aae842..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/GATKSAMFileWriter.java +++ /dev/null @@ -1,56 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileWriter; - -/** - * A writer that will allow unsorted BAM files to be written - * and sorted on-the-fly. - * - * @author mhanna - * @version 0.1 - */ -public interface GATKSAMFileWriter extends SAMFileWriter { - /** - * Writes the given custom header to SAM file output. - * @param header The header to write. - */ - public void writeHeader(SAMFileHeader header); - - /** - * Set Whether the BAM file to create is actually presorted. - * @param presorted True if the BAM file is presorted. False otherwise. - */ - public void setPresorted(boolean presorted); - - /** - * Set how many records in RAM the BAM file stores when sorting on-the-fly. - * @param maxRecordsInRam Max number of records in RAM. 
- */ - public void setMaxRecordsInRam(int maxRecordsInRam); -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java deleted file mode 100644 index c4f776915..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/storage/VariantContextWriterStorage.java +++ /dev/null @@ -1,228 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.io.storage; - -import htsjdk.samtools.util.BlockCompressedOutputStream; -import org.apache.log4j.Logger; -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub; -import org.broadinstitute.gatk.engine.refdata.tracks.FeatureManager; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import htsjdk.variant.bcf2.BCF2Utils; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.writer.Options; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; -import htsjdk.variant.vcf.VCFHeader; - -import java.io.*; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.List; - -/** - * Provides temporary and permanent storage for genotypes in VCF format. - * - * @author mhanna - * @version 0.1 - */ -public class VariantContextWriterStorage implements Storage, VariantContextWriter { - /** - * our log, which we want to capture anything from this class - */ - private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class); - - private final static int BUFFER_SIZE = 1048576; - - protected final File file; - protected OutputStream stream; - protected final VariantContextWriter writer; - boolean closed = false; - - /** - * Constructs an object which will write directly into the output file provided by the stub. - * Intentionally delaying the writing of the header -- this should be filled in by the walker. - * - * Respecs the isCompressed() request in stub, so if isCompressed() is true then this - * will create a storage output that dumps output to a BlockCompressedOutputStream. - * - * @param stub Stub to use when constructing the output file. 
- */ - public VariantContextWriterStorage(VariantContextWriterStub stub) { - if ( stub.getOutputFile() != null ) { - this.file = stub.getOutputFile(); - writer = vcfWriterToFile(stub,stub.getOutputFile(),true,true); - } - else if ( stub.getOutputStream() != null ) { - this.file = null; - this.stream = stub.getOutputStream(); - writer = VariantContextWriterFactory.create(stream, - stub.getMasterSequenceDictionary(), stub.getWriterOptions(false)); - } - else - throw new ReviewedGATKException("Unable to create target to which to write; storage was provided with neither a file nor a stream."); - } - - /** - * Constructs an object which will redirect into a different file. - * - * Note that this function does not respect the isCompressed() request from the stub, in order - * to ensure that tmp. files can be read back in by the Tribble system, and merged with the mergeInto function. - * - * @param stub Stub to use when synthesizing file / header info. - * @param tempFile File into which to direct the output data. - */ - public VariantContextWriterStorage(VariantContextWriterStub stub, File tempFile) { - //logger.debug("Creating temporary output file " + tempFile.getAbsolutePath() + " for VariantContext output."); - this.file = tempFile; - this.writer = vcfWriterToFile(stub, file, false, false); - writer.writeHeader(stub.getVCFHeader()); - } - - /** - * common initialization routine for multiple constructors - * @param stub Stub to use when constructing the output file. - * @param file Target file into which to write VCF records. - * @param indexOnTheFly true to index the file on the fly. NOTE: will be forced to false for compressed files. - * @param allowCompressed if false, we won't compress the output, even if the stub requests it. Critical - * for creating temp. 
output files that will be subsequently merged, as these do not - * support compressed output - * @return A VCF writer for use with this class - */ - private VariantContextWriter vcfWriterToFile(final VariantContextWriterStub stub, - final File file, - final boolean indexOnTheFly, - final boolean allowCompressed) { - try { - // we cannot merge compressed outputs, so don't compress if allowCompressed is false, - // which is the case when we have a temporary output file for later merging - if ( allowCompressed && stub.isCompressed() ) - stream = new BlockCompressedOutputStream(file); - else - stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE)); - } - catch(IOException ex) { - throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex); - } - - EnumSet options = stub.getWriterOptions(indexOnTheFly); - VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options); - - // if the stub says to test BCF, create a secondary writer to BCF and an 2 way out writer to send to both - // TODO -- remove me when argument generateShadowBCF is removed - if ( stub.alsoWriteBCFForTest() && ! VariantContextWriterFactory.isBCFOutput(file, options)) { - final File bcfFile = BCF2Utils.shadowBCF(file); - if ( bcfFile != null ) { - FileOutputStream bcfStream; - try { - bcfStream = new FileOutputStream(bcfFile); - } catch (FileNotFoundException e) { - throw new RuntimeException(bcfFile + ": Unable to create BCF writer", e); - } - - VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, bcfStream, stub.getMasterSequenceDictionary(), stub.getIndexCreator(), options); - writer = new TestWriter(writer, bcfWriter); - } - } - - return writer; - } - - private final static class TestWriter implements VariantContextWriter { - final List writers; - - private TestWriter(final VariantContextWriter ... 
writers) { - this.writers = Arrays.asList(writers); - } - - @Override - public void writeHeader(final VCFHeader header) { - for ( final VariantContextWriter writer : writers ) writer.writeHeader(header); - } - - @Override - public void close() { - for ( final VariantContextWriter writer : writers ) writer.close(); - } - - @Override - public void add(final VariantContext vc) { - for ( final VariantContextWriter writer : writers ) writer.add(vc); - } - } - - public void add(VariantContext vc) { - if ( closed ) throw new ReviewedGATKException("Attempting to write to a closed VariantContextWriterStorage " + vc.getStart() + " storage=" + this); - writer.add(vc); - } - - /** - * initialize this VCF header - * - * @param header the header - */ - public void writeHeader(VCFHeader header) { - writer.writeHeader(header); - } - - /** - * Close the VCF storage object. - */ - public void close() { - writer.close(); - closed = true; - } - - public void mergeInto(VariantContextWriterStorage target) { - try { - if ( ! closed ) - throw new ReviewedGATKException("Writer not closed, but we are merging into the file!"); - final String targetFilePath = target.file != null ? 
target.file.getAbsolutePath() : "/dev/stdin"; - logger.debug(String.format("Merging VariantContextWriterStorage from %s into %s", file.getAbsolutePath(), targetFilePath)); - - // use the feature manager to determine the right codec for the tmp file - // that way we don't assume it's a specific type - final FeatureManager.FeatureDescriptor fd = new FeatureManager().getByFiletype(file); - if ( fd == null ) - throw new UserException.LocalParallelizationProblem(file); - - final FeatureCodec codec = fd.getCodec(); - final AbstractFeatureReader source = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), codec, false); - - for ( final Feature vc : source.iterator() ) { - target.writer.add((VariantContext) vc); - } - - source.close(); - file.delete(); // this should be last to aid in debugging when the process fails - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java deleted file mode 100644 index c45432471..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterArgumentTypeDescriptor.java +++ /dev/null @@ -1,106 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission 
notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io.stubs; - -import htsjdk.samtools.SAMFileWriter; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.io.OutputStream; -import java.lang.reflect.Type; - -/** - * Insert a SAMFileWriterStub instead of a full-fledged concrete OutputStream implementations. - */ -public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { - - /** - * The engine into which output stubs should be fed. - */ - private final GenomeAnalysisEngine engine; - - /** - * The default location to which data should be written if the user specifies no such location. - */ - private final OutputStream defaultOutputStream; - - /** - * Create a new SAMFileWriter argument, notifying the given engine when that argument has been created. - * @param engine Engine to add SAMFileWriter output to. 
- * @param defaultOutputStream the target for the data - */ - public SAMFileWriterArgumentTypeDescriptor( GenomeAnalysisEngine engine, OutputStream defaultOutputStream ) { - this.engine = engine; - this.defaultOutputStream = defaultOutputStream; - } - - @Override - public boolean supports( Class type ) { - return SAMFileWriter.class.equals(type) || GATKSAMFileWriter.class.equals(type); - } - - @Override - public boolean createsTypeDefault(ArgumentSource source) { - return !source.isRequired() && source.defaultsToStdout(); - } - - @Override - public String typeDefaultDocString(ArgumentSource source) { - return "stdout"; - } - - @Override - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { - if(source.isRequired() || !source.defaultsToStdout()) - throw new ReviewedGATKException("BUG: tried to create type default for argument type descriptor that can't support a type default."); - SAMFileWriterStub stub = new SAMFileWriterStub(engine,defaultOutputStream); - engine.addOutput(stub); - return stub; - } - - @Override - public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { - // Extract all possible parameters that could be passed to a BAM file writer? - ArgumentDefinition bamArgumentDefinition = createDefaultArgumentDefinition(source); - ArgumentMatchValue writerFileName = getArgumentValue( bamArgumentDefinition, matches ); - - // Create the stub - SAMFileWriterStub stub = null; // stub = new SAMFileWriterStub(engine, defaultOutputStream); - - if (writerFileName != null && writerFileName.asFile() != null ) { - stub = new SAMFileWriterStub(engine, writerFileName.asFile()); - - // WARNING: Side effects required by engine! 
- parsingEngine.addTags(stub,getArgumentTags(matches)); - engine.addOutput(stub); - } - - return stub; - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java deleted file mode 100644 index cc814e9e6..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/SAMFileWriterStub.java +++ /dev/null @@ -1,336 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.io.stubs; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileWriter; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.ProgressLoggerInterface; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.io.OutputTracker; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.utils.baq.BAQ; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.io.File; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.List; - -/** - * A stub for routing and management of SAM file reading and writing. - * - * @author mhanna - * @version 0.1 - */ -public class SAMFileWriterStub implements Stub, GATKSAMFileWriter { - /** - * Engine to use for collecting attributes for the output SAM file. - */ - private final GenomeAnalysisEngine engine; - - /** - * A header supplied by the user that overrides the merged header from the input BAM. - */ - private SAMFileHeader headerOverride = null; - - /** - * The sam file that this stub should write to. Should be passed along to - * whatever happens to create the StreamConnector. - */ - private final File samFile; - - /** - * The target output stream, to be used in place of the SAM file. - */ - private final OutputStream samOutputStream; - - /** - * The validation stringency to apply when reading this file. - */ - private Integer compressionLevel = null; - - /** - * Should the GATK index the output BAM on-the-fly? - */ - private boolean indexOnTheFly = false; - - /** - * Should the GATK generate an md5 for the output BAM? 
- */ - private boolean generateMD5 = false; - - /** - * Should this BAM be presorted? - */ - private boolean presorted = true; - - /** - * How many records should the BAM writer store in RAM while - * sorting the BAM on-the-fly? - */ - private Integer maxRecordsInRam = null; - - /** - * Connects this stub with an external stream capable of serving the - * requests of the consumer of this stub. - */ - private OutputTracker outputTracker = null; - - /** - * Has the write started? If so, throw an exception if someone tries to - * change write parameters to the file (compression level, presorted flag, - * header, etc). - */ - private boolean writeStarted = false; - - - /** - * HMM for BAQ, if needed - */ - BAQ baqHMM = new BAQ(); - - /** - * Should we simplify the BAM file while writing it out? - */ - private boolean simplifyBAM = false; - - private List onOutputReadTransformers = null; - - /** - * Create a new stub given the requested SAM file and compression level. - * @param engine source of header data, maybe other data about input files. - * @param samFile SAM file to (ultimately) create. - */ - public SAMFileWriterStub( GenomeAnalysisEngine engine, File samFile ) { - this(engine, samFile, null); - } - - /** - * Create a new stub given the requested SAM file and compression level. - * @param engine source of header data, maybe other data about input files. - * @param stream Output stream to which data should be written. - */ - public SAMFileWriterStub( GenomeAnalysisEngine engine, OutputStream stream ) { - this(engine, null, stream); - } - - private SAMFileWriterStub(final GenomeAnalysisEngine engine, final File samFile, final OutputStream stream) { - this.engine = engine; - this.samFile = samFile; - this.samOutputStream = stream; - } - - /** - * Retrieves the SAM file to (ultimately) be created. - * @return The SAM file. Must not be null. 
- */ - public File getOutputFile() { - return samFile; - } - - public boolean simplifyBAM() { - return simplifyBAM; - } - - public void setSimplifyBAM(boolean v) { - simplifyBAM = v; - } - - public OutputStream getOutputStream() { - return samOutputStream; - } - - /** - * Retrieves the header to use when creating the new SAM file. - * @return header to use when creating the new SAM file. - */ - public SAMFileHeader getFileHeader() { - return headerOverride != null ? headerOverride : engine.getSAMFileHeader(); - } - - /** - * Retrieves the desired compression level for - * @return The current compression level. Could be null if the user doesn't care. - */ - public Integer getCompressionLevel() { - return compressionLevel; - } - - /** - * Sets the desired compression level. - * @param compressionLevel The suggested compression level. - */ - public void setCompressionLevel( Integer compressionLevel ) { - if(writeStarted) - throw new ReviewedGATKException("Attempted to change the compression level of a file with alignments already in it."); - this.compressionLevel = compressionLevel; - } - - /** - * Gets whether to index this output stream on-the-fly. - * @return True means create an index. False means skip index creation. - */ - public Boolean getIndexOnTheFly() { - return indexOnTheFly; - } - - /** - * Controls whether to index this output stream on-the-fly. - * @param indexOnTheFly True means create an index. False means skip index creation. - */ - public void setIndexOnTheFly( boolean indexOnTheFly ) { - if(writeStarted) - throw new UserException("Attempted to index a BAM on the fly of a file with alignments already in it."); - this.indexOnTheFly = indexOnTheFly; - } - - /** - * Gets whether to generate an md5 on-the-fly for this BAM. - * @return True generates the md5. False means skip writing the file. - */ - public Boolean getGenerateMD5() { - return generateMD5; - } - - /** - * Gets whether to generate an md5 on-the-fly for this BAM. 
- * @param generateMD5 True generates the md5. False means skip writing the file. - */ - public void setGenerateMD5(boolean generateMD5) { - if(writeStarted) - throw new UserException("Attempted to turn on md5 generation for BAM file with alignments already in it."); - this.generateMD5 = generateMD5; - } - - /** - * Whether the BAM file to create is actually presorted. - * @return True if the BAM file is presorted. False otherwise. - */ - public boolean isPresorted() { - return this.presorted; - } - - /** - * Set Whether the BAM file to create is actually presorted. - * @param presorted True if the BAM file is presorted. False otherwise. - */ - public void setPresorted(boolean presorted) { - if(writeStarted) - throw new ReviewedGATKException("Attempted to change the presorted state of a file with alignments already in it."); - this.presorted = presorted; - } - - /** - * Get the maximum number of reads to hold in RAM when sorting a BAM on-the-fly. - * @return Max records in RAM, or null if unset. - */ - public Integer getMaxRecordsInRam() { - return this.maxRecordsInRam; - } - - /** - * Sets the maximum number of reads to hold in RAM when sorting a BAM on-the-fly. - * @param maxRecordsInRam Max number of records in RAM. - */ - public void setMaxRecordsInRam(int maxRecordsInRam) { - if(writeStarted) - throw new ReviewedGATKException("Attempted to change the max records in RAM of a file with alignments already in it."); - this.maxRecordsInRam = maxRecordsInRam; - } - - /** - * Registers the given streamConnector with this stub. - * @param outputTracker The connector used to provide an appropriate stream. 
- */ - public void register( OutputTracker outputTracker ) { - this.outputTracker = outputTracker; - } - - @Override - public void processArguments( final GATKArgumentCollection argumentCollection ) { - if (argumentCollection.bamCompression != null) - setCompressionLevel(argumentCollection.bamCompression); - setGenerateMD5(argumentCollection.enableBAMmd5); - setIndexOnTheFly(!argumentCollection.disableBAMIndexing); - setSimplifyBAM(argumentCollection.simplifyBAM); - - } - - /** - * Use the given header as the target for this writer. - * @param header The header to write. - */ - public void writeHeader(SAMFileHeader header) { - if(writeStarted) - throw new ReviewedGATKException("Attempted to change the header of a file with alignments already in it."); - this.headerOverride = header; - } - - private void initializeReadTransformers() { - this.onOutputReadTransformers = new ArrayList<>(engine.getReadTransformers().size()); - for ( final ReadTransformer transformer : engine.getReadTransformers() ) { - if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_OUTPUT ) - onOutputReadTransformers.add(transformer); - } - } - - /** - * @{inheritDoc} - */ - public void addAlignment( final SAMRecord readIn ) { - if ( onOutputReadTransformers == null ) - initializeReadTransformers(); - - GATKSAMRecord workingRead = (GATKSAMRecord)readIn; - - // run on output read transformers - for ( final ReadTransformer transform : onOutputReadTransformers ) - workingRead = transform.apply(workingRead); - - writeStarted = true; - outputTracker.getStorage(this).addAlignment(workingRead); - } - - /** - * @{inheritDoc} - */ - public void close() { - outputTracker.getStorage(this).close(); - } - - /** - * @throws java.lang.UnsupportedOperationException No progress logging in this implementation. 
- */ - @Override - public void setProgressLogger(final ProgressLoggerInterface logger) { - throw new UnsupportedOperationException("Progress logging not supported"); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java deleted file mode 100644 index 686133922..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VCFWriterArgumentTypeDescriptor.java +++ /dev/null @@ -1,148 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.io.stubs; - -import htsjdk.tribble.AbstractFeatureReader; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.io.File; -import java.io.OutputStream; -import java.lang.reflect.Type; -import java.util.Collection; - -/** - * Injects new command-line arguments into the system providing support for the genotype writer. - * - * @author mhanna - * @version 0.1 - */ -public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor { - - /** - * The engine into which output stubs should be fed. - */ - private final GenomeAnalysisEngine engine; - - /** - * The default location to which data should be written if the user specifies no such location. - */ - private final OutputStream defaultOutputStream; - - /** - * The sources into which arguments were injected. - */ - private final Collection argumentSources; - - /** - * Create a new GenotypeWriter argument, notifying the given engine when that argument has been created. - * @param engine the engine to be notified. - * @param defaultOutputStream the default output stream to be written to if nothing else is specified. - * @param argumentSources sources from which command-line arguments should be derived. - */ - public VCFWriterArgumentTypeDescriptor(GenomeAnalysisEngine engine, OutputStream defaultOutputStream, Collection argumentSources) { - this.engine = engine; - this.defaultOutputStream = defaultOutputStream; - this.argumentSources = argumentSources; - } - - /** - * Reports whether this ArgumentTypeDescriptor supports the given type. - * @param type The type to check. - * @return True if the argument is a GenotypeWriter. 
- */ - @Override - public boolean supports( Class type ) { - return VariantContextWriter.class.equals(type); - } - - /** - * This command-line argument descriptor does want to override the provided default value. - * @return true always. - */ - @Override - public boolean createsTypeDefault(ArgumentSource source) { - return !source.isRequired() && source.defaultsToStdout(); - } - - @Override - public String typeDefaultDocString(ArgumentSource source) { - return "stdout"; - } - - @Override - public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { - if(source.isRequired() || !source.defaultsToStdout()) - throw new ReviewedGATKException("BUG: tried to create type default for argument type descriptor that can't support a type default."); - VariantContextWriterStub stub = new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); - engine.addOutput(stub); - return stub; - } - - /** - * Convert the given argument matches into a single object suitable for feeding into the ArgumentSource. - * @param source Source for this argument. - * @param type not used - * @param matches Matches that match with this argument. - * @return Transform from the matches into the associated argument. - */ - @Override - public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) { - ArgumentDefinition defaultArgumentDefinition = createDefaultArgumentDefinition(source); - // Get the filename for the genotype file, if it exists. If not, we'll need to send output to out. - ArgumentMatchValue writerFileName = getArgumentValue(defaultArgumentDefinition,matches); - File writerFile = writerFileName != null ? 
writerFileName.asFile() : null; - - // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object; - // therefore, the user must have failed to specify a type default - if(writerFile == null && source.isRequired()) - throw new MissingArgumentValueException(defaultArgumentDefinition); - - // Create a stub for the given object. - final VariantContextWriterStub stub = (writerFile != null) - ? new VariantContextWriterStub(engine, writerFile, argumentSources) - : new VariantContextWriterStub(engine, defaultOutputStream, argumentSources); - - stub.setCompressed(isCompressed(writerFileName == null ? null: writerFileName.asString())); - - // WARNING: Side effects required by engine! - parsingEngine.addTags(stub,getArgumentTags(matches)); - engine.addOutput(stub); - - return stub; - } - - /** - * Returns true if the file will be compressed. - * @param writerFileName Name of the file - * @return true if the file will be compressed. - */ - public static boolean isCompressed(String writerFileName) { - return writerFileName != null && AbstractFeatureReader.hasBlockCompressedExtension(writerFileName); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java deleted file mode 100644 index f40ede581..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/io/stubs/VariantContextWriterStub.java +++ /dev/null @@ -1,301 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the 
Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.io.stubs; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.tribble.index.IndexCreator; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.io.OutputTracker; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.writer.Options; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.variantcontext.writer.VariantContextWriterFactory; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; - -import java.io.File; -import java.io.OutputStream; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collection; -import java.util.EnumSet; -import java.util.List; - -/** - * A stub for routing and management of genotype reading and writing. - * - * @author ebanks - * @version 0.1 - */ -public class VariantContextWriterStub implements Stub, VariantContextWriter { - public final static boolean UPDATE_CONTIG_HEADERS = true; - - /** - * The engine, central to the GATK's processing. 
- */ - private final GenomeAnalysisEngine engine; - - /** - * The file that this stub should write to. Should be mutually - * exclusive with genotypeStream. - */ - private final File genotypeFile; - - /** - * The output stream to which stub data should be written. Will be - * mutually exclusive with genotypeFile. - */ - private final PrintStream genotypeStream; - - /** - * A hack: push the argument sources into the VCF header so that the VCF header - * can rebuild the command-line arguments. - */ - private final Collection argumentSources; - - /** - * Which IndexCreator to use - */ - private final IndexCreator indexCreator; - - /** - * The cached VCF header (initialized to null) - */ - private VCFHeader vcfHeader = null; - - /** - * Should we emit a compressed output stream? - */ - private boolean isCompressed = false; - - /** - * Should the header be written out? A hidden argument. - */ - private boolean skipWritingCommandLineHeader = false; - - /** - * Should we not write genotypes even when provided? - */ - private boolean doNotWriteGenotypes = false; - - /** - * Should we force BCF writing regardless of the file extension? - */ - private boolean forceBCF = false; - - /** - * Should we write all of the fields in the FORMAT field, even if missing fields could be trimmed? - */ - private boolean writeFullFormatField = false; - - /** - * Connects this stub with an external stream capable of serving the - * requests of the consumer of this stub. - */ - protected OutputTracker outputTracker = null; - - /** - * Create a new stub given the requested file. - * - * @param engine engine. - * @param genotypeFile file to (ultimately) create. - * @param argumentSources sources. 
- */ - public VariantContextWriterStub(GenomeAnalysisEngine engine, File genotypeFile, Collection argumentSources) { - this.engine = engine; - this.genotypeFile = genotypeFile; - this.genotypeStream = null; - this.indexCreator = GATKVCFUtils.getIndexCreator(engine.getArguments().variant_index_type, engine.getArguments().variant_index_parameter, genotypeFile); - this.argumentSources = argumentSources; - } - - /** - * Create a new stub given the requested file. - * - * @param engine engine. - * @param genotypeStream stream to (ultimately) write. - * @param argumentSources sources. - */ - public VariantContextWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, Collection argumentSources) { - this.engine = engine; - this.genotypeFile = null; - this.genotypeStream = new PrintStream(genotypeStream); - this.indexCreator = null; - this.argumentSources = argumentSources; - } - - /** - * Retrieves the file to (ultimately) be created. - * @return The file. Can be null if genotypeStream is not. - */ - public File getOutputFile() { - return genotypeFile; - } - - /** - * Retrieves the output stream to which to (ultimately) write. - * @return The file. Can be null if genotypeFile is not. 
- */ - public OutputStream getOutputStream() { - return genotypeStream; - } - - public boolean isCompressed() { - return isCompressed; - } - - public void setCompressed(final boolean compressed) { - isCompressed = compressed; - } - - public void setSkipWritingCommandLineHeader(final boolean skipWritingCommandLineHeader) { - this.skipWritingCommandLineHeader = skipWritingCommandLineHeader; - } - - public void setDoNotWriteGenotypes(final boolean doNotWriteGenotypes) { - this.doNotWriteGenotypes = doNotWriteGenotypes; - } - - public void setForceBCF(final boolean forceBCF) { - this.forceBCF = forceBCF; - } - - public void setWriteFullFormatField(final boolean writeFullFormatField) { - this.writeFullFormatField = writeFullFormatField; - } - - public IndexCreator getIndexCreator() { - return indexCreator; - } - - /** - * Gets the master sequence dictionary from the engine associated with this stub - * @link GenomeAnalysisEngine.getMasterSequenceDictionary - * @return the master sequence dictionary from the engine associated with this stub - */ - public SAMSequenceDictionary getMasterSequenceDictionary() { - return engine.getMasterSequenceDictionary(); - } - - public EnumSet getWriterOptions() { - return getWriterOptions(false); - } - - public EnumSet getWriterOptions(boolean indexOnTheFly) { - final List options = new ArrayList<>(); - - if ( doNotWriteGenotypes ) options.add(Options.DO_NOT_WRITE_GENOTYPES); - if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER); - if ( indexOnTheFly) options.add(Options.INDEX_ON_THE_FLY); - if ( writeFullFormatField ) options.add(Options.WRITE_FULL_FORMAT_FIELD); - - if ( forceBCF || (getOutputFile() != null && VariantContextWriterFactory.isBCFOutput(getOutputFile())) ) - options.add(Options.FORCE_BCF); - - return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options); - } - - /** - * Retrieves the header to use when creating the new file. 
- * @return header to use when creating the new file. - */ - public VCFHeader getVCFHeader() { - return vcfHeader; - } - - /** - * Registers the given streamConnector with this stub. - * @param outputTracker The connector used to provide an appropriate stream. - */ - public void register( OutputTracker outputTracker ) { - this.outputTracker = outputTracker; - } - - @Override - public void processArguments( final GATKArgumentCollection argumentCollection ) { - setDoNotWriteGenotypes(argumentCollection.sitesOnlyVCF); - setSkipWritingCommandLineHeader(argumentCollection.disableCommandLineInVCF); - setForceBCF(argumentCollection.forceBCFOutput); - setWriteFullFormatField(argumentCollection.neverTrimVCFFormatField); - } - - public void writeHeader(VCFHeader header) { - vcfHeader = header; - - if ( header.isWriteEngineHeaders() ) { - // skip writing the command line header if requested - if ( ! skipWritingCommandLineHeader && header.isWriteCommandLine() ) { - // Always add the header line, as the current format allows multiple entries - final VCFHeaderLine commandLineArgHeaderLine = GATKVCFUtils.getCommandLineArgumentHeaderLine(engine, argumentSources); - vcfHeader.addMetaDataLine(commandLineArgHeaderLine); - } - - if ( UPDATE_CONTIG_HEADERS ) - vcfHeader = GATKVCFUtils.withUpdatedContigs(vcfHeader, engine); - } - - outputTracker.getStorage(this).writeHeader(vcfHeader); - } - - /** - * @{inheritDoc} - */ - public void add(VariantContext vc) { - outputTracker.getStorage(this).add(vc); - } - - /** - * @{inheritDoc} - */ - public void close() { - outputTracker.getStorage(this).close(); - } - - /** - * Gets a string representation of this object. - * @return a string representation of this object. 
- */ - @Override - public String toString() { - return getClass().getName(); - } - - /** - * Should we also write a BCF file alongside our VCF file for testing - * - * TODO -- remove me when argument generateShadowBCF is removed - * - * @return - */ - public boolean alsoWriteBCFForTest() { - return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded - ! isCompressed() && // for non-compressed outputs - getOutputFile() != null && // that are going to disk - engine.getArguments().generateShadowBCF; // and we actually want to do it - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java deleted file mode 100644 index cb696e58e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIterator.java +++ /dev/null @@ -1,159 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.MergingSamRecordIterator; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; - -import java.util.Iterator; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 14, 2009 - *

- * Class BoundedReadIterator - *

- * This class implements a read iterator that is bounded by the number of reads - * it will produce over the iteration. - */ -public class BoundedReadIterator implements GATKSAMIterator { - - // the genome loc we're bounding - final private long readCount; - private long currentCount = 0; - - // the iterator we want to decorate - private final GATKSAMIterator iterator; - - // our unmapped read flag - private boolean doNotUseThatUnmappedReadPile = false; - - /** - * The next read that we've buffered. Null indicates that there's - * nothing in the buffer (not that there isn't a next read). - */ - private SAMRecord record = null; - - /** - * constructor - * @param iter - * @param readCount - */ - public BoundedReadIterator(GATKSAMIterator iter, long readCount) { - this.iterator = iter; - this.readCount = readCount; - } - - public void useUnmappedReads(boolean useThem) { - this.doNotUseThatUnmappedReadPile = useThem; - } - - public SAMFileHeader getHeader() { - // todo: this is bad, we need an iterface out there for samrecords that supports getting the header, - // regardless of the merging - if (iterator instanceof MergingSamRecordIterator) - return ((MergingSamRecordIterator)iterator).getMergedHeader(); - else - return null; - } - - /** - * Do we have a next? 
If the iterator has a read and we're not over the read - * count, then yes - * @return - */ - public boolean hasNext() { - if( record != null ) - return true; - - if (iterator.hasNext() && currentCount < readCount) { - record = iterator.next(); - ++currentCount; - if (record.getAlignmentStart() == 0 && doNotUseThatUnmappedReadPile) { - return false; - } - return true; - } else { - return false; - } - } - - /** - * get the next SAMRecord - * @return SAMRecord representing the next read - */ - public SAMRecord next() { - SAMRecord cached = record; - record = null; - return cached; - } - - /** - * this is unsupported on SAMRecord iterators - */ - public void remove() { - throw new UnsupportedOperationException("You cannot use an iterator to remove a SAMRecord"); - } - - /** - * close the iterator - */ - public void close() { - iterator.close(); - } - - public Iterator iterator() { - return this; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIterator.java deleted file mode 100644 index 8ca5cfdbe..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIterator.java +++ /dev/null @@ -1,56 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.CloseableIterator; -/** - * - * User: aaron - * Date: May 6, 2009 - * Time: 5:30:41 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - -/** - * @author aaron - * @version 1.0 - * @date May 6, 2009 - *

- * Interface GATKSAMIterator - *

- * This is the standard interface for all iterators in the GATK package that iterate over SAMRecords - */ -public interface GATKSAMIterator extends CloseableIterator, Iterable { -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapter.java deleted file mode 100644 index 0dc3e62a7..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapter.java +++ /dev/null @@ -1,136 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.CloseableIterator; - -import java.util.Iterator; - -/** - * - * User: aaron - * Date: May 13, 2009 - * Time: 6:33:15 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date May 13, 2009 - *

- * Class GATKSAMIteratorAdapter - *

- * This class adapts other SAMRecord iterators to the GATKSAMIterator - */ -public class GATKSAMIteratorAdapter { - - public static GATKSAMIterator adapt(Iterator iter) { - return new PrivateStringSAMIterator(iter); - } - - public static GATKSAMIterator adapt(CloseableIterator iter) { - return new PrivateStringSAMCloseableIterator(iter); - } - -} - - -/** - * this class wraps iterators in a GATKSAMIterator, which means just adding the - * methods that implement the iterable<> interface and the close() method from CloseableIterator - */ -class PrivateStringSAMIterator implements GATKSAMIterator { - private Iterator iter = null; - - PrivateStringSAMIterator(Iterator iter) { - this.iter = iter; - } - - public void close() { - // do nothing, we can't close the iterator anyway. - } - - public boolean hasNext() { - return iter.hasNext(); - } - - public SAMRecord next() { - return iter.next(); - } - - public void remove() { - throw new UnsupportedOperationException("GATKSAMIterator's don't allow remove()ing"); - } - - public Iterator iterator() { - return iter; - } -} - - -/** - * this class wraps closeable iterators in a GATKSAMIterator, which means adding the - * methods that implement the iterable<> interface. 
- */ -class PrivateStringSAMCloseableIterator implements GATKSAMIterator { - private CloseableIterator iter = null; - - PrivateStringSAMCloseableIterator(CloseableIterator iter) { - this.iter = iter; - } - - public void close() { - iter.close(); - } - - public boolean hasNext() { - return iter.hasNext(); - } - - public SAMRecord next() { - return iter.next(); - } - - public void remove() { - throw new UnsupportedOperationException("GATKSAMIterator's don't allow remove()ing"); - } - - public Iterator iterator() { - return iter; - } -} - diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMRecordIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMRecordIterator.java deleted file mode 100644 index 6d02acd4a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/GATKSAMRecordIterator.java +++ /dev/null @@ -1,57 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.Iterator; - -/** - * Temporarily hack to convert SAMRecords to GATKSAMRecords - * - * User: depristo - * Date: 1/11/13 - * Time: 1:19 PM - */ -public class GATKSAMRecordIterator implements CloseableIterator, Iterable { - final CloseableIterator it; - - public GATKSAMRecordIterator(final CloseableIterator it) { - this.it = it; - } - - public GATKSAMRecordIterator(final GATKSAMIterator it) { - this.it = it; - } - - @Override public boolean hasNext() { return it.hasNext(); } - @Override public GATKSAMRecord next() { return (GATKSAMRecord)it.next(); } - @Override public void remove() { it.remove(); } - @Override public void close() { it.close(); } - @Override public Iterator iterator() { return this; } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java deleted file mode 100644 index fa130f930..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/NullSAMIterator.java +++ /dev/null @@ -1,57 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the 
Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; - -import java.util.Iterator; -import java.util.NoSuchElementException; -/** - * User: hanna - * Date: May 19, 2009 - * Time: 6:47:16 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * A placeholder for an iterator with no data. 
- */ -public class NullSAMIterator implements GATKSAMIterator { - public NullSAMIterator() {} - - public Iterator iterator() { return this; } - public void close() { /* NO-OP */ } - - public boolean hasNext() { return false; } - public SAMRecord next() { throw new NoSuchElementException("No next element is available."); } - public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKSAMIterator"); } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java deleted file mode 100644 index 2eba344bb..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PositionTrackingIterator.java +++ /dev/null @@ -1,105 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.CloseableIterator; - -/** - * Iterates through a list of elements, tracking the number of elements it has seen. - * @author hanna - * @version 0.1 - */ -public class PositionTrackingIterator implements GATKSAMIterator { - /** - * The iterator being tracked. - */ - private CloseableIterator iterator; - - /** - * Current position within the tracked iterator. - */ - private long position; - - /** - * Retrieves the current position of the iterator. The 'current position' of the iterator is defined as - * the coordinate of the read that will be returned if next() is called. - * @return The current position of the iterator. - */ - public long getPosition() { - return position; - } - - /** - * Create a new iterator wrapping the given position, assuming that the reader is position reads - * into the sequence. - * @param iterator Iterator to wraps. - * @param position Non-negative position where the iterator currently sits. - */ - public PositionTrackingIterator(CloseableIterator iterator, long position ) { - this.iterator = iterator; - this.position = position; - } - - /** - * {@inheritDoc} - */ - public boolean hasNext() { - return iterator.hasNext(); - } - - /** - * Try to get the next read in the list. If a next read is available, increment the position. - * @return next read in the list, if available. 
- */ - public SAMRecord next() { - try { - return iterator.next(); - } - finally { - position++; - } - } - - /** - * {@inheritDoc} - */ - public GATKSAMIterator iterator() { - return this; - } - - /** - * {@inheritDoc} - */ - public void close() { - iterator.close(); - } - - /** - * {@inheritDoc} - */ - public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKSAMIterator"); } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PushbackIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PushbackIterator.java deleted file mode 100644 index 0bb545b6e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/PushbackIterator.java +++ /dev/null @@ -1,82 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.iterators; - -import java.util.Iterator; - -public class PushbackIterator implements Iterator, Iterable { - Iterator underlyingIterator; - T pushedElement = null; - - public PushbackIterator(final Iterator underlyingIterator) { - this.underlyingIterator = underlyingIterator; - } - - public boolean hasNext() { - return pushedElement != null || underlyingIterator.hasNext(); - } - - public Iterator iterator() { - return this; - } - - /** - * Retrieves, but does not remove, the head of this iterator. - * @return T the next element in the iterator - */ - public T element() { - T x = next(); - pushback(x); - return x; - } - - /** - * @return the next element in the iteration. - */ - public T next() { - if (pushedElement != null) { - final T ret = pushedElement; - pushedElement = null; - return ret; - } else { - return underlyingIterator.next(); - } - } - - public void pushback(T elt) { - assert(pushedElement == null); - - pushedElement = elt; - } - - public void remove() { - throw new UnsupportedOperationException(); - } - - public Iterator getUnderlyingIterator() { - return underlyingIterator; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java deleted file mode 100644 index 492227932..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIterator.java +++ /dev/null @@ -1,140 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of 
the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.sam.AlignmentUtils; - -/** - * An iterator which does post-processing of a read, including potentially wrapping - * the read in something with a compatible interface or replacing the read entirely. - * - * @author mhanna - * @version 0.1 - */ -public class ReadFormattingIterator implements GATKSAMIterator { - /** - * Logger. - */ - final protected static Logger logger = Logger.getLogger(ReadFormattingIterator.class); - - /** - * Iterator to which to pass - */ - private GATKSAMIterator wrappedIterator; - - /** - * True if original base qualities should be used. - */ - private final boolean useOriginalBaseQualities; - - /** - * Positive if there is a default Base Quality value to fill in the reads with. - */ - private final byte defaultBaseQualities; - - - /** - * Decorate the given iterator inside a ReadWrappingIterator. - * @param wrappedIterator iterator - * @param useOriginalBaseQualities true if original base qualities should be used - * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. 
- */ - public ReadFormattingIterator(GATKSAMIterator wrappedIterator, boolean useOriginalBaseQualities, byte defaultBaseQualities) { - this.wrappedIterator = wrappedIterator; - this.useOriginalBaseQualities = useOriginalBaseQualities; - this.defaultBaseQualities = defaultBaseQualities; - - } - - /** - * Convenience function for use in foreach loops. Dangerous because it does not actually - * reset the iterator. - * @return An iterator through the current data stream. - */ - public GATKSAMIterator iterator() { - // NOTE: this iterator doesn't perform any kind of reset operation; it just returns itself. - // can we do something better? Do we really have to provide support for the Iterable interface? - return this; - } - - /** - * Close this iterator. - */ - public void close() { - wrappedIterator.close(); - } - - /** - * Does the iterator contain more values? - * @return True if there are more left to return, false otherwise. - */ - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - /** - * Get the next value in the sequence. - * @return Next value in the sequence. By convention, a NoSuchElementException should be thrown if - * no next exists. - */ - public SAMRecord next() { - SAMRecord rec = wrappedIterator.next(); - - // Always consolidate the cigar string into canonical form, collapsing zero-length / repeated cigar elements. - // Downstream code (like LocusIteratorByState) cannot necessarily handle non-consolidated cigar strings. - rec.setCigar(AlignmentUtils.consolidateCigar(rec.getCigar())); - - // if we are using default quals, check if we need them, and add if necessary. - // 1. we need if reads are lacking or have incomplete quality scores - // 2. 
we add if defaultBaseQualities has a positive value - if (defaultBaseQualities >= 0) { - byte reads [] = rec.getReadBases(); - byte quals [] = rec.getBaseQualities(); - if (quals == null || quals.length < reads.length) { - byte new_quals [] = new byte [reads.length]; - for (int i=0; i cur.getReferenceIndex()) || - (last.getReferenceIndex().equals(cur.getReferenceIndex()) && - last.getAlignmentStart() > cur.getAlignmentStart()); - } - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } - - public void close() { - it.close(); - } - - public Iterator iterator() { - return this; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java deleted file mode 100644 index f8126b9a9..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReport.java +++ /dev/null @@ -1,786 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.phonehome; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.crypt.CryptUtils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.io.IOUtils; -import org.broadinstitute.gatk.utils.io.Resource; -import org.broadinstitute.gatk.utils.threading.ThreadEfficiencyMonitor; -import org.jets3t.service.S3Service; -import org.jets3t.service.S3ServiceException; -import org.jets3t.service.impl.rest.httpclient.RestS3Service; -import org.jets3t.service.model.S3Object; -import org.jets3t.service.security.AWSCredentials; -import org.simpleframework.xml.Element; -import org.simpleframework.xml.Serializer; -import org.simpleframework.xml.core.Persister; - -import java.io.*; -import java.security.NoSuchAlgorithmException; -import java.security.PublicKey; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.util.Arrays; -import java.util.Date; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - - -/** - * A detailed description of a GATK run, and error if applicable. 
Simply create a GATKRunReport - * with the constructor, providing the walker that was run and the fully instantiated GenomeAnalysisEngine - * after the run finishes and the GATKRunReport will collect all of the report information - * into this object. Call postReport to write out the report, as an XML document, to either STDOUT, - * a file (in which case the output is gzipped), or with no arguments the report will be posted to the - * GATK run report database. - * - * @author depristo - * @since 2010 - */ -public class GATKRunReport { - protected static final String REPORT_BUCKET_NAME = "broad.gsa.gatk.run.reports"; - protected static final String TEST_REPORT_BUCKET_NAME = "broad.gsa.gatk.run.reports.test"; - protected final static String AWS_ACCESS_KEY_MD5 = "34d4a26eb2062b3f06e833b28f9a38c6"; - protected final static String AWS_SECRET_KEY_MD5 = "83f2332eec99ef1d7425d5dc5d4b514a"; - - private static final DateFormat DATE_FORMAT = new SimpleDateFormat("yyyy/MM/dd HH.mm.ss"); - - /** - * our log - */ - protected static final Logger logger = Logger.getLogger(GATKRunReport.class); - - /** - * Default value for the number of milliseconds before an S3 put operation is timed-out. - * Can be overridden via a constructor argument. - */ - private static final long S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS = 30 * 1000; - - /** - * Number of milliseconds before an S3 put operation is timed-out. 
- */ - private long s3PutTimeOutInMilliseconds = S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS; - - // ----------------------------------------------------------------- - // elements captured for the report - // ----------------------------------------------------------------- - - @Element(required = false, name = "id") - private String id; - - @Element(required = false, name = "exception") - private GATKRunReportException mException; - - @Element(required = true, name = "start-time") - private String startTime = "ND"; - - @Element(required = true, name = "end-time") - private String endTime; - - @Element(required = true, name = "run-time") - private long runTime = 0; - - @Element(required = true, name = "walker-name") - private String walkerName; - - @Element(required = true, name = "svn-version") - private String svnVersion; - - @Element(required = true, name = "total-memory") - private long totalMemory; - - @Element(required = true, name = "max-memory") - private long maxMemory; - - @Element(required = true, name = "user-name") - private String userName; - - @Element(required = true, name = "host-name") - private String hostName; - - @Element(required = true, name = "java") - private String javaVersion; - - @Element(required = true, name = "machine") - private String machine; - - @Element(required = true, name = "iterations") - private long nIterations; - - @Element(required = true, name = "tag") - private String tag; - - @Element(required = true, name = "num-threads") - private int numThreads; - @Element(required = true, name = "percent-time-running") - private String percentTimeRunning; - @Element(required = true, name = "percent-time-waiting") - private String percentTimeWaiting; - @Element(required = true, name = "percent-time-blocking") - private String percentTimeBlocking; - @Element(required = true, name = "percent-time-waiting-for-io") - private String percentTimeWaitingForIO; - - /** The error message, if one occurred, or null if none did */ - public String 
errorMessage = null; - /** The error that occurred, if one did, or null if none did */ - public Throwable errorThrown = null; - - /** - * How should the GATK report its usage? - */ - public enum PhoneHomeOption { - /** Disable phone home */ - NO_ET, - /** Forces the report to go to S3 */ - AWS, - /** Force output to STDOUT. For debugging only */ - STDOUT - } - - /** - * To allow us to deserial reports from XML - */ - private GATKRunReport() { } - - /** - * Read a GATKRunReport from the serialized XML representation in String reportAsXML - * @param stream an input stream containing a serialized XML report - * @return a reconstituted GATKRunReport from reportAsXML - * @throws Exception if parsing fails for any reason - */ - @Ensures("result != null") - protected static GATKRunReport deserializeReport(final InputStream stream) throws Exception { - final Serializer serializer = new Persister(); - return serializer.read(GATKRunReport.class, stream); - } - - /** - * Create a new GATKRunReport from a report on S3 - * - * Assumes that s3Object has already been written to S3, and this function merely - * fetches it from S3 and deserializes it. The access keys must have permission to - * GetObject from S3. 
- * - * @param downloaderAccessKey AWS access key with permission to GetObject from bucketName - * @param downloaderSecretKey AWS secret key with permission to GetObject from bucketName - * @param bucketName the name of the bucket holding the report - * @param s3Object the s3Object we wrote to S3 in bucketName that we want to get back and decode - * @return a deserialized report derived from s3://bucketName/s3Object.getName() - * @throws Exception - */ - @Ensures("result != null") - protected static GATKRunReport deserializeReport(final String downloaderAccessKey, - final String downloaderSecretKey, - final String bucketName, - final S3Object s3Object) throws Exception { - final S3Service s3Service = initializeAWSService(downloaderAccessKey, downloaderSecretKey); - - // Retrieve the whole data object we created previously - final S3Object objectComplete = s3Service.getObject(bucketName, s3Object.getName()); - - // Read the data from the object's DataInputStream using a loop, and print it out. - return deserializeReport(new GZIPInputStream(objectComplete.getDataInputStream())); - } - - /** - * Create a new RunReport and population all of the fields with values from the walker and engine. - * Allows the S3 put timeout to be explicitly set. 
- * - * @param walker the GATK walker that we ran - * @param e the exception caused by running this walker, or null if we completed successfully - * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc - * @param type the GATK phone home setting - * @param s3PutTimeOutInMilliseconds number of milliseconds to wait before timing out an S3 put operation - */ - public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type, - final long s3PutTimeOutInMilliseconds) { - this(walker, e, engine, type); - this.s3PutTimeOutInMilliseconds = s3PutTimeOutInMilliseconds; - } - - /** - * Create a new RunReport and population all of the fields with values from the walker and engine. - * Leaves the S3 put timeout set to the default value of S3_DEFAULT_PUT_TIME_OUT_IN_MILLISECONDS. - * - * @param walker the GATK walker that we ran - * @param e the exception caused by running this walker, or null if we completed successfully - * @param engine the GAE we used to run the walker, so we can fetch runtime, args, etc - * @param type the GATK phone home setting - */ - public GATKRunReport(final Walker walker, final Exception e, final GenomeAnalysisEngine engine, final PhoneHomeOption type) { - if ( type == PhoneHomeOption.NO_ET ) - throw new ReviewedGATKException("Trying to create a run report when type is NO_ET!"); - - logger.debug("Aggregating data for run report"); - - // what did we run? 
- id = org.apache.commons.lang.RandomStringUtils.randomAlphanumeric(32); - walkerName = engine.getWalkerName(walker.getClass()); - svnVersion = CommandLineGATK.getVersionNumber(); - - // runtime performance metrics - Date end = new java.util.Date(); - endTime = DATE_FORMAT.format(end); - if ( engine.getStartTime() != null ) { // made it this far during initialization - startTime = DATE_FORMAT.format(engine.getStartTime()); - runTime = (end.getTime() - engine.getStartTime().getTime()) / 1000L; // difference in seconds - } - - // deal with memory usage - Runtime.getRuntime().gc(); // call GC so totalMemory is ~ used memory - maxMemory = Runtime.getRuntime().maxMemory(); - totalMemory = Runtime.getRuntime().totalMemory(); - - // we can only do some operations if an error hasn't occurred - if ( engine.getCumulativeMetrics() != null ) { - // it's possible we aborted so early that these data structures arent initialized - nIterations = engine.getCumulativeMetrics().getNumIterations(); - } - - tag = engine.getArguments().tag; - - // user and hostname -- information about the runner of the GATK - userName = System.getProperty("user.name"); - hostName = Utils.resolveHostname(); - - // basic java information - javaVersion = Utils.join("-", Arrays.asList(System.getProperty("java.vendor"), System.getProperty("java.version"))); - machine = Utils.join("-", Arrays.asList(System.getProperty("os.name"), System.getProperty("os.arch"))); - - // if there was an exception, capture it - this.mException = e == null ? 
null : new GATKRunReportException(e); - - numThreads = engine.getTotalNumberOfThreads(); - percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU); - percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING); - percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING); - percentTimeWaitingForIO = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING_FOR_IO); - } - - /** - * Get the random alpha-numeric ID of this GATKRunReport - * @return a non-null string ID - */ - @Ensures("result != null") - public String getID() { - return id; - } - - /** - * Return a string representing the percent of time the GATK spent in state, if possible. Otherwise return NA - * - * @param engine the GATK engine whose threading efficiency info we will use - * @param state the state whose occupancy we wish to know - * @return a string representation of the percent occupancy of state, or NA is not possible - */ - @Requires({"engine != null", "state != null"}) - @Ensures("result != null") - private String getThreadEfficiencyPercent(final GenomeAnalysisEngine engine, final ThreadEfficiencyMonitor.State state) { - final ThreadEfficiencyMonitor tem = engine.getThreadEfficiencyMonitor(); - return tem == null ? 
"NA" : String.format("%.2f", tem.getStatePercent(state)); - } - - /** - * Get a filename (no path) appropriate for this report - * - * @return a non-null string filename - */ - @Ensures("result != null") - protected String getReportFileName() { - return getID() + ".report.xml.gz"; - } - - // --------------------------------------------------------------------------- - // - // Main public interface method for posting reports - // - // --------------------------------------------------------------------------- - - /** - * Post this GATK report to the destination implied by the PhoneHomeOption type - * - * Guaranteed to never throw an exception (exception noted below) and to return - * with a reasonable (~10 seconds) time regardless of successful writing of the report. - * - * @throws IllegalArgumentException if type == null - * @param type the type of phoning home we want to do - * @return true if a report was successfully written, false otherwise - */ - public boolean postReport(final PhoneHomeOption type) { - if ( type == null ) throw new IllegalArgumentException("type cannot be null"); - - logger.debug("Posting report of type " + type); - switch (type) { - case NO_ET: // don't do anything - return false; - case AWS: - wentToAWS = true; - return postReportToAWSS3() != null; - case STDOUT: - return postReportToStream(System.out); - default: - exceptDuringRunReport("BUG: unexpected PhoneHomeOption "); - return false; - } - } - - // --------------------------------------------------------------------------- - // - // Code for sending reports to local files - // - // --------------------------------------------------------------------------- - - /** - * Write an XML representation of this report to the stream, throwing a GATKException if the marshalling - * fails for any reason. 
- * - * @param stream an output stream to write the report to - */ - @Requires("stream != null") - protected boolean postReportToStream(final OutputStream stream) { - final Serializer serializer = new Persister(); - try { - serializer.write(this, stream); - return true; - } catch (Exception e) { - return false; - } - } - - // --------------------------------------------------------------------------- - // - // Code for sending reports to s3 - // - // --------------------------------------------------------------------------- - - /** - * Get the name of the S3 bucket where we should upload this report - * - * @return the string name of the s3 bucket - */ - @Ensures("result != null") - protected String getS3ReportBucket() { - return s3ReportBucket; - } - - /** - * Decrypts encrypted AWS key from encryptedKeySource - * @param encryptedKeySource a file containing an encrypted AWS key - * @return a decrypted AWS key as a String - */ - @Ensures("result != null") - public static String decryptAWSKey(final File encryptedKeySource) throws FileNotFoundException { - if ( encryptedKeySource == null ) throw new IllegalArgumentException("encryptedKeySource cannot be null"); - return decryptAWSKey(new FileInputStream(encryptedKeySource)); - } - - /** - * @see #decryptAWSKey(java.io.File) but with input from an inputstream - */ - @Requires("encryptedKeySource != null") - @Ensures("result != null") - private static String decryptAWSKey(final InputStream encryptedKeySource) { - final PublicKey key = CryptUtils.loadGATKDistributedPublicKey(); - final byte[] fromDisk = IOUtils.readStreamIntoByteArray(encryptedKeySource); - final byte[] decrypted = CryptUtils.decryptData(fromDisk, key); - return new String(decrypted); - } - - /** - * Get the decrypted AWS key sorted in the resource directories of name - * @param name the name of the file containing the needed AWS key - * @return a non-null GATK - */ - @Requires("name != null") - @Ensures("result != null") - private static String 
getAWSKey(final String name) { - final Resource resource = new Resource(name, GATKRunReport.class); - return decryptAWSKey(resource.getResourceContentsAsStream()); - } - - /** - * Get the AWS access key for the GATK user - * @return a non-null AWS access key for the GATK user - */ - @Ensures("result != null") - protected static String getAWSUploadAccessKey() { - return getAWSKey("resources/GATK_AWS_access.key"); - } - - /** - * Get the AWS secret key for the GATK user - * @return a non-null AWS secret key for the GATK user - */ - @Ensures("result != null") - protected static String getAWSUploadSecretKey() { - return getAWSKey("resources/GATK_AWS_secret.key"); - } - - /** - * Check that the AWS keys can be decrypted and are what we expect them to be - * - * @throws ReviewedGATKException if anything goes wrong - */ - public static void checkAWSAreValid() { - try { - final String accessKeyMD5 = Utils.calcMD5(getAWSUploadAccessKey()); - final String secretKeyMD5 = Utils.calcMD5(getAWSUploadSecretKey()); - - if ( ! AWS_ACCESS_KEY_MD5.equals(accessKeyMD5) ) { - throw new ReviewedGATKException("Invalid AWS access key found, expected MD5 " + AWS_ACCESS_KEY_MD5 + " but got " + accessKeyMD5); - } - if ( ! 
AWS_SECRET_KEY_MD5.equals(secretKeyMD5) ) { - throw new ReviewedGATKException("Invalid AWS secret key found, expected MD5 " + AWS_SECRET_KEY_MD5 + " but got " + secretKeyMD5); - } - - } catch ( Exception e ) { - throw new ReviewedGATKException("Couldn't decrypt AWS keys, something is wrong with the GATK distribution"); - } - } - - /** - * Get an initialized S3Service for use in communicating with AWS/s3 - * - * @param awsAccessKey our AWS access key to use - * @param awsSecretKey our AWS secret key to use - * @return an initialized S3Service object that can be immediately used to interact with S3 - * @throws S3ServiceException - */ - @Requires({"awsAccessKey != null", "awsSecretKey != null"}) - @Ensures("result != null") - protected static S3Service initializeAWSService(final String awsAccessKey, final String awsSecretKey) throws S3ServiceException { - // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP - // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t. - final AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey); - return new RestS3Service(awsCredentials); - } - - /** - * A runnable that pushes this GATKReport up to s3. - * - * Should be run in a separate thread so we can time it out if something is taking too long - */ - private class S3PutRunnable implements Runnable { - /** Was the upload operation successful? 
*/ - public final AtomicBoolean isSuccess; - /** The name of this report */ - private final String filename; - /** The contents of this report */ - private final byte[] contents; - - /** The s3Object that we created to upload, or null if it failed */ - public S3Object s3Object = null; - - @Requires({"filename != null", "contents != null"}) - public S3PutRunnable(final String filename, final byte[] contents){ - this.isSuccess = new AtomicBoolean(); - this.filename = filename; - this.contents = contents; - } - - public void run() { - try { - switch ( awsMode ) { - case FAIL_WITH_EXCEPTION: - throw new IllegalStateException("We are throwing an exception for testing purposes"); - case TIMEOUT: - try { - Thread.sleep(s3PutTimeOutInMilliseconds * 100); - } catch ( InterruptedException e ) { - // supposed to be empty - } - break; - case NORMAL: - // IAM GATK user credentials -- only right is to PutObject into broad.gsa.gatk.run.reports bucket - final S3Service s3Service = initializeAWSService(getAWSUploadAccessKey(), getAWSUploadSecretKey()); - - // Create an S3Object based on a file, with Content-Length set automatically and - // Content-Type set based on the file's extension (using the Mimetypes utility class) - final S3Object fileObject = new S3Object(filename, contents); - //logger.info("Created S3Object" + fileObject); - //logger.info("Uploading " + localFile + " to AWS bucket"); - s3Object = s3Service.putObject(getS3ReportBucket(), fileObject); - isSuccess.set(true); - break; - default: - throw new IllegalStateException("Unexpected AWS exception"); - } - } catch ( S3ServiceException e ) { - exceptDuringRunReport("S3 exception occurred", e); - } catch ( NoSuchAlgorithmException e ) { - exceptDuringRunReport("Couldn't calculate MD5", e); - } catch ( IOException e ) { - exceptDuringRunReport("Couldn't read report file", e); - } catch ( Exception e ) { - exceptDuringRunReport("An unexpected exception occurred during posting", e); - } - } - } - - /** - * Post this GATK 
report to the AWS s3 GATK_Run_Report log - * - * @return the s3Object pointing to our pushed report, or null if we failed to push - */ - protected S3Object postReportToAWSS3() { - // modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html - this.hostName = Utils.resolveHostname(); // we want to fill in the host name - final String key = getReportFileName(); - logger.debug("Generating GATK report to AWS S3 with key " + key); - - try { - // create an byte output stream so we can capture the output as a byte[] - final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(8096); - final OutputStream outputStream = new GZIPOutputStream(byteStream); - postReportToStream(outputStream); - outputStream.close(); - final byte[] report = byteStream.toByteArray(); - - // stop us from printing the annoying, and meaningless, mime types warning - final Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class); - mimeTypeLogger.setLevel(Level.FATAL); - - // Set the S3 upload on its own thread with timeout: - final S3PutRunnable s3run = new S3PutRunnable(key,report); - final Thread s3thread = new Thread(s3run); - s3thread.setDaemon(true); - s3thread.setName("S3Put-Thread"); - s3thread.start(); - - s3thread.join(s3PutTimeOutInMilliseconds); - - if(s3thread.isAlive()){ - s3thread.interrupt(); - exceptDuringRunReport("Run statistics report upload to AWS S3 timed-out"); - } else if(s3run.isSuccess.get()) { - logger.info("Uploaded run statistics report to AWS S3"); - logger.debug("Uploaded to AWS: " + s3run.s3Object); - return s3run.s3Object; - } else { - // an exception occurred, the thread should have already invoked the exceptDuringRunReport function - } - } catch ( IOException e ) { - exceptDuringRunReport("Couldn't read report file", e); - } catch ( InterruptedException e) { - exceptDuringRunReport("Run statistics report upload interrupted", e); - } - - return null; - } - - // 
--------------------------------------------------------------------------- - // - // Error handling code - // - // --------------------------------------------------------------------------- - - /** - * Note that an exception occurred during creating or writing this report - * @param msg the message to print - * @param e the exception that occurred - */ - @Ensures("exceptionOccurredDuringPost()") - private void exceptDuringRunReport(final String msg, final Throwable e) { - this.errorMessage = msg; - this.errorThrown = e; - logger.debug("A problem occurred during GATK run reporting [*** everything is fine, but no report could be generated; please do not post this to the support forum ***]. Message is: " + msg + ". Error message is: " + e.getMessage()); - } - - /** - * Note that an exception occurred during creating or writing this report - * @param msg the message to print - */ - @Ensures("exceptionOccurredDuringPost()") - private void exceptDuringRunReport(final String msg) { - this.errorMessage = msg; - logger.debug("A problem occurred during GATK run reporting [*** everything is fine, but no report could be generated; please do not post this to the support forum ***]. Message is " + msg); - } - - /** - * Did an error occur during the posting of this run report? - * @return true if so, false if not - */ - public boolean exceptionOccurredDuringPost() { - return getErrorMessage() != null; - } - - /** - * If an error occurred during posting of this report, retrieve the message of the error that occurred, or null if - * no error occurred - * @return a string describing the error that occurred, or null if none did - */ - public String getErrorMessage() { - return errorMessage; - } - - /** - * Get the throwable that caused the exception during posting of this message, or null if none was available - * - * Note that getting a null valuable from this function doesn't not imply that no error occurred. Some - * errors that occurred many not have generated a throwable. 
- * - * @return the Throwable that caused the error, or null if no error occurred or was not caused by a throwable - */ - public Throwable getErrorThrown() { - return errorThrown; - } - - /** - * Helper method to format the exception that occurred during posting, or a string saying none occurred - * @return a non-null string - */ - @Ensures("result != null") - protected String formatError() { - return exceptionOccurredDuringPost() - ? String.format("Exception message=%s with cause=%s", getErrorMessage(), getErrorThrown()) - : "No exception occurred"; - } - - // --------------------------------------------------------------------------- - // - // Equals and hashcode -- purely for comparing reports for testing - // - // --------------------------------------------------------------------------- - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - GATKRunReport that = (GATKRunReport) o; - - if (maxMemory != that.maxMemory) return false; - if (nIterations != that.nIterations) return false; - if (numThreads != that.numThreads) return false; - if (runTime != that.runTime) return false; - if (totalMemory != that.totalMemory) return false; - if (endTime != null ? !endTime.equals(that.endTime) : that.endTime != null) return false; - if (hostName != null ? !hostName.equals(that.hostName) : that.hostName != null) return false; - if (id != null ? !id.equals(that.id) : that.id != null) return false; - if (javaVersion != null ? !javaVersion.equals(that.javaVersion) : that.javaVersion != null) return false; - if (mException != null ? !mException.equals(that.mException) : that.mException != null) return false; - if (machine != null ? !machine.equals(that.machine) : that.machine != null) return false; - if (percentTimeBlocking != null ? !percentTimeBlocking.equals(that.percentTimeBlocking) : that.percentTimeBlocking != null) - return false; - if (percentTimeRunning != null ? 
!percentTimeRunning.equals(that.percentTimeRunning) : that.percentTimeRunning != null) - return false; - if (percentTimeWaiting != null ? !percentTimeWaiting.equals(that.percentTimeWaiting) : that.percentTimeWaiting != null) - return false; - if (percentTimeWaitingForIO != null ? !percentTimeWaitingForIO.equals(that.percentTimeWaitingForIO) : that.percentTimeWaitingForIO != null) - return false; - if (startTime != null ? !startTime.equals(that.startTime) : that.startTime != null) return false; - if (svnVersion != null ? !svnVersion.equals(that.svnVersion) : that.svnVersion != null) return false; - if (tag != null ? !tag.equals(that.tag) : that.tag != null) return false; - if (userName != null ? !userName.equals(that.userName) : that.userName != null) return false; - if (walkerName != null ? !walkerName.equals(that.walkerName) : that.walkerName != null) return false; - - return true; - } - - @Override - public int hashCode() { - int result = id != null ? id.hashCode() : 0; - result = 31 * result + (mException != null ? mException.hashCode() : 0); - result = 31 * result + (startTime != null ? startTime.hashCode() : 0); - result = 31 * result + (endTime != null ? endTime.hashCode() : 0); - result = 31 * result + (int) (runTime ^ (runTime >>> 32)); - result = 31 * result + (walkerName != null ? walkerName.hashCode() : 0); - result = 31 * result + (svnVersion != null ? svnVersion.hashCode() : 0); - result = 31 * result + (int) (totalMemory ^ (totalMemory >>> 32)); - result = 31 * result + (int) (maxMemory ^ (maxMemory >>> 32)); - result = 31 * result + (userName != null ? userName.hashCode() : 0); - result = 31 * result + (hostName != null ? hostName.hashCode() : 0); - result = 31 * result + (javaVersion != null ? javaVersion.hashCode() : 0); - result = 31 * result + (machine != null ? machine.hashCode() : 0); - result = 31 * result + (int) (nIterations ^ (nIterations >>> 32)); - result = 31 * result + (tag != null ? 
tag.hashCode() : 0); - result = 31 * result + numThreads; - result = 31 * result + (percentTimeRunning != null ? percentTimeRunning.hashCode() : 0); - result = 31 * result + (percentTimeWaiting != null ? percentTimeWaiting.hashCode() : 0); - result = 31 * result + (percentTimeBlocking != null ? percentTimeBlocking.hashCode() : 0); - result = 31 * result + (percentTimeWaitingForIO != null ? percentTimeWaitingForIO.hashCode() : 0); - return result; - } - - // --------------------------------------------------------------------------- - // - // Code specifically for testing the GATKRunReport - // - // --------------------------------------------------------------------------- - - /** - * Enum specifying how the S3 uploader should behave. Must be normal by default. Purely for testing purposes - */ - protected enum AWSMode { - NORMAL, // write normally to AWS - FAIL_WITH_EXCEPTION, // artificially fail during writing - TIMEOUT // sleep, so we time out - } - /** Our AWS mode */ - private AWSMode awsMode = AWSMode.NORMAL; - /** The bucket were we send the GATK report on AWS/s3 */ - private String s3ReportBucket = REPORT_BUCKET_NAME; - /** Did we send the report to AWS? */ - private boolean wentToAWS = false; - - /** - * Send the report to the AWS test bucket -- for testing only - */ - protected void sendAWSToTestBucket() { - s3ReportBucket = TEST_REPORT_BUCKET_NAME; - } - - /** - * Has the report been written to AWS? - * - * Does not imply anything about the success of the send, just that it was attempted - * - * @return true if the report has been sent to AWS, false otherwise - */ - protected boolean wentToAWS() { - return wentToAWS; - } - - /** - * Purely for testing purposes. 
Tells the AWS uploader whether to actually upload or simulate errors - * @param mode what we want to do - */ - @Requires("mode != null") - protected void setAwsMode(final AWSMode mode) { - this.awsMode = mode; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RODRecordListImpl.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RODRecordListImpl.java deleted file mode 100644 index 7296c39ae..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RODRecordListImpl.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.refdata; - -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.HasGenomeLocation; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Sep 10, 2009 - * Time: 6:10:48 PM - * To change this template use File | Settings | File Templates. - */ -public class RODRecordListImpl extends AbstractList implements Comparable, Cloneable, RODRecordList, HasGenomeLocation { - private List records; - private GenomeLoc location = null; - private String name = null; - - public RODRecordListImpl(String name) { - records = new ArrayList(); - this.name = name; - } - - /** - * Fully qualified constructor: instantiates a new GATKFeatureRecordList object with specified GATKFeature track name, location on the - * reference, and list of associated GATKFeatures. This is a knee-deep COPY constructor: passed name, loc, and data element - * objects will be referenced from the created GATKFeatureRecordList (so that changing them from outside will affect data - * in this object), however, the data elements will be copied into a newly - * allocated list, so that the 'data' collection argument can be modified afterwards without affecting the state - * of this record list. WARNING: this constructor is (semi-)validating: passed name and location - * are allowed to be nulls (although it maybe unsafe, use caution), but if they are not nulls, then passed non-null GATKFeature data - * elements must have same track name, and their locations must overlap with the passed 'location' argument. Null - * data elements or null 'data' collection argument are allowed as well. 
- * @param name the name of the track - * @param data the collection of features at this location - * @param loc the location - */ - public RODRecordListImpl(String name, Collection data, GenomeLoc loc) { - this.records = new ArrayList(data==null?0:data.size()); - this.name = name; - this.location = loc; - if ( data == null || data.size() == 0 ) return; // empty dataset, nothing to do - for ( GATKFeature r : data ) { - records.add(r); - if ( r == null ) continue; - if ( ! this.name.equals(r.getName() ) ) { - throw new ReviewedGATKException("Attempt to add GATKFeature with non-matching name "+r.getName()+" to the track "+name); - } - if ( location != null && ! location.overlapsP(r.getLocation()) ) { - throw new ReviewedGATKException("Attempt to add GATKFeature that lies outside of specified interval "+location+"; offending GATKFeature:\n"+r.toString()); - } - } - } - - - public GenomeLoc getLocation() { return location; } - public String getName() { return name; } - public Iterator iterator() { return records.iterator() ; } - public void clear() { records.clear(); } - public boolean isEmpty() { return records.isEmpty(); } - - public boolean add(GATKFeature record) { add(record, false); return true;} - - @Override - public GATKFeature get(int i) { - return records.get(i); - } - - public void add(GATKFeature record, boolean allowNameMismatch) { - if ( record != null ) { - if ( ! allowNameMismatch && ! name.equals(record.getName() ) ) - throw new ReviewedGATKException("Attempt to add GATKFeature with non-matching name "+record.getName()+" to the track "+name); - } - records.add(record); - } - - public void add(RODRecordList records ) { add( records, false ); } - - public void add(RODRecordList records, boolean allowNameMismatch) { - for ( GATKFeature record : records ) - add(record, allowNameMismatch); - } - - public int size() { return records.size() ; } - - /** - * Compares this object with the specified object for order. 
Returns a - * negative integer, zero, or a positive integer as this object is less - * than, equal to, or greater than the specified object. - * - * @param that the object to be compared. - * @return a negative integer, zero, or a positive integer as this object - * is less than, equal to, or greater than the specified object. - * @throws ClassCastException if the specified object's type prevents it - * from being compared to this object. - */ - public int compareTo(RODRecordList that) { - return getLocation().compareTo(that.getLocation()); //To change body of implemented methods use File | Settings | File Templates. - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RefMetaDataTracker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RefMetaDataTracker.java deleted file mode 100644 index 7ccf6e572..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/RefMetaDataTracker.java +++ /dev/null @@ -1,497 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.*; - -/** - * This class represents the Reference Metadata available at a particular site in the genome. It can be - * used to conveniently lookup the RMDs at this site, as well just getting a list of all of the RMDs - * - * The standard interaction model is: - * - * Traversal system arrives at a site, which has a bunch of RMDs covering it - * Traversal passes creates a tracker and passes it to the walker - * walker calls get(rodBinding) to obtain the RMDs values at this site for the track - * associated with rodBinding. - * - * Note that this is an immutable class. 
Once created the underlying data structures - * cannot be modified - * - * User: mdepristo - * Date: Apr 3, 2009 - * Time: 3:05:23 PM - */ -public class RefMetaDataTracker { - // TODO: this should be a list, not a bindings, actually - private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY"); - - final Map bindings; - final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); - public final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker(); - - // ------------------------------------------------------------------------------------------ - // - // - // Special ENGINE interaction functions - // - // - // ------------------------------------------------------------------------------------------ - - /** - * Create an tracker with no bindings - */ - public RefMetaDataTracker() { - bindings = Collections.emptyMap(); - } - - public RefMetaDataTracker(final Collection allBindings) { - // set up the bindings - if ( allBindings.isEmpty() ) - bindings = Collections.emptyMap(); - else { - final Map tmap = new HashMap(allBindings.size()); - for ( RODRecordList rod : allBindings ) { - if ( rod != null && ! rod.isEmpty() ) - tmap.put(canonicalName(rod.getName()), rod); - } - - // ensure that no one modifies the bindings itself - bindings = Collections.unmodifiableMap(tmap); - } - } - - // ------------------------------------------------------------------------------------------ - // - // - // Generic accessors - // - // - // ------------------------------------------------------------------------------------------ - - /** - * Gets all of the Tribble features spanning this locus, returning them as a list of specific - * type T extending Feature. This function looks across all tracks to find the Features, so - * if you have two tracks A and B each containing 1 Feature, then getValues will return - * a list containing both features. 
- * - * Note that this function assumes that all of the bound features are instances of or - * subclasses of T. A ClassCastException will occur if this isn't the case. If you want - * to get all Features without any danger of such an exception use the root Tribble - * interface Feature. - * - * @param type The type of the underlying objects bound here - * @param as above - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"type != null"}) - @Ensures("result != null") - public List getValues(final Class type) { - return addValues(bindings.keySet(), type, new ArrayList(), null, false, false); - } - - /** - * Provides the same functionality as @link #getValues(Class) but will only include - * Features that start as the GenomeLoc provide onlyAtThisLoc. - * - * @param type The type of the underlying objects bound here - * @param onlyAtThisLoc - * @param as above - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"type != null", "onlyAtThisLoc != null"}) - @Ensures("result != null") - public List getValues(final Class type, final GenomeLoc onlyAtThisLoc) { - return addValues(bindings.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); - } - - /** - * Uses the same logic as @link #getValues(Class) but arbitrary select one of the resulting - * elements of the list to return. That is, if there would be two elements in the result of - * @link #getValues(Class), one of these two is selected, and which one it will be isn't - * specified. Consequently, this method is only really safe if (1) you absolutely know - * that only one binding will meet the constraints of @link #getValues(Class) or (2) - * you truly don't care which of the multiple bindings available you are going to examine. 
- * - * If there are no bindings here, getFirstValue() return null - * - * @param type The type of the underlying objects bound here - * @param as above - * @return A random single element the RODs bound here, or null if none are bound. - */ - @Requires({"type != null"}) - public T getFirstValue(final Class type) { - return safeGetFirst(getValues(type)); - } - - /** - * Uses the same logic as @link #getValue(Class,GenomeLoc) to determine the list - * of eligible Features and @link #getFirstValue(Class) to select a single - * element from the interval list. - * - * @param type The type of the underlying objects bound here - * @param as above - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A random single element the RODs bound here starting at onlyAtThisLoc, or null if none are bound. - */ - @Requires({"type != null", "onlyAtThisLoc != null"}) - public T getFirstValue(final Class type, final GenomeLoc onlyAtThisLoc) { - return safeGetFirst(getValues(type, onlyAtThisLoc)); - } - - /** - * Same logic as @link #getFirstValue(RodBinding, boolean) but prioritizes records from prioritizeThisLoc if available - * - * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @param prioritizeThisLoc only Features starting at this site are considered - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. 
- */ - @Requires({"rodBindings != null", "prioritizeThisLoc != null"}) - @Ensures("result != null") - public List getPrioritizedValue(final Collection> rodBindings, final GenomeLoc prioritizeThisLoc) { - final List results = new ArrayList<>(); - - for ( final RodBinding rodBinding : rodBindings ) { - - // if there's a value at the prioritized location, take it - T value = getFirstValue(rodBinding, prioritizeThisLoc); - - // otherwise, grab any one - if ( value == null ) - value = getFirstValue(rodBinding); - - // add if not null - if ( value != null ) - results.add(value); - } - - return results; - } - - /** - * Gets all of the Tribble features bound to RodBinding spanning this locus, returning them as - * a list of specific type T extending Feature. - * - * Note that this function assumes that all of the bound features are instances of or - * subclasses of T. A ClassCastException will occur if this isn't the case. - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"rodBinding != null"}) - @Ensures("result != null") - public List getValues(final RodBinding rodBinding) { - return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), null, false, false); - } - - /** - * Gets all of the Tribble features bound to any RodBinding in rodBindings, - * spanning this locus, returning them as a list of specific type T extending Feature. - * - * Note that this function assumes that all of the bound features are instances of or - * subclasses of T. A ClassCastException will occur if this isn't the case. 
- * - * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"rodBindings != null"}) - @Ensures("result != null") - public List getValues(final Collection> rodBindings) { - List results = new ArrayList(1); - for ( RodBinding rodBinding : rodBindings ) - results.addAll(getValues(rodBinding)); - return results; - } - - /** - * The same logic as @link #getValues(RodBinding) but enforces that each Feature start at onlyAtThisLoc - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. - */ - @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) - @Ensures("result != null") - public List getValues(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { - return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), onlyAtThisLoc, true, false); - } - - /** - * The same logic as @link #getValues(List) but enforces that each Feature start at onlyAtThisLoc - * - * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched - * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. 
- */ - @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) - @Ensures("result != null") - public List getValues(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { - List results = new ArrayList(1); - for ( RodBinding rodBinding : rodBindings ) - results.addAll(getValues(rodBinding, onlyAtThisLoc)); - return results; - } - - /** - * Uses the same logic as @getValues(RodBinding) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param as above - * @return A random single element the eligible Features found, or null if none are bound. - */ - @Requires({"rodBinding != null"}) - public T getFirstValue(final RodBinding rodBinding) { - return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), null, false, true)); - } - - /** - * Uses the same logic as @getValues(RodBinding, GenomeLoc) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. - * - * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched - * @param as above - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A random single element the eligible Features found, or null if none are bound. - */ - @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) - public T getFirstValue(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { - return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), onlyAtThisLoc, true, true)); - } - - /** - * Uses the same logic as @getValues(List) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. 
- * - * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched - * @param as above - * @return A random single element the eligible Features found, or null if none are bound. - */ - @Requires({"rodBindings != null"}) - public T getFirstValue(final Collection> rodBindings) { - for ( RodBinding rodBinding : rodBindings ) { - T val = getFirstValue(rodBinding); - if ( val != null ) - return val; - } - return null; - } - - /** - * Uses the same logic as @getValues(RodBinding,GenomeLoc) to determine the list - * of eligible Features and select a single element from the resulting set - * of eligible features. - * - * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched - * @param as above - * @param onlyAtThisLoc only Features starting at this site are considered - * @return A random single element the eligible Features found, or null if none are bound. - */ - @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) - public T getFirstValue(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { - for ( RodBinding rodBinding : rodBindings ) { - T val = getFirstValue(rodBinding, onlyAtThisLoc); - if ( val != null ) - return val; - } - return null; - } - - /** - * Is there a binding at this site to a ROD/track with the specified name? - * - * @param rodBinding the rod binding we want to know about - * @return true if any Features are bound in this tracker to rodBinding - */ - @Requires({"rodBinding != null"}) - public boolean hasValues(final RodBinding rodBinding) { - return bindings.containsKey(canonicalName(rodBinding.getName())); - } - - /** - * Get all of the RMD tracks at the current site. Each track is returned as a single compound - * object (RODRecordList) that may contain multiple RMD records associated with the current site. 
- * - * @return List of all tracks - */ - public List getBoundRodTracks() { - return new ArrayList(bindings.values()); - } - - /** - * The number of tracks with at least one value bound here - * @return the number of tracks with at least one bound Feature - */ - public int getNTracksWithBoundFeatures() { - return bindings.size(); - } - - // ------------------------------------------------------------------------------------------ - // Protected accessors using strings for unit testing - // ------------------------------------------------------------------------------------------ - - protected boolean hasValues(final String name) { - return bindings.containsKey(canonicalName(name)); - } - - protected List getValues(final Class type, final String name) { - return addValues(name, type, new ArrayList(), getTrackDataByName(name), null, false, false); - } - - protected List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { - return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); - } - - protected T getFirstValue(final Class type, final String name) { - return safeGetFirst(getValues(type, name)); - } - - protected T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { - return safeGetFirst(getValues(type, name, onlyAtThisLoc)); - } - - // ------------------------------------------------------------------------------------------ - // - // - // Private utility functions - // - // - // ------------------------------------------------------------------------------------------ - - /** - * Helper function for getFirst() operations that takes a list of and - * returns the first element, or null if no such element exists. - * - * @param l - * @param - * @return - */ - @Requires({"l != null"}) - private T safeGetFirst(final List l) { - return l.isEmpty() ? 
null : l.get(0); - } - - private List addValues(final Collection names, - final Class type, - List values, - final GenomeLoc curLocation, - final boolean requireStartHere, - final boolean takeFirstOnly ) { - for ( String name : names ) { - RODRecordList rodList = getTrackDataByName(name); // require that the name is an exact match - values = addValues(name, type, values, rodList, curLocation, requireStartHere, takeFirstOnly ); - if ( takeFirstOnly && ! values.isEmpty() ) - break; - } - - return values; - } - - - - private List addValues(final String name, - final Class type, - List values, - final RODRecordList rodList, - final GenomeLoc curLocation, - final boolean requireStartHere, - final boolean takeFirstOnly ) { - for ( GATKFeature rec : rodList ) { - if ( ! requireStartHere || rec.getLocation().getStart() == curLocation.getStart() ) { // ok, we are going to keep this thing - Object obj = rec.getUnderlyingObject(); - if (!(type.isAssignableFrom(obj.getClass()))) - throw new UserException.CommandLineException("Unable to cast track named " + name + " to type of " + type.toString() - + " it's of type " + obj.getClass()); - - T objT = (T)obj; - if ( takeFirstOnly ) { - if ( values == null ) - values = Arrays.asList(objT); - else - values.add(objT); - - break; - } else { - if ( values == null ) - values = new ArrayList(); - values.add(objT); - } - } - } - - return values == null ? Collections.emptyList() : values; - } - - /** - * Finds the reference metadata track named 'name' and returns all ROD records from that track associated - * with the current site as a RODRecordList List object. If no data track with specified name is available, - * returns defaultValue wrapped as RODRecordList object. 
NOTE: if defaultValue is null, it will be wrapped up - * with track name set to 'name' and location set to null; otherwise the wrapper object will have name and - * location set to defaultValue.getID() and defaultValue.getLocation(), respectively (use caution, - * defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise: - * for instance, on locus traversal, location is usually expected to be a single base we are currently looking at, - * regardless of the presence of "extended" RODs overlapping with that location). - * @param name track name - * @return track data for the given rod - */ - private RODRecordList getTrackDataByName(final String name) { - final String luName = canonicalName(name); - RODRecordList l = bindings.get(luName); - return l == null ? EMPTY_ROD_RECORD_LIST : l; - } - - private RODRecordList getTrackDataByName(final RodBinding binding) { - return getTrackDataByName(binding.getName()); - } - - /** - * Returns the canonical name of the rod name (lowercases it) - * @param name the name of the rod - * @return canonical name of the rod - */ - private String canonicalName(final String name) { - // todo -- remove me after switch to RodBinding syntax - return name.toLowerCase(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceDependentFeatureCodec.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceDependentFeatureCodec.java deleted file mode 100644 index 9bff00dd8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceDependentFeatureCodec.java +++ /dev/null @@ -1,43 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation 
the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; - -import htsjdk.tribble.FeatureCodec; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -/** - * An interface marking that a given Tribble feature/codec is actually dependent on context within the - * reference, rather than having a dependency only on the contig, start, and stop of the given feature. - * A HACK. Tribble should contain all the information in needs to decode the unqualified position of - * a feature. - */ -public interface ReferenceDependentFeatureCodec { - /** - * Sets the appropriate GenomeLocParser, providing additional context when decoding larger and more variable features. - * @param genomeLocParser The parser to supply. 
- */ - public void setGenomeLocParser(GenomeLocParser genomeLocParser); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceOrderedDatum.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceOrderedDatum.java deleted file mode 100644 index 95de83208..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/ReferenceOrderedDatum.java +++ /dev/null @@ -1,66 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; - -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.HasGenomeLocation; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; - -/** - * Created by IntelliJ IDEA. 
- * User: mdepristo - * Date: Feb 27, 2009 - * Time: 10:49:47 AM - * To change this template use File | Settings | File Templates. - */ -public interface ReferenceOrderedDatum extends Comparable, HasGenomeLocation { - public String getName(); - public boolean parseLine(final Object header, final String[] parts) throws IOException; - public String toString(); - public String toSimpleString(); - public String repl(); - - /** - * Used by the ROD system to determine how to split input lines - * @return Regex string delimiter separating fields - */ - public String delimiterRegex(); - - public GenomeLoc getLocation(); - public int compareTo( ReferenceOrderedDatum that ); - - /** - * Backdoor hook to read header, meta-data, etc. associated with the file. Will be - * called by the ROD system before streaming starts - * - * @param source source data file on disk from which this rod stream will be pulled - * @return a header object that will be passed to parseLine command - */ - public Object initialize(final File source) throws FileNotFoundException; -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/SeekableRODIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/SeekableRODIterator.java deleted file mode 100644 index 4126214cf..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/SeekableRODIterator.java +++ /dev/null @@ -1,412 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above 
copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.engine.iterators.PushbackIterator; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -/** - * Wrapper class for iterators over ROD objects. It is assumed that the underlying iterator can only - * perform standard next() operation, which advances it to the next ROD in the stream (i.e. reads the data file - * line by line). This iterator 1) shifts the focus from record-based traversal to position-based traversal, - * and 2) adds querying seekForward() method. 
- * - * Namely, this iterator's next() method advances not to the next ROD in the underlying stream, but to the next - * genomic position covered by (at least one) ROD, and returns all RODs overlapping with that position as a RODRecordList - * collection-like object. Similarly, when seekForward(interval) is called, this iterator skips all the RODs from the - * underlying stream, until it reaches specified genomic interval, and returns the list of all RODs overlapping with that interval. - * - * NOTE: this iterator has a STATE: next() operation is not allowed after a seekForward() to a non-point (extended) interval - * of length > 1. Such a call would leave the iterator in an inconsistent state. seekForward() can always be called after - * either seekForward() or next() (as long as usual ordering criteria are satisfied: the query interval location can neither - * start before the current position, nor end before the previous query end). seekForward to an interval of length 1 - * reenables next() operation. - * - * Created by IntelliJ IDEA. - * User: asivache - * Date: Sep 10, 2009 - * Time: 6:20:46 PM - * To change this template use File | Settings | File Templates. - */ -public class SeekableRODIterator implements LocationAwareSeekableRODIterator { - /** - * Header for the datasource backing this iterator. - */ - private final Object header; - - /** - * The parser, used to construct new genome locs. - */ - private final GenomeLocParser parser; - - private final SAMSequenceDictionary sequenceDictionary; - - private PushbackIterator it; - List records = null; // here we will keep a pile of records overlaping with current position; when we iterate - // and step out of record's scope, we purge it from the list - String name = null; // name of the ROD track wrapped by this iterator. Will be pulled from underlying iterator. 
- - int curr_position = 0; // where the iterator is currently positioned on the genome - int max_position = 0; // the rightmost stop position of currently loaded records - String curr_contig = null; // what contig the iterator is currently on - boolean next_is_allowed = true; // see discussion below. next() is illegal after seek-forward queries of length > 1 - - // the stop position of the last query. We can query only in forward direction ("seek forward"); - // it is not only the start position of every successive query that can not be before the start - // of the previous one (curr_start), but it is also illegal for a query interval to *end* before - // the end of previous query, otherwise we can end up in an inconsistent state - int curr_query_end = -1; - - // EXAMPLE of inconsistency curr_query_end guards against: - // record 1 record 2 - // ---------- ----------- - // -------------------------------------------------- REF - // ------------------------- query 1 (interval 1) - // ---------- query 2 (interval 2) - // --------------- query 3 - // - // If we query first for interval 1, both record 1 and record 2 will be loaded. - // Query for interval 2, on the other hand, should return only record 1, but after - // query 1 was performed, record 2 is already loaded from the file. If, on the other hand, - // we try to un-load it from memory, we won't be able to read it again. Hence query 2 is not - // allowed after query 1. Note also, that curr_query_end is not equivalent to max_position: - // the latter only tracks where currently loaded records end (and hence helps to re-load records); - // after query 1 is performed, max_position will be the end of record 2, but query 3 is still - // perfectly legal after query 1. - // - // IMPORTANT NOTE: it follows from the above discussion and example that next() is illegal after ANY - // seek-forward query EXCEPT those that are performed with length-1 intervals (queryInterval.start=queryinteval.stop). 
- // Indeed, in the example above, after, e.g., query 1 is performed, the iterator is "located" at the start - // of interval 1, but record1 and record 2 are already loaded. On the other hand, a subsequent call to next() would - // need to shift iterator's position by 1 base and return only record 1. - // - // This implementation tracks the query history and makes next() illegal after a seekforward query of length > 1, - // but re-enables next() again after a length-1 query. - - public SeekableRODIterator(Object header,SAMSequenceDictionary rodDictionary,SAMSequenceDictionary referenceDictionary,GenomeLocParser parser,CloseableIterator it) { - this.header = header; - this.parser = parser; - this.sequenceDictionary = rodDictionary; - this.it = new PushbackIterator(it); - records = new LinkedList(); - // the following is a trick: we would like the iterator to know the actual name assigned to - // the ROD implementing object we are working with. But the only way to do that is to - // get an instance of that ROD and query it for its name. Now, the only generic way we have at this point to instantiate - // the ROD is to make the underlying stream iterator to do it for us. So we are reading (or rather peeking into) - // the first line of the track data file just to get the ROD object created. - GATKFeature r = null; - if (this.it.hasNext()) r = this.it.element(); - name = (r==null?null:r.getName()); - - curr_contig = referenceDictionary.getSequence(0).getSequenceName(); - } - - /** - * Gets the header associated with the backing input stream. - * @return the ROD header. - */ - @Override - public Object getHeader() { - return header; - } - - /** - * Gets the sequence dictionary associated with the backing input stream. - * @return sequence dictionary from the ROD header. 
- */ - @Override - public SAMSequenceDictionary getSequenceDictionary() { - return sequenceDictionary; - } - - - /** - * Returns true if the data we iterate over has records associated with (any, not necessarily adjacent) - * genomic position farther along the reference. - * @return - */ - public boolean hasNext() { - - // if we did not walk to the very end of the interval(s) covered by currently loaded - // annotations (records), then we definitely have data for next genomic location - if ( curr_position < max_position ) return true; - - // we are past currently loaded stuff; we have next if there are more lines to load: - return it.hasNext(); - } - - // Returns point location (i.e. genome loc of length 1) on the reference, to which this iterator will advance - // upon next call to next(). - public GenomeLoc peekNextLocation() { - if ( curr_position + 1 <= max_position ) return parser.createGenomeLoc(curr_contig,curr_position+1); - - // sorry, next reference position is not covered by the RODs we are currently holding. In this case, - // the location we will jump to upon next call to next() is the start of the next ROD record that we did - // not read yet: - if ( it.hasNext() ) { - GATKFeature r = it.element(); // peek, do not load! - return parser.createGenomeLoc(r.getLocation().getContig(),r.getLocation().getStart()); - } - return null; // underlying iterator has no more records, there is no next location! - } - - /** Advances iterator to the next genomic position that has ROD record(s) associated with it, - * and returns all the records overlapping with that position as a RODList. The location of the whole - * RODList object will be set to the smallest interval subsuming genomic intervals of all returned records. - * Note that next() is disabled (will throw an exception) after seekForward() operation with query length > 1. - * @return list of all RODs overlapping with the next "covered" genomic position - */ - public RODRecordList next() { - if ( ! 
next_is_allowed ) - throw new ReviewedGATKException("Illegal use of iterator: Can not advance iterator with next() after seek-forward query of length > 1"); - - curr_position++; - // curr_query_end = -1; - - if ( curr_position <= max_position ) { - - // we still have bases covered by at least one currently loaded record; - // we have to purge only subset of records, on which we moved past the end - purgeOutOfScopeRecords(); - } else { - // ooops, we are past the end of all loaded records - kill them all at once, - // load next record and reinitialize by fastforwarding current position to the start of next record - records.clear(); - GATKFeature r = it.next(); // if hasNext() previously returned true, we are guaranteed that this call to reader.next() is safe - records.add( r ); - curr_contig = r.getLocation().getContig(); - curr_position = r.getLocation().getStart(); - max_position = r.getLocation().getStop(); - } - - // current position is ste and at this point 'records' only keeps those annotations, on which we did not reach the end yet - // (we might have reloaded records completely if it was necessary); but we are not guaranteed yet that we - // hold ALL the records overlapping with the current position. Time to check if we just walked into the interval(s) - // covered by new records, so we need to load them too: - - while ( it.hasNext() ) { - GATKFeature r = it.element(); - if ( r == null ) { - it.next(); - continue; - } - - GenomeLoc currentContig = parser.createOverEntireContig(curr_contig); - GenomeLoc thatContig = r.getLocation(); - - if ( currentContig.isPast(thatContig) ) - throw new UserException("LocationAwareSeekableRODIterator: contig " +r.getLocation().getContig() + - " occurs out of order in track " + r.getName() ); - if ( currentContig.isBefore(thatContig) ) break; // next record is on a higher contig, we do not need it yet... 
- - if ( r.getLocation().getStart() < curr_position ) - throw new UserException("LocationAwareSeekableRODIterator: track "+r.getName() + - " is out of coordinate order on contig "+r.getLocation() + " compared to " + curr_contig + ":" + curr_position); - - if ( r.getLocation().getStart() > curr_position ) break; // next record starts after the current position; we do not need it yet - - r = it.next(); // we got here only if we do need next record, time to load it for real - - int stop = r.getLocation().getStop(); - if ( stop < curr_position ) throw new ReviewedGATKException("DEBUG: encountered contig that should have been loaded earlier"); // this should never happen - if ( stop > max_position ) max_position = stop; // max_position keeps the rightmost stop position across all loaded records - records.add(r); - } - - // 'records' and current position are fully updated. Last, we need to set the location of the whole track - // (collection of ROD records) to the genomic site we are currently looking at, and return the list - - return new RODRecordListImpl(name,records, parser.createGenomeLoc(curr_contig,curr_position)); - } - - /** - * Removes from the underlying collection the last element returned by the - * iterator (optional operation). This method can be called only once per - * call to next. The behavior of an iterator is unspecified if - * the underlying collection is modified while the iteration is in - * progress in any way other than by calling this method. - * - * @throws UnsupportedOperationException if the remove - * operation is not supported by this Iterator. - * @throws IllegalStateException if the next method has not - * yet been called, or the remove method has already - * been called after the last call to the next - * method. - */ - public void remove() { - throw new UnsupportedOperationException("LocationAwareSeekableRODIterator does not implement remove() operation"); - } - - - /** - * Returns the current "position" (not location!! 
;) ) of this iterator. This method is used by the sharding - * system when it searches for available iterators in the pool that can be reused to resume traversal. - * When iterator is advanced using next(), current position - * is the same as 'location'. However, after a seekForward() query with extended interval, returned position - * will be set to the last position of the query interval, to disable (illegal) attempts to roll the iterator - * back and re-start traversal from current location. - * @return Current ending position of the iterator, or null if no position exists. - */ - public GenomeLoc position() { - if ( curr_contig == null ) return null; - if ( curr_query_end > curr_position ) { - // do not attempt to reuse this iterator if the position we need it for lies before the end of last query performed - return parser.createGenomeLoc(curr_contig,curr_query_end,curr_query_end); - } - else { - return parser.createGenomeLoc(curr_contig,curr_position); - } - } - - /** - * Seeks forward through the file until the specified interval is reached. - * The location object interval can be either a single point or an extended interval. All - * ROD records overlapping with the whole interval will be returned, or null if no such records exist. - * - * Query interval must start at or after the iterator's current location, or exception will be thrown. - * - * Query interval must end at or after the stop position of the previous query, if any, or an exception will - * be thrown: subsequent queries that end before the stop of previous ones are illegal. - * - * If seekForward() is performed to an extended (length > 1 i.e. start != stop) interval, next() operation becomes - * illegal (the iterator changes state). Only seekForward() calls are allowed thereafter, until a seekForward() call - * to a length-1 interval is performed, which re-enables next(). 
seekForward() queries with length-1 intervals can - * always be safely intermixed with next() (as long as ordering is respected and query intervals are at or after the - * current position). - * - * Note that in contrast to - * next() (which always advances current position of the iterator on the reference), this method scrolls - * forward ONLY if the specified interval is ahead of the current location of - * the iterator. However, if called again with the same 'interval' argument as before, seekForward will NOT - * advance, but will simply return the same ROD list as before. - * - * - * @param interval point-like genomic location to fastforward to. - * @return ROD object at (or overlapping with) the specified position, or null if no such ROD exists. - */ - public RODRecordList seekForward(GenomeLoc interval) { - - if ( interval.isBefore(parser.createOverEntireContig(curr_contig)) && - !(interval.getStart() == 0 && interval.getStop() == 0 && interval.getContig().equals(curr_contig)) ) // This criteria is syntactic sugar for 'seek to right before curr_contig' - throw new ReviewedGATKException("Out of order query: query contig "+interval.getContig()+" is located before "+ - "the iterator's current contig"); - if ( interval.getContig().equals(curr_contig) ) { - if ( interval.getStart() < curr_position ) - throw new ReviewedGATKException("Out of order query: query position "+interval +" is located before "+ - "the iterator's current position "+curr_contig + ":" + curr_position); - if ( interval.getStop() < curr_query_end ) - throw new ReviewedGATKException("Unsupported querying sequence: current query interval " + - interval+" ends before the end of previous query interval ("+curr_query_end+")"); - } - - curr_position = interval.getStart(); - curr_query_end = interval.getStop(); - - next_is_allowed = ( curr_position == curr_query_end ); // we can call next() later only if interval length is 1 - - if ( interval.getContig().equals(curr_contig) && curr_position <= 
max_position ) { - // some of the intervals we are currently keeping do overlap with the query interval - - purgeOutOfScopeRecords(); - } else { - // clean up and get ready for fast-forwarding towards the requested position - records.clear(); - max_position = -1; - curr_contig = interval.getContig(); - } - - // curr_contig and curr_position are set to where we asked to scroll to - - while ( it.hasNext() ) { - GATKFeature r = it.next(); - if ( r == null ) continue; - - GenomeLoc currentContig = parser.createOverEntireContig(curr_contig); - GenomeLoc thatContig = r.getLocation(); - - if ( currentContig.isPast(thatContig) ) continue; // did not reach requested contig yet - if ( currentContig.isBefore(thatContig) ) { - it.pushback(r); // next record is on the higher contig, we do not need it yet... - break; - } - - // we get here if we are on the requested contig: - - if ( r.getLocation().getStop() < curr_position ) continue; // did not reach the requested interval yet - - if ( r.getLocation().getStart() > curr_query_end ) { - // past the query interval - it.pushback(r); - break; - } - - // we get here only if interval of the record r overlaps with query interval, so the record should be loaded - if ( r.getLocation().getStop() > max_position ) max_position = r.getLocation().getStop(); - records.add(r); - } - - if ( records.size() > 0 ) { - return new RODRecordListImpl(name,records,interval); - } else { - return null; - } - - } - - /** - * Removes records that end before the curr_position from the list of currently kept records. This is a - * convenience (private) shortcut that does not perform extensive checking. In particular, it assumes that - * curr_position <= max_position, as well as that we are still on the same contig. 
- */ - private void purgeOutOfScopeRecords() { - Iterator i = records.iterator(); - while ( i.hasNext() ) { - GATKFeature r = i.next(); - if ( r.getLocation().getStop() < curr_position ) { - i.remove(); // we moved past the end of interval the record r is associated with, purge the record forever - } - } - - } - - @Override - public void close() { - if (this.it != null) ((CloseableIterator)this.it.getUnderlyingIterator()).close(); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/VariantContextAdaptors.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/VariantContextAdaptors.java deleted file mode 100644 index 82a826c10..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/VariantContextAdaptors.java +++ /dev/null @@ -1,399 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.refdata; - -import htsjdk.samtools.util.SequenceUtil; -import htsjdk.tribble.Feature; -import htsjdk.tribble.annotation.Strand; -import htsjdk.tribble.dbsnp.OldDbSNPFeature; -import htsjdk.tribble.gelitext.GeliTextFeature; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.codecs.hapmap.RawHapMapFeature; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import htsjdk.variant.variantcontext.*; - -import java.util.*; - -/** - * A terrible but temporary approach to converting objects to VariantContexts. If you want to add a converter, - * you need to create a adaptor object here and register a converter from your class to this object. When tribble arrives, - * we'll use a better approach. - * - * To add a new converter: - * - * create a subclass of VCAdaptor, overloading the convert operator - * add it to the static map from input type -> converter where the input type is the object.class you want to convert - * - * That's it - * - * @author depristo@broadinstitute.org - */ -public class VariantContextAdaptors { - // -------------------------------------------------------------------------------------------------------------- - // - // Generic support routines. 
Do not modify - // - // -------------------------------------------------------------------------------------------------------------- - - private static Map,VCAdaptor> adaptors = new HashMap,VCAdaptor>(); - - static { - PluginManager vcAdaptorManager = new PluginManager(VCAdaptor.class); - List adaptorInstances = vcAdaptorManager.createAllTypes(); - for(VCAdaptor adaptor: adaptorInstances) - adaptors.put(adaptor.getAdaptableFeatureType(),adaptor); - } - - public static boolean canBeConvertedToVariantContext(Object variantContainingObject) { - return adaptors.containsKey(variantContainingObject.getClass()); - } - - /** generic superclass */ - public interface VCAdaptor { - /** - * Gets the type of feature that this adaptor can 'adapt' into a VariantContext. - * @return Type of adaptable feature. Must be a Tribble feature class. - */ - Class getAdaptableFeatureType(); - VariantContext convert(String name, Object input, ReferenceContext ref); - } - - public static VariantContext toVariantContext(String name, Object variantContainingObject, ReferenceContext ref) { - if ( ! adaptors.containsKey(variantContainingObject.getClass()) ) - return null; - else { - return adaptors.get(variantContainingObject.getClass()).convert(name, variantContainingObject, ref); - } - } - - // -------------------------------------------------------------------------------------------------------------- - // - // From here below you can add adaptor classes for new rods (or other types) to convert to VC - // - // -------------------------------------------------------------------------------------------------------------- - private static class VariantContextAdaptor implements VCAdaptor { - /** - * 'Null' adaptor; adapts variant contexts to variant contexts. - * @return VariantContext. 
- */ - @Override - public Class getAdaptableFeatureType() { return VariantContext.class; } - - // already a VC, just cast and return it - @Override - public VariantContext convert(String name, Object input, ReferenceContext ref) { - return (VariantContext)input; - } - } - - // -------------------------------------------------------------------------------------------------------------- - // - // dbSNP to VariantContext - // - // -------------------------------------------------------------------------------------------------------------- - - private static class DBSnpAdaptor implements VCAdaptor { - private static boolean isSNP(OldDbSNPFeature feature) { - return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact"); - } - - private static boolean isMNP(OldDbSNPFeature feature) { - return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range"); - } - - private static boolean isInsertion(OldDbSNPFeature feature) { - return feature.getVariantType().contains("insertion"); - } - - private static boolean isDeletion(OldDbSNPFeature feature) { - return feature.getVariantType().contains("deletion"); - } - - private static boolean isIndel(OldDbSNPFeature feature) { - return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature); - } - - public static boolean isComplexIndel(OldDbSNPFeature feature) { - return feature.getVariantType().contains("in-del"); - } - - /** - * gets the alternate alleles. This method should return all the alleles present at the location, - * NOT including the reference base. This is returned as a string list with no guarantee ordering - * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest - * frequency). 
- * - * @return an alternate allele list - */ - public static List getAlternateAlleleList(OldDbSNPFeature feature) { - List ret = new ArrayList(); - for (String allele : getAlleleList(feature)) - if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele); - return ret; - } - - /** - * gets the alleles. This method should return all the alleles present at the location, - * including the reference base. The first allele should always be the reference allele, followed - * by an unordered list of alternate alleles. - * - * @return an alternate allele list - */ - public static List getAlleleList(OldDbSNPFeature feature) { - List alleleList = new ArrayList(); - // add ref first - if ( feature.getStrand() == Strand.POSITIVE ) - alleleList = Arrays.asList(feature.getObserved()); - else - for (String str : feature.getObserved()) - alleleList.add(SequenceUtil.reverseComplement(str)); - if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase()) - && !alleleList.get(0).equals(feature.getNCBIRefBase()) ) - Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0); - - return alleleList; - } - - /** - * Converts non-VCF formatted dbSNP records to VariantContext. - * @return OldDbSNPFeature. 
- */ - @Override - public Class getAdaptableFeatureType() { return OldDbSNPFeature.class; } - - @Override - public VariantContext convert(String name, Object input, ReferenceContext ref) { - OldDbSNPFeature dbsnp = (OldDbSNPFeature)input; - - int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; - if ( index < 0 ) - return null; // we weren't given enough reference context to create the VariantContext - - final byte refBaseForIndel = ref.getBases()[index]; - final boolean refBaseIsDash = dbsnp.getNCBIRefBase().equals("-"); - - boolean addPaddingBase; - if ( isSNP(dbsnp) || isMNP(dbsnp) ) - addPaddingBase = false; - else if ( isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) - addPaddingBase = refBaseIsDash || GATKVariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); - else - return null; // can't handle anything else - - Allele refAllele; - if ( refBaseIsDash ) - refAllele = Allele.create(refBaseForIndel, true); - else if ( ! Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) ) - return null; - else - refAllele = Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + dbsnp.getNCBIRefBase(), true); - - final List alleles = new ArrayList(); - alleles.add(refAllele); - - // add all of the alt alleles - for ( String alt : getAlternateAlleleList(dbsnp) ) { - if ( Allele.wouldBeNullAllele(alt.getBytes())) - alt = ""; - else if ( ! Allele.acceptableAlleleBases(alt) ) - return null; - - alleles.add(Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + alt, false)); - } - - final VariantContextBuilder builder = new VariantContextBuilder(); - builder.source(name).id(dbsnp.getRsID()); - builder.loc(dbsnp.getChr(), dbsnp.getStart() - (addPaddingBase ? 1 : 0), dbsnp.getEnd() - (addPaddingBase && refAllele.length() == 1 ? 
1 : 0)); - builder.alleles(alleles); - return builder.make(); - } - - private static List stripNullDashes(final List alleles) { - final List newAlleles = new ArrayList(alleles.size()); - for ( final String allele : alleles ) { - if ( allele.equals("-") ) - newAlleles.add(""); - else - newAlleles.add(allele); - } - return newAlleles; - } - } - - // -------------------------------------------------------------------------------------------------------------- - // - // GELI to VariantContext - // - // -------------------------------------------------------------------------------------------------------------- - - private static class GeliTextAdaptor implements VCAdaptor { - /** - * Converts Geli text records to VariantContext. - * @return GeliTextFeature. - */ - @Override - public Class getAdaptableFeatureType() { return GeliTextFeature.class; } - - /** - * convert to a Variant Context, given: - * @param name the name of the ROD - * @param input the Rod object, in this case a RodGeliText - * @param ref the reference context - * @return a VariantContext object - */ - @Override - public VariantContext convert(String name, Object input, ReferenceContext ref) { - GeliTextFeature geli = (GeliTextFeature)input; - if ( ! Allele.acceptableAlleleBases(String.valueOf(geli.getRefBase())) ) - return null; - Allele refAllele = Allele.create(String.valueOf(geli.getRefBase()), true); - - // make sure we can convert it - if ( geli.getGenotype().isHet() || !geli.getGenotype().containsBase(geli.getRefBase())) { - // add the reference allele - List alleles = new ArrayList(); - List genotypeAlleles = new ArrayList(); - // add all of the alt alleles - for ( char alt : geli.getGenotype().toString().toCharArray() ) { - if ( ! 
Allele.acceptableAlleleBases(String.valueOf(alt)) ) { - return null; - } - Allele allele = Allele.create(String.valueOf(alt), false); - if (!alleles.contains(allele) && !refAllele.basesMatch(allele.getBases())) alleles.add(allele); - - // add the allele, first checking if it's reference or not - if (!refAllele.basesMatch(allele.getBases())) genotypeAlleles.add(allele); - else genotypeAlleles.add(refAllele); - } - - Map attributes = new HashMap(); - Collection genotypes = new ArrayList(); - Genotype call = GenotypeBuilder.create(name, genotypeAlleles); - - // add the call to the genotype list, and then use this list to create a VariantContext - genotypes.add(call); - alleles.add(refAllele); - GenomeLoc loc = ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart()); - return new VariantContextBuilder(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles).genotypes(genotypes).log10PError(-1 * geli.getLODBestToReference()).attributes(attributes).make(); - } else - return null; // can't handle anything else - } - } - - // -------------------------------------------------------------------------------------------------------------- - // - // HapMap to VariantContext - // - // -------------------------------------------------------------------------------------------------------------- - - private static class HapMapAdaptor implements VCAdaptor { - /** - * Converts HapMap records to VariantContext. - * @return HapMapFeature. 
- */ - @Override - public Class getAdaptableFeatureType() { return RawHapMapFeature.class; } - - /** - * convert to a Variant Context, given: - * @param name the name of the ROD - * @param input the Rod object, in this case a RodGeliText - * @param ref the reference context - * @return a VariantContext object - */ - @Override - public VariantContext convert(String name, Object input, ReferenceContext ref) { - if ( ref == null ) - throw new UnsupportedOperationException("Conversion from HapMap to VariantContext requires a reference context"); - - RawHapMapFeature hapmap = (RawHapMapFeature)input; - - int index = hapmap.getStart() - ref.getWindow().getStart(); - if ( index < 0 ) - return null; // we weren't given enough reference context to create the VariantContext - - HashSet alleles = new HashSet(); - Allele refSNPAllele = Allele.create(ref.getBase(), true); - int deletionLength = -1; - - Map alleleMap = hapmap.getActualAlleles(); - // use the actual alleles, if available - if ( alleleMap != null ) { - alleles.addAll(alleleMap.values()); - Allele deletionAllele = alleleMap.get(RawHapMapFeature.INSERTION); // yes, use insertion here (since we want the reference bases) - if ( deletionAllele != null && deletionAllele.isReference() ) - deletionLength = deletionAllele.length(); - } else { - // add the reference allele for SNPs - alleles.add(refSNPAllele); - } - - // make a mapping from sample to genotype - String[] samples = hapmap.getSampleIDs(); - String[] genotypeStrings = hapmap.getGenotypes(); - - GenotypesContext genotypes = GenotypesContext.create(samples.length); - for ( int i = 0; i < samples.length; i++ ) { - // ignore bad genotypes - if ( genotypeStrings[i].contains("N") ) - continue; - - String a1 = genotypeStrings[i].substring(0,1); - String a2 = genotypeStrings[i].substring(1); - ArrayList myAlleles = new ArrayList(2); - - // use the mapping to actual alleles, if available - if ( alleleMap != null ) { - myAlleles.add(alleleMap.get(a1)); - 
myAlleles.add(alleleMap.get(a2)); - } else { - // ignore indels (which we can't handle without knowing the alleles) - if ( genotypeStrings[i].contains("I") || genotypeStrings[i].contains("D") ) - continue; - - Allele allele1 = Allele.create(a1, refSNPAllele.basesMatch(a1)); - Allele allele2 = Allele.create(a2, refSNPAllele.basesMatch(a2)); - - myAlleles.add(allele1); - myAlleles.add(allele2); - alleles.add(allele1); - alleles.add(allele2); - } - - Genotype g = GenotypeBuilder.create(samples[i], myAlleles); - genotypes.add(g); - } - - long end = hapmap.getEnd(); - if ( deletionLength > 0 ) - end += (deletionLength - 1); - VariantContext vc = new VariantContextBuilder(name, hapmap.getChr(), hapmap.getStart(), end, alleles).id(hapmap.getName()).genotypes(genotypes).make(); - return vc; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/package-info.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/package-info.java deleted file mode 100644 index e9e9714cb..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/package-info.java +++ /dev/null @@ -1,26 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata; \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/FeatureManager.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/FeatureManager.java deleted file mode 100644 index d466f3f1e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/FeatureManager.java +++ /dev/null @@ -1,280 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.tracks; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import htsjdk.tribble.NameAwareCodec; -import org.broadinstitute.gatk.engine.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import htsjdk.variant.vcf.AbstractVCFCodec; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.help.GATKDocUtils; - -import java.io.File; -import java.util.*; - - -/** - * Class for managing Tribble Feature readers available to the GATK. The features - * are dynamically determined via a PluginManager. This class provides convenient - * getter methods for obtaining FeatureDescriptor objects that collect all of the - * useful information about the Tribble Codec, Feature, and name in one place. 
- * - * @author depristo - */ -public class FeatureManager { - public static class FeatureDescriptor implements Comparable { - final String name; - final FeatureCodec codec; - - public FeatureDescriptor(final String name, final FeatureCodec codec) { - this.name = name; - this.codec = codec; - } - - public String getName() { - return name; - } - public String getSimpleFeatureName() { return getFeatureClass().getSimpleName(); } - public FeatureCodec getCodec() { - return codec; - } - public Class getCodecClass() { return codec.getClass(); } - public Class getFeatureClass() { return codec.getFeatureType(); } - - @Override - public String toString() { - return String.format("FeatureDescriptor name=%s codec=%s feature=%s", - getName(), getCodecClass().getName(), getFeatureClass().getName()); - } - - @Override - public int compareTo(FeatureDescriptor o) { - return getName().compareTo(o.getName()); - } - } - - private final PluginManager pluginManager; - private final Collection featureDescriptors = new TreeSet(); - private final boolean lenientVCFProcessing; - - /** - * Construct a FeatureManager without a master VCF header - */ - public FeatureManager() { - this(false); - } - - public FeatureManager(final boolean lenientVCFProcessing) { - this.lenientVCFProcessing = lenientVCFProcessing; - pluginManager = new PluginManager(FeatureCodec.class, "Codecs", "Codec"); - - for (final String rawName: pluginManager.getPluginsByName().keySet()) { - FeatureCodec codec = pluginManager.createByName(rawName); - String name = rawName.toUpperCase(); - FeatureDescriptor featureDescriptor = new FeatureDescriptor(name, codec); - featureDescriptors.add(featureDescriptor); - } - } - - /** - * Return the FeatureDescriptor whose getCodecClass().equals(codecClass). 
- * - * @param codecClass - * @return A FeatureDescriptor or null if none is found - */ - @Requires("codecClass != null") - public FeatureDescriptor getByCodec(Class codecClass) { - for ( FeatureDescriptor descriptor : featureDescriptors ) - if ( descriptor.getCodecClass().equals(codecClass) ) - return descriptor; - return null; - } - - /** - * Returns a collection of FeatureDescriptors that emit records of type featureClass - * - * @param featureClass - * @return A FeatureDescriptor or null if none is found - */ - @Requires("featureClass != null") - public Collection getByFeature(Class featureClass) { - Set consistentDescriptors = new TreeSet(); - - if (featureClass == null) - throw new IllegalArgumentException("trackRecordType value is null, please pass in an actual class object"); - - for ( FeatureDescriptor descriptor : featureDescriptors ) { - if ( featureClass.isAssignableFrom(descriptor.getFeatureClass())) - consistentDescriptors.add(descriptor); - } - return consistentDescriptors; - } - - /** - * Return the FeatureDescriptor with getID().equals(name) - * - * @param name - * @return A FeatureDescriptor or null if none is found - */ - @Requires("name != null") - public FeatureDescriptor getByName(String name) { - for ( FeatureDescriptor descriptor : featureDescriptors ) - if ( descriptor.getName().equalsIgnoreCase(name) ) - return descriptor; - return null; - } - - /** - * Returns the FeatureDescriptor that can read the contexts of File file, is one can be determined - * - * @param file - * @return A FeatureDescriptor or null if none is found - */ - @Requires({"file != null", "file.isFile()", "file.canRead()"}) - public FeatureDescriptor getByFiletype(File file) { - List canParse = new ArrayList(); - for ( FeatureDescriptor descriptor : featureDescriptors ) - if ( descriptor.getCodec().canDecode(file.getPath()) ) { - canParse.add(descriptor); - } - - if ( canParse.size() == 0 ) - return null; - else if ( canParse.size() > 1 ) - throw new 
ReviewedGATKException("BUG: multiple feature descriptors can read file " + file + ": " + canParse); - else - return canParse.get(0); - } - - /** - * Returns the FeatureDescriptor associated with the type described by triplet, or null if none is found - * @param triplet - * @return - */ - @Requires("triplet != null") - public FeatureDescriptor getByTriplet(RMDTriplet triplet) { - return getByName(triplet.getType()); - } - - /** - * @return all of the FeatureDescriptors available to the GATK. Never null - */ - @Ensures("result != null") - public Collection getFeatureDescriptors() { - return Collections.unmodifiableCollection(featureDescriptors); - } - - - /** - * Returns a list of the available tribble track names (vcf,dbsnp,etc) that we can load - * @return - */ - @Ensures("result != null") - public String userFriendlyListOfAvailableFeatures() { - return userFriendlyListOfAvailableFeatures(Feature.class); - } - - /** - * Returns a list of the available tribble track names (vcf,dbsnp,etc) that we can load - * restricted to only Codecs producting Features consistent with the requiredFeatureType - * @return - */ - @Ensures("result != null") - public String userFriendlyListOfAvailableFeatures(Class requiredFeatureType) { - final String nameHeader="Name", featureHeader = "FeatureType", docHeader="Documentation"; - - int maxNameLen = nameHeader.length(), maxFeatureNameLen = featureHeader.length(); - for ( final FeatureDescriptor descriptor : featureDescriptors ) { - if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) { - maxNameLen = Math.max(maxNameLen, descriptor.getName().length()); - maxFeatureNameLen = Math.max(maxFeatureNameLen, descriptor.getSimpleFeatureName().length()); - } - } - - StringBuilder docs = new StringBuilder(); - String format = "%" + maxNameLen + "s %" + maxFeatureNameLen + "s %s%n"; - docs.append(String.format(format, nameHeader, featureHeader, docHeader)); - for ( final FeatureDescriptor descriptor : featureDescriptors ) { - 
if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) { - final String DocURL = GATKDocUtils.helpLinksToGATKDocs(descriptor.getCodecClass()); - final String oneDoc; - if ( DocURL.contains("_sting_") ) { - oneDoc = String.format(format, - descriptor.getName(), - descriptor.getSimpleFeatureName(), - DocURL); - } else { - oneDoc = String.format(format, - descriptor.getName(), - descriptor.getSimpleFeatureName(), - "(this is an external codec and is not documented within GATK)"); - } - - docs.append(oneDoc); - } - } - - return docs.toString(); - } - - /** - * Create a new FeatureCodec of the type described in descriptor, assigning it the - * name (if possible) and providing it the genomeLocParser (where necessary) - * - * @param descriptor FeatureDescriptor of the Tribble FeatureCodec we want to create - * @param name the name to assign this codec - * @param genomeLocParser GenomeLocParser for ReferenceDependentFeatureCodecs - * @param remappedSampleName replacement sample name for single-sample vcfs, or null if we're not performing - * sample name remapping - * @return the feature codec itself - */ - @Requires({"descriptor != null", "name != null", "genomeLocParser != null"}) - @Ensures("result != null") - public FeatureCodec createCodec(final FeatureDescriptor descriptor, final String name, final GenomeLocParser genomeLocParser, - final String remappedSampleName) { - FeatureCodec codex = pluginManager.createByType(descriptor.getCodecClass()); - if ( codex instanceof NameAwareCodec ) - ((NameAwareCodec)codex).setName(name); - if ( codex instanceof ReferenceDependentFeatureCodec ) - ((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser); - if ( codex instanceof AbstractVCFCodec ) { - if ( lenientVCFProcessing ) { - ((AbstractVCFCodec)codex).disableOnTheFlyModifications(); - } - if ( remappedSampleName != null ) { - ((AbstractVCFCodec)codex).setRemappedSampleName(remappedSampleName); - } - } - - return codex; - } -} diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/IndexDictionaryUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/IndexDictionaryUtils.java deleted file mode 100644 index 5c18d3a8e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/IndexDictionaryUtils.java +++ /dev/null @@ -1,114 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.refdata.tracks; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; -import htsjdk.tribble.index.Index; -import htsjdk.tribble.index.MutableIndex; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.utils.SequenceDictionaryUtils; - -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - -/** - * Utilities for working with Sequence Dictionaries embedded in tribble indices - * - * @author Your Name - * @since Date created - */ -public class IndexDictionaryUtils { - private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class); - - // a constant we use for marking sequence dictionary entries in the Tribble index property list - public static final String SequenceDictionaryPropertyPredicate = "DICT:"; - - /** - * get the sequence dictionary from the track, if available. 
If not, make it from the contig list that is always in the index - * @param index the index file to use - * @return a SAMSequenceDictionary if available, null if unavailable - */ - public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - for (Map.Entry entry : index.getProperties().entrySet()) { - if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) - dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), - Integer.valueOf(entry.getValue()))); - } - return dict; - } - - /** - * create the sequence dictionary with the contig list; a backup approach - * @param index the index file to use - * @param dict the sequence dictionary to add contigs to - * @return the filled-in sequence dictionary - */ - static SAMSequenceDictionary createSequenceDictionaryFromContigList(final Index index, final SAMSequenceDictionary dict) { - final List seqNames = index.getSequenceNames(); - if (seqNames == null) { - return dict; - } - for (final String name : seqNames) { - SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); - dict.addSequence(seq); - } - return dict; - } - - /** - * Sets the sequence dictionary of the given index. THE INDEX MUST BE MUTABLE (i.e. not Tabix). 
- * - * @param index the (mutable) index file to use - * @param dict the dictionary to use - */ - public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { - for ( SAMSequenceRecord seq : dict.getSequences() ) { - final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName(); - final String length = String.valueOf(seq.getSequenceLength()); - ((MutableIndex)index).addProperty(contig, length); - } - } - - public static void validateTrackSequenceDictionary(final String trackName, - final SAMSequenceDictionary trackDict, - final SAMSequenceDictionary referenceDict, - final ValidationExclusion.TYPE validationExclusionType ) { - // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation - if (trackDict == null || trackDict.size() == 0) - logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); - else { - Set trackSequences = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) - trackSequences.add(dictionaryEntry.getSequenceName()); - SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict, false, null); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrack.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrack.java deleted file mode 100644 index 51cb8f443..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrack.java +++ /dev/null @@ -1,147 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation 
the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.tracks; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.util.CloseableIterator; -import org.apache.log4j.Logger; -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.CloseableTribbleIterator; -import htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import org.broadinstitute.gatk.engine.refdata.utils.FeatureToGATKFeatureIterator; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.IOException; - - -/** - * @author aaron - *

- * Class RMDTrack - *

- * the basics of what a reference metadata track must contain. - */ -public class RMDTrack { - private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); - - // the basics of a track: - private final Class type; // our type - private final String name; // the name - private final File file; // the associated file we create the reader from - - // our feature reader - allows queries - private AbstractFeatureReader reader; - - // our sequence dictionary, which can be null - private final SAMSequenceDictionary dictionary; - - /** - * Parser to use when creating/parsing GenomeLocs. - */ - private final GenomeLocParser genomeLocParser; - - // our codec type - private final FeatureCodec codec; - - public Class getType() { - return type; - } - - public String getName() { - return name; - } - - public File getFile() { - return file; - } - - /** - * Create a track - * - * @param type the type of track, used for track lookup - * @param name the name of this specific track - * @param file the associated file, for reference or recreating the reader - * @param reader the feature reader to use as the underlying data source - * @param dict the sam sequence dictionary - * @param codec the feature codec we use to decode this type - */ - public RMDTrack(Class type, String name, File file, AbstractFeatureReader reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) { - this.type = type; - this.name = name; - this.file = file; - this.reader = reader; - this.dictionary = dict; - this.genomeLocParser = genomeLocParser; - this.codec = codec; - } - - /** - * @return how to get an iterator of the underlying data. 
This is all a track has to support, - * but other more advanced tracks support the query interface - */ - public CloseableIterator getIterator() { - try { - return new FeatureToGATKFeatureIterator(genomeLocParser,reader.iterator(),this.getName()); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(getFile(), "Unable to read from file", e); - } - } - - public CloseableIterator query(GenomeLoc interval) throws IOException { - CloseableTribbleIterator iter = reader.query(interval.getContig(),interval.getStart(),interval.getStop()); - return new FeatureToGATKFeatureIterator(genomeLocParser, iter, this.getName()); - } - - public void close() { - try { - reader.close(); - } catch (IOException e) { - throw new UserException.MalformedFile("Unable to close reader " + reader.toString(),e); - } - reader = null; - } - - /** - * get the sequence dictionary from the track, if available - * @return a SAMSequenceDictionary if available, null if unavailable - */ - public SAMSequenceDictionary getSequenceDictionary() { - return dictionary; - } - - public Object getHeader() { - return reader.getHeader(); - } - - public FeatureCodec getCodec() { - return codec; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrackBuilder.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrackBuilder.java deleted file mode 100644 index dc9e96728..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrackBuilder.java +++ /dev/null @@ -1,430 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, 
sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.tracks; - -import htsjdk.samtools.SAMSequenceDictionary; -import org.apache.log4j.Logger; -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.FeatureCodec; -import htsjdk.tribble.Tribble; -import htsjdk.tribble.TribbleException; -import htsjdk.tribble.index.Index; -import htsjdk.tribble.index.IndexFactory; -import htsjdk.tribble.util.LittleEndianOutputStream; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet.RMDStorageType; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.file.FSLockWithShared; -import org.broadinstitute.gatk.utils.instrumentation.Sizeof; 
- -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.Map; - - -/** - * - * @author aaron - * ` - * Class RMDTrackBuilder - * - * This class keeps track of the available codecs, and knows how to put together a track of - * that gets iterators from the FeatureReader using Tribble. - * - */ -public class RMDTrackBuilder { // extends PluginManager { - /** - * our log, which we use to capture anything from this class - */ - private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); - - // private sequence dictionary we use to set our tracks with - private final SAMSequenceDictionary dict; - - /** - * Private genome loc parser to use when building out new locs. - */ - private final GenomeLocParser genomeLocParser; - - /** - * Validation exclusions, for validating the sequence dictionary. - */ - private ValidationExclusion.TYPE validationExclusionType; - - private final FeatureManager featureManager; - - // If true, do not attempt to create index files if they don't exist or are outdated, and don't - // make any file lock acquisition calls on the index files. - private final boolean disableAutoIndexCreation; - - // Map of file name -> new sample name used when performing on-the-fly sample renaming - private final Map sampleRenameMap; - - /** - * Construct an RMDTrackerBuilder, allowing the user to define tracks to build after-the-fact. This is generally - * used when walkers want to directly manage the ROD system for whatever reason. Before using this constructor, - * please talk through your approach with the SE team. - * @param dict Sequence dictionary to use. - * @param genomeLocParser Location parser to use. - * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification. - * @param disableAutoIndexCreation Do not auto-create index files, and do not use file locking when accessing index files. 
- * UNSAFE in general (because it causes us not to lock index files before reading them) -- - * suitable only for test suite use. - * @param sampleRenameMap Map of file name -> new sample name used when performing on-the-fly sample renaming - */ - public RMDTrackBuilder(final SAMSequenceDictionary dict, - final GenomeLocParser genomeLocParser, - final ValidationExclusion.TYPE validationExclusionType, - final boolean disableAutoIndexCreation, - final Map sampleRenameMap) { - this.dict = dict; - this.validationExclusionType = validationExclusionType; - this.genomeLocParser = genomeLocParser; - this.featureManager = new FeatureManager(GenomeAnalysisEngine.lenientVCFProcessing(validationExclusionType)); - this.disableAutoIndexCreation = disableAutoIndexCreation; - this.sampleRenameMap = sampleRenameMap; - } - - /** - * Return the feature manager this RMDTrackBuilder is using the create tribble tracks - * - * @return - */ - public FeatureManager getFeatureManager() { - return featureManager; - } - - /** - * create a RMDTrack of the specified type - * - * @param fileDescriptor a description of the type of track to build. 
- * - * @return an instance of the track - */ - public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { - String name = fileDescriptor.getName(); - File inputFile = new File(fileDescriptor.getFile()); - - FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByTriplet(fileDescriptor); - if (descriptor == null) - throw new UserException.BadArgumentValue("-B",fileDescriptor.getType()); - - // return a feature reader track - Pair pair; - if (VCFWriterArgumentTypeDescriptor.isCompressed(inputFile.toString())) - pair = createTabixIndexedFeatureSource(descriptor, name, inputFile); - else - pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType()); - if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); - return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name, inputFile)); - } - - /** - * Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream. - * @param codecClass Type of Tribble codec class to build. - * @param inputFile Input file type to use. - * @return An RMDTrack, suitable for accessing reference metadata. - */ - public RMDTrack createInstanceOfTrack(Class codecClass, File inputFile) { - final FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByCodec(codecClass); - - if (descriptor == null) - throw new ReviewedGATKException("Unable to find type name for codec class " + codecClass.getName()); - - return createInstanceOfTrack(new RMDTriplet("anonymous",descriptor.getName(),inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags())); - } - - /** - * create a feature reader, without assuming there exists an index. This code assumes the feature - * reader of the appropriate type will figure out what the right index type is, and determine if it - * exists. 
- * - * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create - * @param name the name of the track - * @param inputFile the file to load - * @return a feature reader implementation - */ - private Pair createTabixIndexedFeatureSource(FeatureManager.FeatureDescriptor descriptor, String name, File inputFile) { - // we might not know the index type, try loading with the default reader constructor - logger.debug("Attempting to load " + inputFile + " as a tabix indexed file without validating it"); - try { - // getFeatureReader will detect that it's Tabix - return new Pair<>(AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name, inputFile)), null); - } catch (TribbleException e) { - throw new UserException(e.getMessage(), e); - } - } - - /** - * add a name to the codec, if it takes one - * @param descriptor the class to create a codec for - * @param name the name to assign this codec - * @param inputFile input file that we will be decoding - * @return the feature codec itself - */ - private FeatureCodec createCodec(final FeatureManager.FeatureDescriptor descriptor, final String name, final File inputFile) { - // The remappedSampleName will be null if either no on-the-fly sample renaming was requested, - // or the user's sample rename map file didn't contain an entry for this file: - final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(inputFile.getAbsolutePath()) : null; - - return featureManager.createCodec(descriptor, name, genomeLocParser, remappedSampleName); - } - - /** - * create a feature source object given: - * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create - * @param name the name of the codec - * @param inputFile the tribble file to parse - * @param storageType How the RMD is streamed into the input file. 
- * @return the input file as a FeatureReader - */ - private Pair getFeatureSource(FeatureManager.FeatureDescriptor descriptor, - String name, - File inputFile, - RMDStorageType storageType) { - // Feature source and sequence dictionary to use as the ultimate reference - AbstractFeatureReader featureSource = null; - SAMSequenceDictionary sequenceDictionary = null; - - // Detect whether or not this source should be indexed. - boolean canBeIndexed = (storageType == RMDStorageType.FILE); - - if(canBeIndexed) { - try { - Index index = loadIndex(inputFile, createCodec(descriptor, name, inputFile)); - try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } - catch (ReviewedGATKException e) { } - - sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); - - // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match - if (sequenceDictionary.size() == 0 && dict != null) { - validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); - - if ( ! 
disableAutoIndexCreation ) { - File indexFile = Tribble.indexFile(inputFile); - try { // re-write the index - writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); - } catch (IOException e) { - logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not affect your run of the GATK"); - } - } - - sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); - } - - featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name, inputFile), index); - } - catch (TribbleException e) { - throw new UserException(e.getMessage()); - } - catch (IOException e) { - throw new UserException("I/O error loading or writing tribble index file for " + inputFile.getAbsolutePath(), e); - } - } - else { - featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name, inputFile), false); - } - - return new Pair(featureSource,sequenceDictionary); - } - - /** - * create an index for the input file - * @param inputFile the input file - * @param codec the codec to use - * @return a linear index for the specified type - * @throws IOException if we cannot write the index file - */ - public synchronized Index loadIndex( final File inputFile, final FeatureCodec codec) throws IOException { - final File indexFile = Tribble.indexFile(inputFile); - final FSLockWithShared lock = new FSLockWithShared(indexFile); - Index idx = null; - - // If the index file exists and is readable, attempt to load it from disk. We'll get null back - // if a problem was discovered with the index file when it was inspected, and we'll get an - // in-memory index back in the case where the index file could not be locked. - if (indexFile.canRead()) { - idx = disableAutoIndexCreation ? 
loadFromDisk(inputFile, indexFile) // load without locking if we're in disableAutoIndexCreation mode - : attemptToLockAndLoadIndexFromDisk(inputFile, codec, indexFile, lock); - } - - // If we have an index, it means we either loaded it from disk without issue or we created an in-memory - // index due to not being able to acquire a lock. - if (idx != null) return idx; - - // We couldn't read the file, or we discovered a problem with the index file, so continue on to making a new index - idx = createIndexInMemory(inputFile, codec); - if ( ! disableAutoIndexCreation ) { - writeIndexToDisk(idx, indexFile, lock); - } - return idx; - } - - /** - * Attempt to acquire a shared lock and then load the index from disk. Returns an in-memory index if - * a lock could not be obtained. Returns null if a problem was discovered with the index file when it - * was examined (eg., it was out-of-date). - * - * @param inputFile the input file - * @param codec the codec to read from - * @param indexFile the index file itself - * @param lock the lock file - * @return an index, or null if we couldn't load one - * @throws IOException if we fail for FS issues - */ - protected Index attemptToLockAndLoadIndexFromDisk( final File inputFile, final FeatureCodec codec, final File indexFile, final FSLockWithShared lock ) throws IOException { - boolean locked = false; - Index idx = null; - - try { - locked = lock.sharedLock(); - - if ( ! 
locked ) { // can't lock file - logger.info(String.format("Could not acquire a shared lock on index file %s, falling back to using an in-memory index for this GATK run.", - indexFile.getAbsolutePath())); - idx = createIndexInMemory(inputFile, codec); - } - else { - idx = loadFromDisk(inputFile, indexFile); - } - } finally { - if (locked) lock.unlock(); - } - return idx; - } - - /** - * load the index from disk, checking for out of date indexes and old versions (both of which are deleted) - * @param inputFile the input file - * @param indexFile the input file, plus the index extension - * @return an Index, or null if we're unable to load - */ - protected Index loadFromDisk( final File inputFile, final File indexFile ) { - logger.debug("Loading Tribble index from disk for file " + inputFile); - Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath()); - - // check if the file is up-to date (filestamp and version check) - if (index.isCurrentVersion() && indexFile.lastModified() >= inputFile.lastModified()) - return index; - else if (indexFile.lastModified() < inputFile.lastModified()) - logger.warn("Index file " + indexFile + " is out of date (index older than input file), " + - (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); - else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable - logger.warn("Index file " + indexFile + " is out of date (old version), " + - (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); - - if ( ! 
disableAutoIndexCreation ) { - boolean deleted = indexFile.delete(); - if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)"); - } - - return null; - } - - - /** - * attempt to write the index to disk - * @param index the index to write to disk - * @param indexFile the index file location - * @param lock the locking object - * @throws IOException when unable to create the new index - */ - private void writeIndexToDisk( final Index index, final File indexFile, final FSLockWithShared lock ) throws IOException { - if ( disableAutoIndexCreation ) { - return; - } - - boolean locked = false; - - try { - locked = lock.exclusiveLock(); - - if (locked) { - logger.info("Writing Tribble index to disk for file " + indexFile); - LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); - index.write(stream); - stream.close(); - } - else // we can't write it to disk, just store it in memory, tell them this - logger.warn("Unable to write to " + indexFile + " for the index file, creating index in memory only"); - - try { logger.info(String.format(" Index for %s has size in bytes %d", indexFile, Sizeof.getObjectGraphSize(index))); } - catch ( ReviewedGATKException e) { } - } - finally { - if (locked) lock.unlock(); - } - - } - - /** - * create the index in memory, given the input file and feature codec - * @param inputFile the input file - * @param codec the codec - * @return a LinearIndex, given the file location - * @throws IOException when unable to create the index in memory - */ - protected Index createIndexInMemory(File inputFile, FeatureCodec codec) { - // this can take a while, let them know what we're doing - logger.debug("Creating Tribble index in memory for file " + inputFile); - Index idx = IndexFactory.createDynamicIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - 
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); - return idx; - } - - /** - * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. - * (that each contig in the index is in the sequence dictionary). - * @param inputFile for proper error message formatting. - * @param dict the sequence dictionary - * @param index the index file - */ - public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) { - if (dict == null) throw new ReviewedGATKException("BUG: dict cannot be null"); - - // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set - final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); - validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict); - - // actually update the dictionary in the index - IndexDictionaryUtils.setIndexSequenceDictionary(index, dict); - } - - public void validateTrackSequenceDictionary(final String trackName, - final SAMSequenceDictionary trackDict, - final SAMSequenceDictionary referenceDict ) { - IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FeatureToGATKFeatureIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FeatureToGATKFeatureIterator.java deleted file mode 100644 index 6fb073e12..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FeatureToGATKFeatureIterator.java +++ /dev/null @@ -1,74 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and 
associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.tribble.CloseableTribbleIterator; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.GenomeLocParser; - - -/** - * - * @author aaron - * - * Class FeatureToGATKFeatureIterator - * - * a wrapper on Tribble feature iterators so that they produce GATKFeatures (which produce GenomeLocs) - */ -public class FeatureToGATKFeatureIterator implements CloseableIterator { - private final GenomeLocParser genomeLocParser; - private final CloseableTribbleIterator iterator; - private final String name; - - public FeatureToGATKFeatureIterator(GenomeLocParser genomeLocParser,CloseableTribbleIterator iter, String name) { - this.genomeLocParser = genomeLocParser; - this.name = name; - this.iterator = iter; - } - - @Override - public boolean hasNext() { - return iterator.hasNext(); - } - - @Override - public GATKFeature next() { - return new 
GATKFeature.TribbleGATKFeature(genomeLocParser,iterator.next(),name); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Why does Iterator have this method? We always throw an exception here"); - } - - @Override - public void close() { - // The private adapted iterator may not be passed on by the method constructing this object, - // leaving only this adapter to close the wrapped iterator. - iterator.close(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FlashBackIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FlashBackIterator.java deleted file mode 100644 index 8fc549c00..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/FlashBackIterator.java +++ /dev/null @@ -1,221 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.samtools.SAMSequenceDictionary; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.HasGenomeLocation; - -import java.util.Comparator; -import java.util.LinkedList; - - -/** - * - * @author aaron - * - * Class FlashBackIterator - * - * better than acid washed jeans...more like a Delorean that flies through time - * - * This iterator buffers a certain amount of ROD data to 'flash back' to. This - * is needed for using ROD's in read traversals, because between shards we sometimes - * (actually often) need to go back to before the current iterators location and - * get RODs that overlap the current read. - */ -public class FlashBackIterator implements LocationAwareSeekableRODIterator { - private LocationAwareSeekableRODIterator iterator; - private LinkedList pastQueue = new LinkedList(); - private LinkedList aheadQueue = new LinkedList(); - private int MAX_QUEUE = 200; - - /** - * create a flashback iterator - * @param iterator given a LocationAwareSeekableRODIterator - */ - public FlashBackIterator(LocationAwareSeekableRODIterator iterator) { - this.iterator = iterator; - } - - /** - * Gets the header associated with the backing input stream. - * @return the ROD header. - */ - @Override - public Object getHeader() { - return iterator.getHeader(); - } - - /** - * Gets the sequence dictionary associated with the backing input stream. - * @return sequence dictionary from the ROD header. 
- */ - @Override - public SAMSequenceDictionary getSequenceDictionary() { - return iterator.getSequenceDictionary(); - } - - - /** - * peek at the next location - * @return - */ - @Override - public GenomeLoc peekNextLocation() { - return (aheadQueue.size() > 0) ? aheadQueue.getFirst().getLocation() : iterator.peekNextLocation(); - } - - /** - * get the position of this iterator - * @return - */ - @Override - public GenomeLoc position() { - return (aheadQueue.size() > 0) ? aheadQueue.getFirst().getLocation() : iterator.position(); - } - - /** - * seek forward on the iterator - * @param interval the interval to seek to - * @return a RODRecordList at that location, null otherwise - */ - @Override - public RODRecordList seekForward(GenomeLoc interval) { - - RODRecordList lt = iterator.seekForward(interval); - createPastRecord(lt); - return lt; - } - - /** - * do we have a next record - * @return true if we have another record - */ - @Override - public boolean hasNext() { - return (aheadQueue.size() > 0 || iterator.hasNext()); - } - - /** - * get the next record - * @return a RODRecordList - */ - @Override - public RODRecordList next() { - return getNext(); - } - - /** - * we don't support remove - */ - @Override - public void remove() { - throw new UnsupportedOperationException("We don't support remove"); - } - - /** - * get the next record, either from the queue or from the iterator - * @return a RODRecordList - */ - private RODRecordList getNext() { - if (aheadQueue.size() > 0) { - RODRecordList ret = aheadQueue.getFirst().getList(); - aheadQueue.removeFirst(); - return ret; - } else { - RODRecordList ret = iterator.next(); - createPastRecord(ret); - return ret; - } - } - - private void createPastRecord(RODRecordList ret) { - ComparableList rec = new ComparableList(ret); - if (rec.getLocation() != null) pastQueue.addLast(new ComparableList(ret)); - if (pastQueue.size() > this.MAX_QUEUE) pastQueue.removeFirst(); - } - - /** - * can we flash back to the specified 
location? - * - * @param location the location to try and flash back to - * - * @return true if we can, false otherwise - */ - public boolean canFlashBackTo(GenomeLoc location) { - GenomeLoc farthestBack = (pastQueue.size() > 0) ? pastQueue.getFirst().getLocation() : iterator.peekNextLocation(); - return (!farthestBack.isPast(location)); - } - - /** - * flashback! Throws an unsupported operation exception - * - * @param location where to flash back to - */ - public void flashBackTo(GenomeLoc location) { - if (!canFlashBackTo(location)) throw new UnsupportedOperationException("we can't flash back to " + location); - if (pastQueue.size()==0) return; // the iterator can do it alone - while (pastQueue.size() > 0 && !pastQueue.getLast().getLocation().isBefore(location)) { - aheadQueue.addFirst(pastQueue.getLast()); - pastQueue.removeLast(); - } - } - - public void close() { - this.aheadQueue.clear(); - this.pastQueue.clear(); - } -} - -/** - * a list that buffers the location for this rod - */ -class ComparableList implements Comparator, HasGenomeLocation { - private RODRecordList list; - private GenomeLoc location = null; - public ComparableList(RODRecordList list) { - this.list = list; - if (list != null && list.size() != 0) - location = list.getLocation(); - } - - @Override - public int compare(ComparableList list1, ComparableList list2) { - if (list1.location == null && list2.location == null) - return 0; - if (list1.location == null) return 1; - if (list2.location == null) return -1; - return (list1.location.compareTo(list2.location)); - } - - public GenomeLoc getLocation() { - return location; - } - - public RODRecordList getList() { - return list; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/GATKFeature.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/GATKFeature.java deleted file mode 100644 index 4d08f1bca..000000000 --- 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/GATKFeature.java +++ /dev/null @@ -1,109 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.engine.refdata.ReferenceOrderedDatum; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.HasGenomeLocation; - - -/** - * - * @author aaron - * - * Class GATKFeature - * - * This wraps a Tribble feature or a RODatum so that both present the same interface: a genome loc for position and a - * way of retrieving the track name. 
- */ -public abstract class GATKFeature implements Feature, HasGenomeLocation { - - public GATKFeature(String name) { - this.name = name; - } - - String name; - - protected void setName(String name) { - this.name = name; - } - - public String getName() { - return name; - } - - public abstract GenomeLoc getLocation(); - - // TODO: this should be a Feature - public abstract Object getUnderlyingObject(); - - /** - * wrapping a Tribble feature in a GATK friendly interface - */ - public static class TribbleGATKFeature extends GATKFeature { - private final GenomeLocParser genomeLocParser; - private final Feature feature; - private GenomeLoc position = null; - - public TribbleGATKFeature(GenomeLocParser genomeLocParser,Feature f, String name) { - super(name); - this.genomeLocParser = genomeLocParser; - feature = f; - } - public GenomeLoc getLocation() { - if (position == null) position = genomeLocParser.createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd()); - return position; - } - - /** Return the features reference sequence name, e.g chromosome or contig */ - @Override - public String getChr() { - return feature.getChr(); - } - - /** Return the start position in 1-based coordinates (first base is 1) */ - @Override - public int getStart() { - return feature.getStart(); - } - - /** - * Return the end position following 1-based fully closed conventions. 
The length of a feature is - * end - start + 1; - */ - @Override - public int getEnd() { - return feature.getEnd(); - } - - // TODO: this should be a Feature, actually - public Object getUnderlyingObject() { - return feature; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/LocationAwareSeekableRODIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/LocationAwareSeekableRODIterator.java deleted file mode 100644 index 96c60b9d8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/LocationAwareSeekableRODIterator.java +++ /dev/null @@ -1,49 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; - -/** - * @author aaron - *

- * Interface LocationAwareSeekableRODIterator - *

- * combine iteration with a position aware interface - */ -public interface LocationAwareSeekableRODIterator extends CloseableIterator { - public Object getHeader(); - - public SAMSequenceDictionary getSequenceDictionary(); - - public GenomeLoc peekNextLocation(); - - public GenomeLoc position(); - - public RODRecordList seekForward(GenomeLoc interval); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RMDTriplet.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RMDTriplet.java deleted file mode 100644 index 9fa3d1e11..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RMDTriplet.java +++ /dev/null @@ -1,92 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - - -import org.broadinstitute.gatk.utils.commandline.Tags; - -/** - * a helper class to manage our triplets of data for the -B command line option (name, type, file) - * TODO: The presence of four datapoints here suggests that this class' name isn't sufficient to describe its function. Rename. - */ -public class RMDTriplet { - public enum RMDStorageType { FILE, STREAM }; - - private final String name; - private final String type; - private final String file; - private final RMDStorageType storageType; - private final Tags tags; - - public RMDTriplet(final String name, final String type, final String file, final RMDStorageType storageType, final Tags tags) { - this.name = name; - this.type = type; - this.file = file; - this.storageType = storageType; - this.tags = tags; - } - - /** - * Gets the name of this track. RefMetaDataTrackers can use this identifier to retrieve data of a certain type. - * @return Name associated with this track. - */ - public String getName() { - return name; - } - - /** - * Gets the type of this track. Informs the GATK how to parse this file type. - * @return Type associated with this track. - */ - public String getType() { - return type; - } - - /** - * Gets the filename representing this track. Data is loaded from this file. - * @return Filename of the RMD. - */ - public String getFile() { - return file; - } - - /** - * The type of storage being used for this metadata track. Right now, can be either a - * file type (can be indexed) or a stream type (can't be indexed). - * @return Storage type for this RMD 'triplet'. - */ - public RMDStorageType getStorageType() { - return storageType; - } - - /** - * Gets the key=value tags associated with this track - * @return Tags associated with this track. 
- */ - public Tags getTags() { - return tags; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RODRecordList.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RODRecordList.java deleted file mode 100644 index b859edc10..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/refdata/utils/RODRecordList.java +++ /dev/null @@ -1,45 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.HasGenomeLocation; - -import java.util.List; - - -/** - * @author aaron - *

- * Class RODRecordList - *

- * make the RODRecord list an interface, so we can stub in other implementations - * during testing. - */ -public interface RODRecordList extends List, Comparable, HasGenomeLocation { - public GenomeLoc getLocation(); - public String getName(); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReport.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReport.java deleted file mode 100644 index 660ea95c1..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReport.java +++ /dev/null @@ -1,376 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.report; - -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.*; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - -/** - * Container class for GATK report tables - */ -public class GATKReport { - public static final String GATKREPORT_HEADER_PREFIX = "#:GATKReport."; - public static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V1_1; - private static final String SEPARATOR = ":"; - private GATKReportVersion version = LATEST_REPORT_VERSION; - - private final TreeMap tables = new TreeMap(); - - /** - * Create a new, empty GATKReport. - */ - public GATKReport() { - } - - /** - * Create a new GATKReport with the contents of a GATKReport on disk. - * - * @param filename the path to the file to load - */ - public GATKReport(String filename) { - this(new File(filename)); - } - - /** - * Create a new GATKReport with the contents of a GATKReport on disk. - * - * @param file the file to load - */ - public GATKReport(File file) { - loadReport(file); - } - - /** - * Create a new GATK report from GATK report tables - * @param tables Any number of tables that you want to add to the report - */ - public GATKReport(GATKReportTable... tables) { - for( GATKReportTable table: tables) - addTable(table); - } - - /** - * Load a GATKReport file from disk - * - * @param file the file to load - */ - private void loadReport(File file) { - BufferedReader reader; - String reportHeader; - try { - reader = new BufferedReader(new FileReader(file)); - reportHeader = reader.readLine(); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(file, "it does not exist"); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(file, e); - } - - - // Read the first line for the version and number of tables. 
- version = GATKReportVersion.fromHeader(reportHeader); - if (version.equals(GATKReportVersion.V0_1) || - version.equals(GATKReportVersion.V0_2)) - throw new UserException("The GATK no longer supports reading legacy GATK Reports. Please use v1.0 or newer."); - - int nTables = Integer.parseInt(reportHeader.split(":")[2]); - - // Read each table according ot the number of tables - for (int i = 0; i < nTables; i++) { - addTable(new GATKReportTable(reader, version)); - } - } - - /** - * Add a new, empty table to the report - * - * @param tableName the name of the table - * @param tableDescription the description of the table - * @param numColumns the number of columns in this table - */ - public void addTable(final String tableName, final String tableDescription, final int numColumns) { - addTable(tableName, tableDescription, numColumns, GATKReportTable.TableSortingWay.DO_NOT_SORT); - } - - /** - * Add a new, empty table to the report - * - * @param tableName the name of the table - * @param tableDescription the description of the table - * @param numColumns the number of columns in this table - * @param sortingWay way to sort table - */ - public void addTable(final String tableName, final String tableDescription, final int numColumns, final GATKReportTable.TableSortingWay sortingWay) { - GATKReportTable table = new GATKReportTable(tableName, tableDescription, numColumns, sortingWay); - tables.put(tableName, table); - } - - /** - * Adds a table, empty or populated, to the report - * - * @param table the table to add - */ - public void addTable(GATKReportTable table) { - tables.put(table.getTableName(), table); - } - - public void addTables(List gatkReportTableV2s) { - for ( GATKReportTable table : gatkReportTableV2s ) - addTable(table); - } - - /** - * Return true if table with a given name exists - * - * @param tableName the name of the table - * @return true if the table exists, false otherwise - */ - public boolean hasTable(String tableName) { - return 
tables.containsKey(tableName); - } - - /** - * Return a table with a given name - * - * @param tableName the name of the table - * @return the table object - */ - public GATKReportTable getTable(String tableName) { - GATKReportTable table = tables.get(tableName); - if (table == null) - throw new ReviewedGATKException("Table is not in GATKReport: " + tableName); - return table; - } - - /** - * Print all tables contained within this container to a PrintStream - * - * @param out the PrintStream to which the tables should be written - */ - public void print(PrintStream out) { - out.println(GATKREPORT_HEADER_PREFIX + getVersion().toString() + SEPARATOR + getTables().size()); - for (GATKReportTable table : tables.values()) - table.write(out); - } - - public Collection getTables() { - return tables.values(); - } - - /** - * This is the main function is charge of gathering the reports. It checks that the reports are compatible and then - * calls the table gathering functions. - * - * @param input another GATKReport of the same format - */ - public void concat(GATKReport input) { - - if ( !isSameFormat(input) ) { - throw new ReviewedGATKException("Failed to combine GATKReport, format doesn't match!"); - } - - for ( Map.Entry table : tables.entrySet() ) { - table.getValue().concat(input.getTable(table.getKey())); - } - } - - public GATKReportVersion getVersion() { - return version; - } - - /** - * Returns whether or not the two reports have the same format, from columns, to tables, to reports, and everything - * in between. This does not check if the data inside is the same. This is the check to see if the two reports are - * gatherable or reduceable. 
- * - * @param report another GATK report - * @return true if the the reports are gatherable - */ - public boolean isSameFormat(GATKReport report) { - if (!version.equals(report.version)) { - return false; - } - if (!tables.keySet().equals(report.tables.keySet())) { - return false; - } - for (String tableName : tables.keySet()) { - if (!getTable(tableName).isSameFormat(report.getTable(tableName))) - return false; - } - return true; - } - - /** - * Checks that the reports are exactly the same. - * - * @param report another GATK report - * @return true if all field in the reports, tables, and columns are equal. - */ - public boolean equals(GATKReport report) { - if (!version.equals(report.version)) { - return false; - } - if (!tables.keySet().equals(report.tables.keySet())) { - return false; - } - for (String tableName : tables.keySet()) { - if (!getTable(tableName).equals(report.getTable(tableName))) - return false; - } - return true; - } - - /** - * The constructor for a simplified GATK Report. Simplified GATK report are designed for reports that do not need - * the advanced functionality of a full GATK Report. - *

- * A simple GATK Report consists of: - *

- * - A single table - * - No primary key ( it is hidden ) - *

- * Optional: - * - Only untyped columns. As long as the data is an Object, it will be accepted. - * - Default column values being empty strings. - *

- * Limitations: - *

- * - A simple GATK report cannot contain multiple tables. - * - It cannot contain typed columns, which prevents arithmetic gathering. - * - * @param tableName The name of your simple GATK report table - * @param columns The names of the columns in your table - * @return a simplified GATK report - */ - public static GATKReport newSimpleReport(final String tableName, final String... columns) { - return newSimpleReportWithDescription(tableName, "A simplified GATK table report", columns); - } - - /** - * @see #newSimpleReport(String, String...) but with a customized description - * @param tableName - * @param desc - * @param columns - * @return - */ - public static GATKReport newSimpleReportWithDescription(final String tableName, final String desc, final String... columns) { - GATKReportTable table = new GATKReportTable(tableName, desc, columns.length); - - for (String column : columns) { - table.addColumn(column, ""); - } - - GATKReport output = new GATKReport(); - output.addTable(table); - - return output; - } - - /** - * The constructor for a simplified GATK Report. Simplified GATK report are designed for reports that do not need - * the advanced functionality of a full GATK Report. - *

- * A simple GATK Report consists of: - *

- * - A single table - * - No primary key ( it is hidden ) - *

- * Optional: - * - Only untyped columns. As long as the data is an Object, it will be accepted. - * - Default column values being empty strings. - *

- * Limitations: - *

- * - A simple GATK report cannot contain multiple tables. - * - It cannot contain typed columns, which prevents arithmetic gathering. - * - * @param tableName The name of your simple GATK report table - * @param columns The names of the columns in your table - * @return a simplified GATK report - */ - public static GATKReport newSimpleReport(final String tableName, final List columns) { - GATKReportTable table = new GATKReportTable(tableName, "A simplified GATK table report", columns.size()); - - for (String column : columns) { - table.addColumn(column, ""); - } - - GATKReport output = new GATKReport(); - output.addTable(table); - - return output; - } - - /** - * This method provides an efficient way to populate a simplified GATK report. This method will only work on reports - * that qualify as simplified GATK reports. See the newSimpleReport() constructor for more information. - * - * @param values the row of data to be added to the table. - * Note: the number of arguments must match the columns in the table. - */ - public void addRow(final Object... values) { - // Must be a simple report - if ( tables.size() != 1 ) - throw new ReviewedGATKException("Cannot write a row to a complex GATK Report"); - - GATKReportTable table = tables.firstEntry().getValue(); - if ( table.getNumColumns() != values.length ) - throw new ReviewedGATKException("The number of arguments in writeRow (" + values.length + ") must match the number of columns in the table (" + table.getNumColumns() + ")" ); - - final int rowIndex = table.getNumRows(); - for ( int i = 0; i < values.length; i++ ) - table.set(rowIndex, i, values[i]); - } - - /** - * This method provides an efficient way to populate a simplified GATK report. This method will only work on reports - * that qualify as simplified GATK reports. See the newSimpleReport() constructor for more information. - * - * @param values the row of data to be added to the table. - * Note: the number of arguments must match the columns in the table. 
- */ - public void addRowList(final List values) { - if ( tables.size() != 1 ) - throw new ReviewedGATKException("Cannot write a row to a complex GATK Report"); - - GATKReportTable table = tables.firstEntry().getValue(); - if ( table.getNumColumns() != values.size() ) - throw new ReviewedGATKException("The number of arguments in writeRow() must match the number of columns in the table"); - - final int rowIndex = table.getNumRows(); - int idx = 0; - for ( Object value : values ) { - table.set(rowIndex,idx,value); - idx++; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumn.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumn.java deleted file mode 100644 index ffdefff36..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumn.java +++ /dev/null @@ -1,147 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -import org.apache.commons.lang.math.NumberUtils; - -import java.util.Arrays; -import java.util.Collection; - -/** - * column information within a GATK report table - */ -public class GATKReportColumn { - final private String columnName; - final private String format; - final private GATKReportDataType dataType; - - private GATKReportColumnFormat columnFormat; - private GATKReportColumnFormat.Alignment alignment = GATKReportColumnFormat.Alignment.RIGHT; // default alignment is to the right unless values added ask for a left alignment - private int maxWidth = 0; - - /** - * Construct the column object, specifying the column name, default value, whether or not the column should be - * displayed, and the format string. This cannot be null. - * - * @param columnName the name of the column - * @param format format string - */ - public GATKReportColumn(final String columnName, final String format) { - this.columnName = columnName; - this.maxWidth = columnName.length(); - if ( format.equals("") ) { - this.format = "%s"; - this.dataType = GATKReportDataType.Unknown; - } - else { - this.format = format; - this.dataType = GATKReportDataType.fromFormatString(format); - } - } - - /** - * Get the display width for this column. This allows the entire column to be displayed with the appropriate, fixed - * width. 
- * - * @return the format string for this column - */ - public GATKReportColumnFormat getColumnFormat() { - if (columnFormat != null) - return columnFormat; - - columnFormat = new GATKReportColumnFormat(maxWidth, alignment); - return columnFormat; - } - - private static final Collection RIGHT_ALIGN_STRINGS = Arrays.asList( - "null", - "NA", - String.valueOf(Double.POSITIVE_INFINITY), - String.valueOf(Double.NEGATIVE_INFINITY), - String.valueOf(Double.NaN)); - - /** - * Check if the value can be right aligned. Does not trim the values before checking if numeric since it assumes - * the spaces mean that the value is already padded. - * - * @param value to check - * @return true if the value is a right alignable - */ - protected static boolean isRightAlign(final String value) { - return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value.trim()); - } - - /** - * Returns a string version of the values. - * - * @param obj The object to convert to a string - * @return The string representation of the column - */ - private String formatValue(final Object obj) { - String value; - if (obj == null) { - value = "null"; - } - else if ( dataType.equals(GATKReportDataType.Unknown) && (obj instanceof Double || obj instanceof Float) ) { - value = String.format("%.8f", obj); - } - else - value = String.format(format, obj); - - return value; - } - - public GATKReportDataType getDataType() { - return dataType; - } - - public String getColumnName() { - return columnName; - } - - public String getFormat() { - return dataType.equals(GATKReportDataType.Unknown) ? 
"%s" : format; - } - - public void updateFormatting(final Object value) { - if (value != null) { - final String formatted = formatValue(value); - if ( formatted.length() > 0 ) { - updateMaxWidth(formatted); - updateFormat(formatted); - } - } - } - - private void updateMaxWidth(final String formatted) { - maxWidth = Math.max(formatted.length(), maxWidth); - } - - private void updateFormat(final String formatted) { - if (alignment == GATKReportColumnFormat.Alignment.RIGHT) - alignment = isRightAlign(formatted) ? GATKReportColumnFormat.Alignment.RIGHT : GATKReportColumnFormat.Alignment.LEFT; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumnFormat.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumnFormat.java deleted file mode 100644 index 664b503b0..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportColumnFormat.java +++ /dev/null @@ -1,63 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -/** - * Column width and left/right alignment. - */ -public class GATKReportColumnFormat { - public static enum Alignment { LEFT, RIGHT } - private final int width; - private final Alignment alignment; - - public GATKReportColumnFormat(int width, Alignment alignment) { - this.width = width; - this.alignment = alignment; - } - - public int getWidth() { - return width; - } - - public Alignment getAlignment() { - return alignment; - } - - public String getNameFormat() { - return "%-" + width + "s"; - } - - public String getValueFormat() { - switch (alignment) { - case LEFT: - return "%-" + width + "s"; - case RIGHT: - return "%" + width + "s"; - default: - throw new UnsupportedOperationException("Unknown alignment: " + alignment); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportDataType.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportDataType.java deleted file mode 100644 index acfa74f25..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportDataType.java +++ /dev/null @@ -1,236 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* 
The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -import java.util.EnumSet; -import java.util.HashMap; -import java.util.Map; - -/** - * The gatherable data types acceptable in a GATK report column. - */ -public enum GATKReportDataType { - /** - * The null type should not be used. - */ - Null("Null"), - - /** - * The default value when a format string is not present - */ - Unknown("Unknown"), - - /** - * Used for boolean values. Will display as true or false in the table. - */ - Boolean("%[Bb]"), - - /** - * Used for char values. Will display as a char so use printable values! - */ - Character("%[Cc]"), - - /** - * Used for float and double values. Will output a decimal with format %.8f unless otherwise specified. - */ - Decimal("%.*[EeFf]"), - - /** - * Used for int, byte, short, and long values. Will display the full number by default. - */ - Integer("%[Dd]"), - - /** - * Used for string values. Displays the string itself. 
- */ - String("%[Ss]"); - - private final String dataTypeString; - - private GATKReportDataType(String dataTypeString) { - this.dataTypeString = dataTypeString; - } - - private static final Map lookup = new HashMap(); - - static { - for (GATKReportDataType s : EnumSet.allOf(GATKReportDataType.class)) - lookup.put(s.dataTypeString, s); - } - - - @Override - public String toString() { - return this.dataTypeString; - } - - /** - * Returns a GATK report data type from the Object specified. It looks through the list of acceptable classes and - * returns the appropriate data type. - * - * @param object the object ot derive the data type from - * @return the appropriate data type - */ - public static GATKReportDataType fromObject(Object object) { - GATKReportDataType value; - if (object instanceof Boolean) { - value = GATKReportDataType.Boolean; - - } else if (object instanceof Character) { - value = GATKReportDataType.Character; - - } else if (object instanceof Float || - object instanceof Double) { - value = GATKReportDataType.Decimal; - - } else if (object instanceof Integer || - object instanceof Long || - object instanceof Short || - object instanceof Byte ) { - value = GATKReportDataType.Integer; - - } else if (object instanceof String) { - value = GATKReportDataType.String; - - } else { - value = GATKReportDataType.Unknown; - //throw new UserException("GATKReport could not convert the data object into a GATKReportDataType. Acceptable data objects are found in the documentation."); - } - return value; - } - - /** - * Returns a GATK report data type from the format string specified. It uses regex matching from the enumerated - * Strings. 
- * - * @param format the format string to derive the data type from - * @return the appropriate data type - */ - public static GATKReportDataType fromFormatString(String format) { - if (format.equals("")) - return Unknown; - for (GATKReportDataType type : lookup.values()) { - if (format.matches(type.toString()) ) - return type; - } - return Unknown; - } - - /** - * Returns the default value of the data type. It returns an object that matches the class of the data type. - * - * @return an object that matches the data type - */ - public Object getDefaultValue() { - switch (this) { - case Decimal: - return 0.0D; - case Boolean: - return false; - case Character: - return '0'; - case Integer: - return 0L; - case String: - return ""; - default: - return null; - } - } - - /** - * Checks if the two objects are equal using the appropriate test form the data types. - * - * @param a an object - * @param b another object to check if equal - * @return true - the objects are equal, false - the objects are nto equal - */ - public boolean isEqual(Object a, Object b) { - switch (this) { - case Null: - return true; - case Decimal: - case Boolean: - case Integer: - return a.toString().equals(b.toString()); - case Character: - case String: - default: - return a.equals(b); - } - } - - /** - * Converts an input String to the appropriate type using the data type. Used for parsing loading a GATK report from - * file. - * - * @param obj The input string - * @return an object that matches the data type. - */ - Object Parse(Object obj) { - if (obj instanceof String) { - String str = obj.toString(); - switch (this) { - case Decimal: - return Double.parseDouble(str); - case Boolean: - return java.lang.Boolean.parseBoolean(str); - case Integer: - return Long.parseLong(str); - case String: - return str; - case Character: - return str.toCharArray()[0]; - default: - return str; - } - } else - return null; - } - - /** - * Returns a format string version of the value according to the data type. 
- * - * @return The printf string representation of the object according to data type. - */ - public String getDefaultFormatString() { - switch (this) { - case Decimal: - return "%.8f"; - case Boolean: - return "%b"; - case Integer: - return "%d"; - case String: - return "%s"; - case Character: - return "%c"; - case Null: - default: - return "%s"; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportGatherer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportGatherer.java deleted file mode 100644 index 5f7f7670c..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportGatherer.java +++ /dev/null @@ -1,62 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.report; - -import org.broadinstitute.gatk.utils.commandline.Gatherer; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; -import java.util.List; - -public class GATKReportGatherer extends Gatherer { - @Override - public void gather(List inputs, File output) { - //Combines inputs GATKReport to one output - - PrintStream o; - try { - o = new PrintStream(output); - } catch (FileNotFoundException e) { - throw new UserException(String.format("File %s to be output by GATKReportGatherer function was not found", output)); - } - - GATKReport current = new GATKReport(); - boolean isFirst = true; - for (File input : inputs) { - if (isFirst) { - current = new GATKReport(input); - isFirst = false; - } else { - current.concat(new GATKReport(input)); - } - } - - current.print(o); - o.close(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportTable.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportTable.java deleted file mode 100644 index 6a1e456d4..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportTable.java +++ /dev/null @@ -1,779 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.text.TextFormattingUtils; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.PrintStream; -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -public class GATKReportTable { - /** - * REGEX that matches any table with an invalid name - */ - public static final String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]"; - private static final String GATKTABLE_HEADER_PREFIX = "#:GATKTable"; - private static final String SEPARATOR = ":"; - private static final String ENDLINE = ":;"; - - private final String tableName; - private final String tableDescription; - - private final TableSortingWay sortingWay; - - private List underlyingData; - private final List columnInfo; - private final Map columnNameToIndex; - private final HashMap rowIdToIndex; - - private static final String COULD_NOT_READ_HEADER = "Could not read the header of this file -- "; - private static final String COULD_NOT_READ_COLUMN_NAMES = "Could not read the column names of this file -- "; - private static final String COULD_NOT_READ_DATA_LINE = "Could not read a data line of this table -- "; - private static final String COULD_NOT_READ_EMPTY_LINE = "Could not read the last empty line of this table -- "; - private static final String OLD_GATK_TABLE_VERSION = "We no longer support older versions of the 
GATK Tables"; - - private static final int INITITAL_ARRAY_SIZE = 10000; - private static final String NUMBER_CONVERSION_EXCEPTION = "String is a number but is not a long or a double: "; - - protected enum TableDataHeaderFields { - COLS(2), - ROWS(3), - FORMAT_START(4); - - private final int index; - TableDataHeaderFields(int index) { this.index = index; } - public int index() { return index; } - } - - public enum TableSortingWay { - SORT_BY_ROW, - SORT_BY_COLUMN, - DO_NOT_SORT - } - - protected enum TableNameHeaderFields { - NAME(2), - DESCRIPTION(3); - - private final int index; - TableNameHeaderFields(int index) { this.index = index; } - public int index() { return index; } - } - - /** - * Construct a new GATK report table from the reader - * Note that the row ID mappings are just the index -> index - * - * @param reader the reader - * @param version the GATK report version - */ - public GATKReportTable(BufferedReader reader, GATKReportVersion version) { - - switch ( version ) { - case V1_1: - // read in the header lines - final String[] tableData, tableNameData; - try { - tableData = reader.readLine().split(SEPARATOR); - tableNameData = reader.readLine().split(SEPARATOR); - } catch (IOException e) { - throw new ReviewedGATKException(COULD_NOT_READ_HEADER + e.getMessage()); - } - - // parse the header fields - tableName = tableNameData[TableNameHeaderFields.NAME.index()]; - tableDescription = (tableNameData.length <= TableNameHeaderFields.DESCRIPTION.index()) ? "" : tableNameData[TableNameHeaderFields.DESCRIPTION.index()]; // table may have no description! 
(and that's okay) - - // when reading from a file, we do not re-sort the rows - sortingWay = TableSortingWay.DO_NOT_SORT; - - // initialize the data - final int nColumns = Integer.parseInt(tableData[TableDataHeaderFields.COLS.index()]); - final int nRows = Integer.parseInt(tableData[TableDataHeaderFields.ROWS.index()]); - underlyingData = new ArrayList(nRows); - columnInfo = new ArrayList(nColumns); - columnNameToIndex = new HashMap(nColumns); - - // when reading from a file, the row ID mapping is just the index - rowIdToIndex = new HashMap(); - for ( int i = 0; i < nRows; i++ ) - rowIdToIndex.put(i, i); - - // read the column names - final String columnLine; - try { - columnLine = reader.readLine(); - } catch (IOException e) { - throw new ReviewedGATKException(COULD_NOT_READ_COLUMN_NAMES); - } - - final List columnStarts = TextFormattingUtils.getWordStarts(columnLine); - final String[] columnNames = TextFormattingUtils.splitFixedWidth(columnLine, columnStarts); - - // Put in columns using the format string from the header - for ( int i = 0; i < nColumns; i++ ) { - final String format = tableData[TableDataHeaderFields.FORMAT_START.index() + i]; - addColumn(columnNames[i], format); - } - - // fill in the table - try { - for ( int i = 0; i < nRows; i++ ) { - // read a data line - final String dataLine = reader.readLine(); - final List lineSplits = Arrays.asList(TextFormattingUtils.splitFixedWidth(dataLine, columnStarts)); - - underlyingData.add(new Object[nColumns]); - for ( int columnIndex = 0; columnIndex < nColumns; columnIndex++ ) { - - final GATKReportDataType type = columnInfo.get(columnIndex).getDataType(); - final String columnName = columnNames[columnIndex]; - set(i, columnName, type.Parse(lineSplits.get(columnIndex))); - - } - } - } catch (IOException e) { - throw new ReviewedGATKException(COULD_NOT_READ_DATA_LINE + e.getMessage()); - } - - try { - reader.readLine(); - } catch (IOException e) { - throw new ReviewedGATKException(COULD_NOT_READ_EMPTY_LINE + 
e.getMessage()); - } - break; - - default: - throw new ReviewedGATKException(OLD_GATK_TABLE_VERSION); - } - } - - /** - * Construct a new GATK report table with the specified name and description - * - * @param tableName the name of the table - * @param tableDescription the description of the table - * @param numColumns the number of columns in this table - */ - public GATKReportTable(final String tableName, final String tableDescription, final int numColumns) { - this(tableName, tableDescription, numColumns, TableSortingWay.SORT_BY_ROW); - } - - /** - * Construct a new GATK report table with the specified name and description and whether to sort rows by the row ID. - * - * @param tableName the name of the table - * @param tableDescription the description of the table - * @param numColumns the number of columns in this table - * @param sortingWay in what way to sort rows (instead of the order in which they were added) - */ - public GATKReportTable(final String tableName, final String tableDescription, final int numColumns, final TableSortingWay sortingWay) { - if ( !isValidName(tableName) ) { - throw new ReviewedGATKException("Attempted to set a GATKReportTable name of '" + tableName + "'. GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed."); - } - - if ( !isValidDescription(tableDescription) ) { - throw new ReviewedGATKException("Attempted to set a GATKReportTable description of '" + tableDescription + "'. 
GATKReportTable descriptions must not contain newlines."); - } - - this.tableName = tableName; - this.tableDescription = tableDescription; - this.sortingWay = sortingWay; - - underlyingData = new ArrayList(INITITAL_ARRAY_SIZE); - columnInfo = new ArrayList(numColumns); - columnNameToIndex = new HashMap(numColumns); - rowIdToIndex = new HashMap(); - } - - /** - * Create a new GATKReportTable with the same structure - * @param tableToCopy - */ - public GATKReportTable(final GATKReportTable tableToCopy, final boolean copyData) { - this(tableToCopy.getTableName(), tableToCopy.getTableDescription(), tableToCopy.getNumColumns(), tableToCopy.sortingWay); - for ( final GATKReportColumn column : tableToCopy.getColumnInfo() ) - addColumn(column.getColumnName(), column.getFormat()); - if ( copyData ) - throw new IllegalArgumentException("sorry, copying data in GATKReportTable isn't supported"); - } - - /** - * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed - * - * @param name the name of the table or column - * @return true if the name is valid, false if otherwise - */ - private boolean isValidName(String name) { - Pattern p = Pattern.compile(INVALID_TABLE_NAME_REGEX); - Matcher m = p.matcher(name); - - return !m.find(); - } - - /** - * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed - * - * @param description the name of the table or column - * @return true if the name is valid, false if otherwise - */ - private boolean isValidDescription(String description) { - Pattern p = Pattern.compile("\\r|\\n"); - Matcher m = p.matcher(description); - - return !m.find(); - } - - /** - * Add a mapping from ID to the index of a new row added to the table. - * - * @param ID the unique ID - */ - public void addRowID(final String ID) { - addRowID(ID, false); - } - - /** - * Add a mapping from ID to the index of a new row added to the table. 
- * - * @param ID the unique ID - * @param populateFirstColumn should we automatically populate the first column with the row's ID? - */ - public void addRowID(final String ID, final boolean populateFirstColumn) { - addRowIDMapping(ID, underlyingData.size(), populateFirstColumn); - } - - /** - * Add a mapping from ID to row index. - * - * @param ID the unique ID - * @param index the index associated with the ID - */ - public void addRowIDMapping(final String ID, final int index) { - addRowIDMapping(ID, index, false); - } - - /** - * Add a mapping from ID to row index. - * - * @param ID the unique ID - * @param index the index associated with the ID - * @param populateFirstColumn should we automatically populate the first column with the row's ID? - */ - public void addRowIDMapping(final Object ID, final int index, final boolean populateFirstColumn) { - expandTo(index, false); - rowIdToIndex.put(ID, index); - - if ( populateFirstColumn ) - set(index, 0, ID); - } - - /** - * Remove a mapping from ID to row index. - * - * @param ID the row ID - */ - public void removeRowIDMapping(final Object ID) { - rowIdToIndex.remove(ID); - } - - /** - * Add a column to the report - * - * @param columnName the name of the column - */ - public void addColumn(String columnName) { - addColumn(columnName, ""); - } - - /** - * Add a column to the report and the format string used to display the data. 
- * - * @param columnName the name of the column - * @param format the format string used to display data - */ - public void addColumn(String columnName, String format) { - columnNameToIndex.put(columnName, columnInfo.size()); - columnInfo.add(new GATKReportColumn(columnName, format)); - } - - /** - * Check if the requested cell is valid and expand the table if necessary - * - * @param rowIndex the row index - * @param colIndex the column index - */ - private void verifyEntry(final int rowIndex, final int colIndex) { - if ( rowIndex < 0 || colIndex < 0 || colIndex >= getNumColumns() ) - throw new ReviewedGATKException("attempted to access a cell that does not exist in table '" + tableName + "'"); - } - - /** - * expand the underlying table if needed to include the given row index - * - * @param rowIndex the row index - * @param updateRowIdMap should we update the row ID map? - */ - private void expandTo(final int rowIndex, final boolean updateRowIdMap) { - int currentSize = underlyingData.size(); - if ( rowIndex >= currentSize ) { - final int numNewRows = rowIndex - currentSize + 1; - for ( int i = 0; i < numNewRows; i++ ) { - if ( updateRowIdMap ) - rowIdToIndex.put(currentSize, currentSize); - underlyingData.add(new Object[getNumColumns()]); - currentSize++; - } - } - } - - /** - * Set the value for a given position in the table. - * If the row ID doesn't exist, it will create a new row in the table with the given ID. - * - * @param rowID the row ID - * @param columnName the name of the column - * @param value the value to set - */ - public void set(final Object rowID, final String columnName, final Object value) { - if ( !rowIdToIndex.containsKey(rowID) ) { - rowIdToIndex.put(rowID, underlyingData.size()); - expandTo(underlyingData.size(), false); - } - set(rowIdToIndex.get(rowID), columnNameToIndex.get(columnName), value); - } - - /** - * Set the value for a given position in the table. 
- * If the row index doesn't exist, it will create new rows in the table accordingly. - * - * @param rowIndex the row index - * @param colIndex the column index - * @param value the value to set - */ - public void set(final int rowIndex, final int colIndex, Object value) { - expandTo(rowIndex, true); - verifyEntry(rowIndex, colIndex); - GATKReportColumn column = columnInfo.get(colIndex); - - // We do not accept internal null values - if (value == null) - value = "null"; - else - value = fixType(value, column); - - if ( column.getDataType().equals(GATKReportDataType.fromObject(value)) || column.getDataType().equals(GATKReportDataType.Unknown) ) { - underlyingData.get(rowIndex)[colIndex] = value; - column.updateFormatting(value); - } else { - throw new ReviewedGATKException(String.format("Tried to add an object of type: %s to a column of type: %s", GATKReportDataType.fromObject(value).name(), column.getDataType().name())); - } - } - - /** - * Returns true if the table contains a row mapping with the given ID - * - * @param rowID the row ID - */ - public boolean containsRowID(final Object rowID) { - return rowIdToIndex.containsKey(rowID); - } - - /** - * Returns the row mapping IDs - * - */ - public Collection getRowIDs() { - return rowIdToIndex.keySet(); - } - - /** - * Increment the value for a given position in the table. - * Throws an exception if the value in the cell is not an integer. 
- * - * @param rowID the row ID - * @param columnName the name of the column - */ - public void increment(final Object rowID, final String columnName) { - int prevValue; - if ( !rowIdToIndex.containsKey(rowID) ) { - rowIdToIndex.put(rowID, underlyingData.size()); - underlyingData.add(new Object[getNumColumns()]); - prevValue = 0; - } else { - Object obj = get(rowID, columnName); - if ( !(obj instanceof Integer) ) - throw new ReviewedGATKException("Attempting to increment a value in a cell that is not an integer"); - prevValue = (Integer)obj; - } - - set(rowIdToIndex.get(rowID), columnNameToIndex.get(columnName), prevValue + 1); - } - - /** - * Returns the index of the first row matching the column values. - * Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all" - * - * @param columnValues column values. - * @return The index of the first row matching the column values or -1 if no such row exists. - */ - public int findRowByData(final Object... columnValues) { - if ( columnValues == null || columnValues.length == 0 || columnValues.length > getNumColumns() ) - return -1; - - for ( int rowIndex = 0; rowIndex < underlyingData.size(); rowIndex++ ) { - - final Object[] row = underlyingData.get(rowIndex); - - boolean matches = true; - for ( int colIndex = 0; colIndex < columnValues.length; colIndex++ ) { - if ( !columnValues[colIndex].equals(row[colIndex]) ) { - matches = false; - break; - } - } - - if ( matches ) - return rowIndex; - } - - return -1; - } - - private Object fixType(final Object value, final GATKReportColumn column) { - // Below is some code to convert a string into its appropriate type. - - // todo -- Types have to be more flexible. For example, %d should accept Integers, Shorts and Bytes. 
- - Object newValue = null; - if ( value instanceof String && !column.getDataType().equals(GATKReportDataType.String) ) { - // Integer case - if ( column.getDataType().equals(GATKReportDataType.Integer) ) { - try { - newValue = Long.parseLong((String) value); - } catch (Exception e) { - /** do nothing */ - } - } - if ( column.getDataType().equals(GATKReportDataType.Decimal) ) { - try { - newValue = Double.parseDouble((String) value); - } catch (Exception e) { - /** do nothing */ - } - } - if ( column.getDataType().equals(GATKReportDataType.Character) && ((String) value).length() == 1 ) { - newValue = ((String) value).charAt(0); - } - } - - return (newValue != null) ? newValue : value; - } - - /** - * Get a value from the given position in the table - * - * @param rowID the row ID - * @param columnName the name of the column - * @return the value stored at the specified position in the table - */ - public Object get(final Object rowID, final String columnName) { - return get(rowIdToIndex.get(rowID), columnNameToIndex.get(columnName)); - } - - /** - * Get a value from the given position in the table - * - * @param rowIndex the row ID - * @param columnName the name of the column - * @return the value stored at the specified position in the table - */ - public Object get(final int rowIndex, final String columnName) { - return get(rowIndex, columnNameToIndex.get(columnName)); - } - - /** - * Get a value from the given position in the table - * - * @param rowIndex the index of the row - * @param columnIndex the index of the column - * @return the value stored at the specified position in the table - */ - public Object get(int rowIndex, int columnIndex) { - verifyEntry(rowIndex, columnIndex); - return underlyingData.get(rowIndex)[columnIndex]; - } - - /** - * Write the table to the PrintStream, formatted nicely to be human-readable, AWK-able, and R-friendly. 
- * - * @param out the PrintStream to which the table should be written - */ - void write(final PrintStream out) { - - /* - * Table header: - * #:GATKTable:nColumns:nRows:(DataType for each column):; - * #:GATKTable:TableName:Description :; - * key colA colB - * row1 xxxx xxxxx - */ - - // write the table definition - out.printf(GATKTABLE_HEADER_PREFIX + ":%d:%d", getNumColumns(), getNumRows()); - - // write the formats for all the columns - for ( final GATKReportColumn column : columnInfo ) - out.print(SEPARATOR + column.getFormat()); - out.println(ENDLINE); - - // write the table name & description - out.printf(GATKTABLE_HEADER_PREFIX + ":%s:%s\n", tableName, tableDescription); - - // write the column names - boolean needsPadding = false; - for ( final GATKReportColumn column : columnInfo ) { - if ( needsPadding ) - out.printf(" "); - needsPadding = true; - - out.printf(column.getColumnFormat().getNameFormat(), column.getColumnName()); - } - out.println(); - - // write the table body - switch (sortingWay) { - case SORT_BY_COLUMN: - Collections.sort(underlyingData, new Comparator() { - //INVARIANT the two arrays are of the same length and corresponding elements are of the same type - @Override - public int compare(Object[] objectArr1, Object[] objectArr2) { - final int EQUAL = 0; - - int result = EQUAL; - - int l = objectArr1.length; - for (int x = 0; x < l; x++) { - if (objectArr1[x] instanceof Integer) { - result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); - } else if (objectArr1[x] instanceof Double) { - result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); - } else { // default uses String comparison - result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); - } - if( result != EQUAL) { - return result; - } - } - return result; - } - }); - for ( final Object[] row : underlyingData ) - writeRow(out, row); - break; - case SORT_BY_ROW: - // make sure that there are exactly the correct number of ID mappings - if ( 
rowIdToIndex.size() != underlyingData.size() ) - throw new ReviewedGATKException("There isn't a 1-to-1 mapping from row ID to index; this can happen when rows are not created consistently"); - - final TreeMap sortedMap; - try { - sortedMap = new TreeMap(rowIdToIndex); - } catch (ClassCastException e) { - throw new ReviewedGATKException("Unable to sort the rows based on the row IDs because the ID Objects are of different types"); - } - for ( final Map.Entry rowKey : sortedMap.entrySet() ) - writeRow(out, underlyingData.get(rowKey.getValue())); - break; - case DO_NOT_SORT: - for ( final Object[] row : underlyingData ) - writeRow(out, row); - } - out.println(); - } - - private void writeRow(final PrintStream out, final Object[] row) { - boolean needsPadding = false; - for ( int i = 0; i < row.length; i++ ) { - if ( needsPadding ) - out.printf(" "); - needsPadding = true; - - final Object obj = row[i]; - final String value; - - final GATKReportColumn info = columnInfo.get(i); - - if ( obj == null ) - value = "null"; - else if ( info.getDataType().equals(GATKReportDataType.Unknown) && (obj instanceof Double || obj instanceof Float) ) - value = String.format("%.8f", obj); - else - value = String.format(info.getFormat(), obj); - - out.printf(info.getColumnFormat().getValueFormat(), value); - } - - out.println(); - } - - public int getNumRows() { - return underlyingData.size(); - } - - public int getNumColumns() { - return columnInfo.size(); - } - - public List getColumnInfo() { - return columnInfo; - } - - public String getTableName() { - return tableName; - } - - public String getTableDescription() { - return tableDescription; - } - - /** - * Concatenates the rows from the table to this one - * - * @param table another GATK table - */ - public void concat(final GATKReportTable table) { - if ( !isSameFormat(table) ) - throw new ReviewedGATKException("Error trying to concatenate tables with different formats"); - - // add the data - 
underlyingData.addAll(table.underlyingData); - - // update the row index map - final int currentNumRows = getNumRows(); - for ( Map.Entry entry : table.rowIdToIndex.entrySet() ) - rowIdToIndex.put(entry.getKey(), entry.getValue() + currentNumRows); - } - - /** - * Returns whether or not the two tables have the same format including columns and everything in between. This does - * not check if the data inside is the same. This is the check to see if the two tables are gatherable or - * reduceable - * - * @param table another GATK table - * @return true if the the tables are gatherable - */ - public boolean isSameFormat(final GATKReportTable table) { - if ( !tableName.equals(table.tableName) || - !tableDescription.equals(table.tableDescription) || - columnInfo.size() != table.columnInfo.size() ) - return false; - - for ( int i = 0; i < columnInfo.size(); i++ ) { - if ( !columnInfo.get(i).getFormat().equals(table.columnInfo.get(i).getFormat()) || - !columnInfo.get(i).getColumnName().equals(table.columnInfo.get(i).getColumnName()) ) - return false; - } - - return true; - } - - /** - * Checks that the tables are exactly the same. - * - * @param table another GATK report - * @return true if all field in the reports, tables, and columns are equal. - */ - public boolean equals(final GATKReportTable table) { - if ( !isSameFormat(table) || - underlyingData.size() != table.underlyingData.size() ) - return false; - - final List myOrderedRows = getOrderedRows(); - final List otherOrderedRows = table.getOrderedRows(); - - for ( int i = 0; i < underlyingData.size(); i++ ) { - final Object[] myData = myOrderedRows.get(i); - final Object[] otherData = otherOrderedRows.get(i); - for ( int j = 0; j < myData.length; j++ ) { - if ( !myData[j].toString().equals(otherData[j].toString()) ) // need to deal with different typing (e.g. Long vs. 
Integer) - return false; - } - } - - return true; - } - - private List getOrderedRows() { - - switch (sortingWay) { - case SORT_BY_COLUMN: - Collections.sort(underlyingData, new Comparator() { - //INVARIANT the two arrays are of the same length and corresponding elements are of the same type - @Override - public int compare(Object[] objectArr1, Object[] objectArr2) { - final int EQUAL = 0; - int result = EQUAL; - int l = objectArr1.length; - for (int x = 0; x < l; x++) { - if (objectArr1[x] instanceof Integer) { - result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); - } else if (objectArr1[x] instanceof Double) { - result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); - } else { // default uses String comparison - result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); - } - if( result != EQUAL) { - return result; - } - } - return result; - } - }); - return underlyingData; - case SORT_BY_ROW: - final TreeMap sortedMap; - try { - sortedMap = new TreeMap(rowIdToIndex); - } catch (ClassCastException e) { - return underlyingData; - } - - final List orderedData = new ArrayList(underlyingData.size()); - for ( final int rowKey : sortedMap.values() ) - orderedData.add(underlyingData.get(rowKey)); - - return orderedData; - default: - return underlyingData; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportVersion.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportVersion.java deleted file mode 100644 index 226365b80..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/report/GATKReportVersion.java +++ /dev/null @@ -1,101 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without 
limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.report; - -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -public enum GATKReportVersion { - /** - * Differences between other versions: - * - Does not allow spaces in cells. - * - Mostly fixed width but has a bug where the string width of floating point - * values was not measured correctly leading to columns that aren't aligned - */ - V0_1("v0.1"), - - /** - * Differences between other versions: - * - Spaces allowed in cells, for example in sample names with spaces in them ex: "C507/FG-CR 6". 
- * - Fixed width fixed for floating point values - */ - V0_2("v0.2"), - - /* - * Differences between v0.x - * - Added table and report headers - * - Headers changed format, include the number of tables, rows, and metadata for gathering - * - IS GATHERABLE - */ - V1_0("v1.0"), - - /* - * Differences between v1.0 - * - column numbers in header reflect the actual count of columns - * - primary keys are never displayed - */ - V1_1("v1.1"); - - private final String versionString; - - private GATKReportVersion(String versionString) { - this.versionString = versionString; - } - - @Override - public String toString() { - return versionString; - } - - public boolean equals(GATKReportVersion that) { - return (versionString.equals(that.versionString)); - } - - /** - * Returns the GATK Report Version from the file header. - * - * @param header Header from the file starting with ##:GATKReport.v[version] - * @return The version as an enum. - */ - public static GATKReportVersion fromHeader(String header) { - if ( header == null ) - throw new UserException.BadInput("The GATK report has no version specified in the header"); - - if (header.startsWith("##:GATKReport.v0.1 ")) - return GATKReportVersion.V0_1; - - if (header.startsWith("##:GATKReport.v0.2 ")) - return GATKReportVersion.V0_2; - - if (header.startsWith("#:GATKReport.v1.0")) - return GATKReportVersion.V1_0; - - if (header.startsWith("#:GATKReport.v1.1")) - return GATKReportVersion.V1_1; - - throw new UserException.BadInput("The GATK report has an unknown/unsupported version in the header: " + header); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java deleted file mode 100644 index 6fdb9fa0a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/samples/SampleDBBuilder.java +++ /dev/null @@ -1,161 +0,0 @@ -/* -* Copyright 
(c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.samples; - -import htsjdk.samtools.SAMFileHeader; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.SampleUtils; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - -/** - * - */ -public class SampleDBBuilder { - PedigreeValidationType validationStrictness; - final SampleDB sampleDB = new SampleDB(); - final GenomeAnalysisEngine engine; - - Set samplesFromDataSources = new HashSet(); - Set samplesFromPedigrees = new HashSet(); - - /** for testing only */ - protected SampleDBBuilder(PedigreeValidationType validationStrictness) { - engine = null; - this.validationStrictness = validationStrictness; - } - - /** - * Constructor takes both a SAM header and sample files because the two must be integrated. - */ - public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) { - this.engine = engine; - this.validationStrictness = validationStrictness; - } - - /** - * Hallucinates sample objects for all the samples in the SAM file and stores them - */ - public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) { - addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header)); - return this; - } - - public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) { - for (final String sampleName : sampleNames) { - if (sampleDB.getSample(sampleName) == null) { - final Sample newSample = new Sample(sampleName, sampleDB); - sampleDB.addSample(newSample); - samplesFromDataSources.add(newSample); // keep track of data source samples - } - } - return this; - } - - public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) { - for (final File pedFile : pedigreeFiles) { - Collection samples = addSamplesFromPedigreeArgument(pedFile); - samplesFromPedigrees.addAll(samples); - } - - return this; - } - - public 
SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) { - for (final String pedString : pedigreeStrings) { - Collection samples = addSamplesFromPedigreeArgument(pedString); - samplesFromPedigrees.addAll(samples); - } - - return this; - } - - /** - * Parse one sample file and integrate it with samples that are already there - * Fail quickly if we find any errors in the file - */ - private Collection addSamplesFromPedigreeArgument(File sampleFile) { - final PedReader reader = new PedReader(); - - try { - return reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB); - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(sampleFile, e); - } - } - - private Collection addSamplesFromPedigreeArgument(final String string) { - final PedReader reader = new PedReader(); - return reader.parse(string, getMissingFields(string), sampleDB); - } - - public SampleDB getFinalSampleDB() { - validate(); - return sampleDB; - } - - public EnumSet getMissingFields(final Object engineArg) { - if ( engine == null ) - return EnumSet.noneOf(PedReader.MissingPedField.class); - else { - final List posTags = engine.getTags(engineArg).getPositionalTags(); - return PedReader.parseMissingFieldTags(engineArg, posTags); - } - } - - // -------------------------------------------------------------------------------- - // - // Validation - // - // -------------------------------------------------------------------------------- - - protected final void validate() { - validatePedigreeIDUniqueness(); - if ( validationStrictness != PedigreeValidationType.SILENT ) { - // check that samples in data sources are all annotated, if anything is annotated - if ( ! samplesFromPedigrees.isEmpty() && ! 
samplesFromDataSources.isEmpty() ) { - final Set sampleNamesFromPedigrees = new HashSet(); - for ( final Sample pSample : samplesFromPedigrees ) - sampleNamesFromPedigrees.add(pSample.getID()); - - for ( final Sample dsSample : samplesFromDataSources ) - if ( ! sampleNamesFromPedigrees.contains(dsSample.getID()) ) - throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files with STRICT pedigree validation"); - } - } - } - - private void validatePedigreeIDUniqueness() { - Set pedigreeIDs = new HashSet(); - for ( Sample sample : samplesFromPedigrees ) { - pedigreeIDs.add(sample.getID()); - } - assert pedigreeIDs.size() == samplesFromPedigrees.size() : "The number of sample IDs extracted from the pedigree does not equal the number of samples in the pedigree. Is a sample associated with multiple families?"; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java deleted file mode 100644 index d28ea3be4..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TAROrderedReadCache.java +++ /dev/null @@ -1,168 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.traversals; - -import org.broadinstitute.gatk.engine.downsampling.Downsampler; -import org.broadinstitute.gatk.engine.downsampling.ReservoirDownsampler; -import org.broadinstitute.gatk.utils.sam.AlignmentStartComparator; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -/** - * Subsystem to track a list of all reads currently live in the TraverseActiveRegions system, - * while limiting the total number of reads to a maximum capacity. - * - * User: depristo - * Date: 4/7/13 - * Time: 11:23 AM - */ -public class TAROrderedReadCache { - private final int maxCapacity; - private ArrayList undownsampledCache; - private Downsampler downsampler; - - private static final int UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE = 10000; - - /** - * Create a new empty ReadCache - * @param maxCapacity the max capacity of the read cache. - */ - public TAROrderedReadCache( final int maxCapacity ) { - if ( maxCapacity < 0 ) throw new IllegalArgumentException("maxCapacity must be >= 0 but got " + maxCapacity); - this.maxCapacity = maxCapacity; - - // The one we're not currently using will always be null: - initializeUndownsampledCache(); - this.downsampler = null; - } - - /** - * Moves all reads over to the downsampler, causing it to be used from this point on. Should be called - * when the undownsampledCache fills up and we need to start discarding reads. 
Since the - * ReservoirDownsampler doesn't preserve relative ordering, pop operations become expensive - * after this point, as they require a O(n log n) sort. - */ - private void activateDownsampler() { - downsampler = new ReservoirDownsampler<>(maxCapacity, false); - downsampler.submit(undownsampledCache); - undownsampledCache = null; // preferable to the O(n) clear() method - } - - /** - * Allocate the undownsampled cache used when we have fewer than maxCapacity items - */ - private void initializeUndownsampledCache() { - undownsampledCache = new ArrayList<>(Math.min(maxCapacity + 1, UNDOWNSAMPLED_CACHE_MAX_INITIAL_SIZE)); - } - - /** - * What's the maximum number of reads we'll store in the cache? - * @return a positive integer - */ - public int getMaxCapacity() { - return maxCapacity; - } - - /** - * Add a single read to this cache. Assumed to be in sorted order w.r.t. the previously added reads - * @param read a read to add - */ - public void add( final GATKSAMRecord read ) { - if ( read == null ) throw new IllegalArgumentException("Read cannot be null"); - - if ( downsampler != null ) { - downsampler.submit(read); - } - else { - undownsampledCache.add(read); - - // No more room in the undownsampledCache? Time to start downsampling - if ( undownsampledCache.size() > maxCapacity ) { - activateDownsampler(); - } - } - } - - /** - * Add a collection of reads to this cache. Assumed to be in sorted order w.r.t. the previously added reads and each other - * @param reads a collection of reads to add - */ - public void addAll( final List reads ) { - if ( reads == null ) throw new IllegalArgumentException("Reads cannot be null"); - for ( final GATKSAMRecord read : reads ) { - add(read); - } - } - - /** - * How many reads are currently in the cache? - * @return a positive integer - */ - public int size() { - return downsampler != null ? 
downsampler.size() : undownsampledCache.size(); - } - - /** - * How many reads were discarded since the last call to popCurrentReads - * - * @return number of items discarded during downsampling since last pop operation - */ - public int getNumDiscarded() { - return downsampler != null ? downsampler.getNumberOfDiscardedItems() : 0; - } - - /** - * Removes all reads currently in the cache, and returns them in sorted order (w.r.t. alignmentStart) - * - * Flushes this cache, so after this call the cache will contain no reads, and we'll be in the same - * initial state as the constructor would put us in, with a non-null undownsampledCache and a null - * downsampler. - * - * @return a list of GATKSAMRecords in this cache - */ - public List popCurrentReads() { - final List poppedReads; - - if ( downsampler == null ) { - poppedReads = undownsampledCache; // avoid making a copy here, since we're going to allocate a new cache - } - else { - // If we triggered the downsampler, we need to sort the reads before returning them, - // since the ReservoirDownsampler is not guaranteed to preserve relative ordering of items. - // After consuming the downsampled items in this call to popCurrentReads(), we switch back - // to using the undownsampledCache until we fill up again. 
- poppedReads = downsampler.consumeFinalizedItems(); // avoid making a copy here - Collections.sort(poppedReads, new AlignmentStartComparator()); - downsampler = null; - } - - initializeUndownsampledCache(); - return poppedReads; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java deleted file mode 100644 index 7d93311f2..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegions.java +++ /dev/null @@ -1,719 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.traversals; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.providers.*; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.ActiveRegionTraversalParameters; -import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.SampleUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.activeregion.ActivityProfile; -import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; -import org.broadinstitute.gatk.utils.activeregion.BandPassActivityProfile; -import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.gatk.utils.progressmeter.ProgressMeter; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.io.PrintStream; -import java.util.Collection; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -/** - * Implement active region traversal - * - * User: depristo - * Date: 1/9/13 - * Time: 4:45 PM - * - * Live region: - * - * The ART tracks a thing called the live region. The live region is a position on a specific contig - * of the alignment start of the last read we processed during this traversal. 
Because the - * read stream is sorted, future reads must occurs in the the live region. Therefore the the dead region - * (everything to the left of the live boundary) cannot have any more read data. The live / dead - * regions are used to decide when we can safely call map on active regions, as only active regions - * contained completely within the dead region (including extensions) have a complete set of read data - * in the collected read list. All of the data related to the live region is captured by the local - * variable spanOfLastReadSeen - * - */ -public final class TraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { - private final static boolean DEBUG = false; - protected final static Logger logger = Logger.getLogger(TraversalEngine.class); - protected final static boolean LOG_READ_CARRYING = false; - - // set by the traversal - private boolean walkerHasPresetRegions = false; - private int activeRegionExtension = -1; - private int maxRegionSize = -1; - private int minRegionSize = -1; - - private final LinkedList workQueue = new LinkedList<>(); - - private TAROrderedReadCache myReads = null; - - private GenomeLoc lastRegionProcessed = null; - private GenomeLoc spanOfLastReadSeen = null; - private ActivityProfile activityProfile = null; - int maxReadsInMemory = 0; - ActiveRegionWalker walker; - - final NanoScheduler nanoScheduler; - - /** - * Data to use in the ActiveRegionWalker.map function produced by the NanoScheduler input iterator - */ - private static class MapData { - public ActiveRegion activeRegion; - public RefMetaDataTracker tracker; - - private MapData(ActiveRegion activeRegion, RefMetaDataTracker tracker) { - this.activeRegion = activeRegion; - this.tracker = tracker; - } - } - - /** - * Create a single threaded active region traverser - */ - public TraverseActiveRegions() { - this(1); - } - - /** - * Create an active region traverser that uses nThreads for getting its work done - * @param nThreads number of threads - */ - 
public TraverseActiveRegions(final int nThreads) { - nanoScheduler = new NanoScheduler<>(nThreads); - nanoScheduler.setProgressFunction(new NSProgressFunction() { - @Override - public void progress(MapData lastActiveRegion) { - if ( lastActiveRegion != null ) - // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon - printProgress(lastActiveRegion.activeRegion.getLocation().getStopLocation()); - } - }); - } - - /** - * Have the debugging output streams been initialized already? - * - * We have to do lazy initialization because when the initialize() function is called - * the streams aren't yet initialized in the GATK walker. - */ - private boolean streamsInitialized = false; - - @Override - public void initialize(GenomeAnalysisEngine engine, Walker walker, ProgressMeter progressMeter) { - super.initialize(engine, walker, progressMeter); - - this.walker = (ActiveRegionWalker)walker; - if ( this.walker.wantsExtendedReads() && ! this.walker.wantsNonPrimaryReads() ) { - throw new IllegalArgumentException("Active region walker " + this.walker + " requested extended events but not " + - "non-primary reads, an inconsistent state. Please modify the walker"); - } - - ActiveRegionTraversalParameters annotation = walker.getClass().getAnnotation(ActiveRegionTraversalParameters.class); - this.activeRegionExtension = this.walker.activeRegionExtension == null ? annotation.extension() : this.walker.activeRegionExtension; - this.maxRegionSize = this.walker.activeRegionMaxSize == null ? annotation.maxRegion() : this.walker.activeRegionMaxSize; - this.minRegionSize = annotation.minRegion(); - final double bandPassSigma = this.walker.bandPassSigma == null ? 
annotation.bandPassSigma() : this.walker.bandPassSigma; - walkerHasPresetRegions = this.walker.hasPresetActiveRegions(); - - activityProfile = new BandPassActivityProfile(engine.getGenomeLocParser(), engine.getIntervals(), this.walker.maxProbPropagationDistance, this.walker.activeProbThreshold, - BandPassActivityProfile.MAX_FILTER_SIZE, bandPassSigma); - - final int maxReadsAcrossSamples = annotation.maxReadsToHoldInMemoryPerSample() * SampleUtils.getSAMFileSamples(engine).size(); - final int maxReadsToHoldInMemory = Math.min(maxReadsAcrossSamples, annotation.maxReadsToHoldTotal()); - myReads = new TAROrderedReadCache(maxReadsToHoldInMemory); - } - - // ------------------------------------------------------------------------------------- - // - // Utility functions - // - // ------------------------------------------------------------------------------------- - - /** - * Load in the preset regions for contig into workQueue - * - * Should be called before starting to process work on contig - * - * Can only be called when walkerHasPresetRegions is true or an IllegalStateException will be thrown - * - * @param contig the contig we are about to process - */ - protected void loadPresetRegionsForContigToWorkQueue(final String contig) { - if ( ! 
walkerHasPresetRegions ) throw new IllegalStateException("only appropriate to call when walker has preset regions"); - - final GenomeLoc contigSpan = engine.getGenomeLocParser().createOverEntireContig(contig); - for ( final GenomeLoc loc : this.walker.getPresetActiveRegions().getOverlapping(contigSpan) ) { - workQueue.add(new ActiveRegion(loc, null, true, engine.getGenomeLocParser(), getActiveRegionExtension())); - } - } - - protected int getActiveRegionExtension() { - return activeRegionExtension; - } - - protected int getMaxRegionSize() { - return maxRegionSize; - } - - protected int getMinRegionSize() { - return minRegionSize; - } - - @Override - public String getTraversalUnits() { - return "active regions"; - } - - @Override - public String toString() { - return "TraverseActiveRegions"; - } - - /** - * Is the loc outside of the intervals being requested for processing by the GATK? - * @param loc - * @return - */ - protected boolean outsideEngineIntervals(final GenomeLoc loc) { - return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); - } - - // ------------------------------------------------------------------------------------- - // - // Actual traverse function - // - // ------------------------------------------------------------------------------------- - - /** - * Did read appear in the last shard? - * - * When we transition across shard boundaries we see duplicate reads because - * each shard contains the reads that *overlap* the shard. So if we just finished - * shard 1-1000 and are now in 1001-2000 we'll see duplicate reads from 1001 - * that overlapped 1-1000. This function tests read to determine if we would have - * seen it before by asking if read.getAlignmentStart() is less than the - * stop position of the last seen read at the start of the traversal. 
The reason - * we need to use the location of the last read at the start of the traversal - * is that we update the lastRead during the traversal, and we only want to filter - * out reads whose start is before the last read of the previous shard, not the - * current shard. - * - * @param locOfLastReadAtTraversalStart the location of the last read seen at the start of the traversal - * @param read the read we want to test if it's already been seen in the last shard - * @return true if read would have appeared in the last shard, false otherwise - */ - @Requires({"read != null"}) - private boolean appearedInLastShard(final GenomeLoc locOfLastReadAtTraversalStart, final GATKSAMRecord read) { - if ( locOfLastReadAtTraversalStart == null ) - // we're in the first shard, so obviously the answer is no - return false; - else { - // otherwise check to see if the alignment occurred in the previous shard - return read.getAlignmentStart() <= locOfLastReadAtTraversalStart.getStart() - // we're on the same contig - && read.getReferenceIndex() == locOfLastReadAtTraversalStart.getContigIndex(); - } - - } - - @Override - public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - T sum) { - if ( LOG_READ_CARRYING || logger.isDebugEnabled() ) - logger.info(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - nanoScheduler.setDebug(false); - final Iterator activeRegionIterator = new ActiveRegionIterator(dataProvider); - final TraverseActiveRegionMap myMap = new TraverseActiveRegionMap(); - final TraverseActiveRegionReduce myReduce = new TraverseActiveRegionReduce(); - final T result = nanoScheduler.execute(activeRegionIterator, myMap, sum, myReduce); - - return result; - } - - private class ActiveRegionIterator implements Iterator { - private final LocusShardDataProvider dataProvider; - private LinkedList readyActiveRegions = new LinkedList<>(); - private boolean done = false; - private final LocusView locusView; - 
private final LocusReferenceView referenceView; - private final GenomeLoc locOfLastReadAtTraversalStart; - private final IntervalReferenceOrderedView referenceOrderedDataView; - private final GenomeLoc currentWindow; - private final boolean processRemainingActiveRegions; - - public ActiveRegionIterator( final LocusShardDataProvider dataProvider ) { - this.dataProvider = dataProvider; - locusView = new AllLocusView(dataProvider); - referenceView = new LocusReferenceView( walker, dataProvider ); - - // The data shard may carry a number of locations to process (due to being indexed together). - // This value is just the interval we are processing within the entire provider - currentWindow = dataProvider.getLocus(); - final int currentWindowPos = dataProvider.getShard().getGenomeLocs().indexOf(currentWindow); - if ( currentWindowPos == -1 ) throw new IllegalStateException("Data provider " + dataProvider + " didn't have our current window in it " + currentWindow); - processRemainingActiveRegions = currentWindowPos == dataProvider.getShard().getGenomeLocs().size() - 1; - - // the rodSpan covers all of the bases in the activity profile, including all of the bases - // through the current window interval. This is because we may issue a query to get data for an - // active region spanning before the current interval as far back as the start of the current profile, - // if we have pending work to do that finalizes in this interval. - final GenomeLoc rodSpan = activityProfile.getSpan() == null ? currentWindow : activityProfile.getSpan().endpointSpan(currentWindow); - if ( ! 
dataProvider.getShard().getLocation().containsP(rodSpan) ) throw new IllegalStateException("Rod span " + rodSpan + " isn't contained within the data shard " + dataProvider.getShard().getLocation() + ", meaning we wouldn't get all of the data we need"); - referenceOrderedDataView = new IntervalReferenceOrderedView( dataProvider, rodSpan ); - - // We keep processing while the next reference location is within the interval - locOfLastReadAtTraversalStart = spanOfLastSeenRead(); - - // load in the workQueue the present regions that span the current contig, if it's different from the last one - if ( walkerHasPresetRegions && ( lastRegionProcessed == null || ! currentWindow.onSameContig(lastRegionProcessed)) ) { - loadPresetRegionsForContigToWorkQueue(currentWindow.getContig()); - } - - // remember the last region we processed for sanity checking later - lastRegionProcessed = currentWindow; - } - - @Override public void remove() { throw new UnsupportedOperationException("Cannot remove from ActiveRegionIterator"); } - - @Override - public MapData next() { - return readyActiveRegions.pop(); - } - @Override - public boolean hasNext() { - if ( engine.exceedsRuntimeLimit() ) // too much time has been dedicated to doing work, just stop - return false; - if ( ! 
readyActiveRegions.isEmpty() ) - return true; - if ( done ) - return false; - else { - - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - rememberLastLocusLocation(location); - - // get all of the new reads that appear in the current pileup, and them to our list of reads - // provided we haven't seen them before - final Collection reads = locusView.getLIBS().transferReadsFromAllPreviousPileups(); - for( final GATKSAMRecord read : reads ) { - // note that ActiveRegionShards span entire contigs, so this check is in some - // sense no longer necessary, as any read that appeared in the last shard would now - // by definition be on a different contig. However, the logic here doesn't hurt anything - // and makes us robust should we decided to provide shards that don't fully span - // contigs at some point in the future - if ( ! appearedInLastShard(locOfLastReadAtTraversalStart, read) ) { - rememberLastReadLocation(read); - myReads.add(read); - } - } - - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - // we've move across some interval boundary, restart profile - final boolean flushProfile = ! activityProfile.isEmpty() - && ( activityProfile.getContigIndex() != location.getContigIndex() - || location.getStart() != activityProfile.getStop() + 1); - final List newActiveRegions = prepActiveRegionsForProcessing(walker, flushProfile, false, referenceOrderedDataView); - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). 
- final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation()); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - addIsActiveResult(walker, tracker, refContext, locus); - - maxReadsInMemory = Math.max(myReads.size(), maxReadsInMemory); - printProgress(location); - - if ( ! newActiveRegions.isEmpty() ) { - readyActiveRegions.addAll(newActiveRegions); - if ( DEBUG ) - for ( final MapData region : newActiveRegions ) - logger.info("Adding region to queue for processing " + region.activeRegion); - return true; - } - } - - if ( processRemainingActiveRegions ) { - // we've run out of stuff to process, and since shards now span entire contig boundaries - // we should finalized our regions. This allows us to continue to use our referenceOrderedDataView - // which would otherwise be shutdown. Only followed when the microschedule says that we're - // inside of the last window in the current shard - readyActiveRegions.addAll(prepActiveRegionsForProcessing(walker, true, true, referenceOrderedDataView)); - } - - return ! 
readyActiveRegions.isEmpty(); - } - } - } - - // ------------------------------------------------------------------------------------- - // - // Functions to manage and interact with the live / dead zone - // - // ------------------------------------------------------------------------------------- - - /** - * Update the live region to reflect that the last read we've seen in the traversal is read - * - * Requires that sequential calls always be provided reads in coordinate sorted order - * - * @param read the last read we've seen during the traversal - */ - @Requires({"read != null"}) - protected void rememberLastReadLocation(final GATKSAMRecord read) { - final GenomeLoc currentLocation = engine.getGenomeLocParser().createGenomeLoc(read); - if ( spanOfLastReadSeen == null ) - spanOfLastReadSeen = currentLocation; - else { - if ( currentLocation.isBefore(spanOfLastReadSeen) ) - throw new IllegalStateException("Updating last read seen in the traversal with read " + read + " with span " + currentLocation + " but this occurs before the previously seen read " + spanOfLastReadSeen); - spanOfLastReadSeen = currentLocation; - } - } - - /** - * Update the live region to reflect that we've reached locus - * - * This function is complementary to #rememberLastReadLocation, but if we don't have any reads for a long - * time (e.g., there's no coverage) we will keep active regions around far longer than necessary. - * - * Only updates the span if it's beyond the last seen - * - * @param currentLocation the current location we've processed on the genome - */ - protected void rememberLastLocusLocation(final GenomeLoc currentLocation) { - if ( spanOfLastReadSeen == null ) - spanOfLastReadSeen = currentLocation; - else { - if ( currentLocation.isPast(spanOfLastReadSeen) ) - spanOfLastReadSeen = currentLocation; - } - } - - - /** - * Get a GenomeLoc indicating the start (heading to the right) of the live ART region. 
- * @return the left-most position of the live region on the genome - */ - protected GenomeLoc spanOfLastSeenRead() { - return spanOfLastReadSeen; - } - - /** - * Is the active region completely within the traversal's dead zone? - * - * @param region the region we want to test - * @return true if the extended location of region is completely within the current dead zone, false otherwise - */ - protected boolean regionCompletelyWithinDeadZone(final ActiveRegion region) { - if ( spanOfLastSeenRead() == null ) - return false; - - final int contigCmp = region.getExtendedLoc().compareContigs(spanOfLastSeenRead()); - if ( contigCmp > 0 ) - throw new IllegalStateException("Active region " + region + " on a contig after last seen read " + spanOfLastSeenRead()); - else { - return contigCmp < 0 || region.getExtendedLoc().getStop() < spanOfLastSeenRead().getStart(); - } - } - - /** - * Is the read dead? That is, can it no longer be in any future active region, and therefore can be discarded? - * - * read: start |--------> stop ------ stop + extension - * region: start |-----------------| end - * - * Since the regions are coming in order, read could potentially be contained in a future interval if - * stop + activeRegionExtension >= end. If, on the other hand, stop + extension is < the end - * of this region, then we can discard it, since any future region could only include reads - * up to end + 1 - extension. - * - * Note that this function doesn't care about the dead zone. We're assuming that by - * actually calling this function with an active region that region is already in the dead zone, - * so checking that the read is in the dead zone doesn't make sense. 
- * - * @param read the read we're testing - * @param activeRegion the current active region - * @return true if the read is dead, false other - */ - @Requires({"read != null", "activeRegion != null"}) - private boolean readCannotOccurInAnyMoreActiveRegions(final GATKSAMRecord read, final ActiveRegion activeRegion) { - return read.getReferenceIndex() < activeRegion.getLocation().getContigIndex() || - ( read.getReferenceIndex() == activeRegion.getLocation().getContigIndex() - && read.getAlignmentEnd() + getActiveRegionExtension() < activeRegion.getLocation().getStop() ); - } - - // ------------------------------------------------------------------------------------- - // - // Functions to write out activity profiles and active regions - // - // ------------------------------------------------------------------------------------- - - /** - * Initialize the debugging output streams (activity profile and active regions), if not done so already - */ - @Ensures("streamsInitialized == true") - private void initializeOutputStreamsIfNecessary() { - if ( ! streamsInitialized ) { - streamsInitialized = true; - if ( walker.activityProfileOutStream != null ) { - printIGVFormatHeader(walker.activityProfileOutStream, "line", "ActivityProfile"); - } - - if ( walker.activeRegionOutStream != null ) { - printIGVFormatHeader(walker.activeRegionOutStream, "line", "ActiveRegions"); - } - } - } - - /** - * Helper function to write out a IGV formatted line to out, at loc, with values - * - * http://www.broadinstitute.org/software/igv/IGV - * - * @param out a non-null PrintStream where we'll write our line - * @param graphType the type of graph to show in IGV for this track - * @param columns the column names for this IGV track - */ - @Requires({ - "out != null", - "graphType != null", - "columns.length > 0" - }) - private void printIGVFormatHeader(final PrintStream out, final String graphType, final String ... 
columns ) { - out.printf("#track graphType=%s%n", graphType); - out.printf("Chromosome\tStart\tEnd\tFeature\t%s%n", Utils.join("\t", columns)); - - } - - /** - * Helper function to write out a IGV formatted line to out, at loc, with values - * - * http://www.broadinstitute.org/software/igv/IGV - * - * @param out a non-null PrintStream where we'll write our line - * @param loc the location of values - * @param featureName string name of this feature (see IGV format) - * @param values the floating point values to associate with loc and feature name in out - */ - @Requires({ - "out != null", - "loc != null", - "values.length > 0" - }) - private void printIGVFormatRow(final PrintStream out, final GenomeLoc loc, final String featureName, final double ... values) { - // note that start and stop are 0 based, but the stop is exclusive so we don't subtract 1 - out.printf("%s\t%d\t%d\t%s", loc.getContig(), loc.getStart() - 1, loc.getStop(), featureName); - for ( final double value : values ) - out.print(String.format("\t%.5f", value)); - out.println(); - } - - /** - * Write out activity profile information, if requested by the walker - * - * @param states the states in the current activity profile - */ - @Requires("states != null") - private void writeActivityProfile(final List states) { - if ( walker.activityProfileOutStream != null ) { - initializeOutputStreamsIfNecessary(); - for ( final ActivityProfileState state : states ) { - printIGVFormatRow(walker.activityProfileOutStream, state.getLoc(), "state", Math.min(state.isActiveProb, 1.0)); - } - } - } - - /** - * Write out each active region to the walker activeRegionOutStream - * - * @param region the region we're currently operating on - */ - @Requires("region != null") - private void writeActiveRegion(final ActiveRegion region) { - if( walker.activeRegionOutStream != null ) { - initializeOutputStreamsIfNecessary(); - printIGVFormatRow(walker.activeRegionOutStream, region.getLocation().getStartLocation(), - "end-marker", 
0.0); - printIGVFormatRow(walker.activeRegionOutStream, region.getLocation(), - "size=" + region.getLocation().size(), region.isActive() ? 1.0 : -1.0); - } - } - - - // ------------------------------------------------------------------------------------- - // - // Functions to process active regions that are ready for map / reduce calls - // - // ------------------------------------------------------------------------------------- - - /** - * Invoke the walker isActive function, and incorporate its result into the activity profile - * - * @param walker the walker we're running - * @param tracker the ref meta data tracker to pass on to the isActive function of walker - * @param refContext the refContext to pass on to the isActive function of walker - * @param locus the AlignmentContext to pass on to the isActive function of walker - */ - private void addIsActiveResult(final ActiveRegionWalker walker, - final RefMetaDataTracker tracker, final ReferenceContext refContext, - final AlignmentContext locus) { - // must be called, even if we won't use the result, to satisfy walker contract - final ActivityProfileState state = walker.isActive( tracker, refContext, locus ); - if ( walker.forceActive) state.isActiveProb = 1.0; - if ( ! walkerHasPresetRegions ) { - activityProfile.add(state); - } - } - - /** - * Take the individual isActive calls and integrate them into contiguous active regions and - * add these blocks of work to the work queue - * band-pass filter the list of isActive probabilities and turn into active regions - */ - private List prepActiveRegionsForProcessing(final ActiveRegionWalker walker, - final boolean flushActivityProfile, - final boolean forceAllRegionsToBeActive, - final IntervalReferenceOrderedView referenceOrderedDataView) { - if ( ! 
walkerHasPresetRegions ) { - // We don't have preset regions, so we get our regions from the activity profile - final Collection activeRegions = activityProfile.popReadyActiveRegions(getActiveRegionExtension(), getMinRegionSize(), getMaxRegionSize(), flushActivityProfile); - workQueue.addAll(activeRegions); - if ( ! activeRegions.isEmpty() && logger.isDebugEnabled() ) logger.debug("Integrated " + activityProfile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - } - - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them - final LinkedList readyRegions = new LinkedList<>(); - while( workQueue.peek() != null ) { - final ActiveRegion activeRegion = workQueue.peek(); - if ( forceAllRegionsToBeActive || regionCompletelyWithinDeadZone(activeRegion) ) { - writeActivityProfile(activeRegion.getSupportingStates()); - writeActiveRegion(activeRegion); - readyRegions.add(prepActiveRegionForProcessing(workQueue.remove(), walker, referenceOrderedDataView)); - } else { - break; - } - } - - return readyRegions; - - } - - private MapData prepActiveRegionForProcessing(final ActiveRegion activeRegion, - final ActiveRegionWalker walker, - final IntervalReferenceOrderedView referenceOrderedDataView) { - final List stillLive = new LinkedList<>(); - for ( final GATKSAMRecord read : myReads.popCurrentReads() ) { - boolean killed = false; - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - - if( activeRegion.getLocation().overlapsP( readLoc ) ) { - activeRegion.add(read); - - if ( ! walker.wantsNonPrimaryReads() ) { - killed = true; - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - - // if the read hasn't already been killed, check if it cannot occur in any more active regions, and maybe kill it - if ( ! 
killed && readCannotOccurInAnyMoreActiveRegions(read, activeRegion) ) { - killed = true; - } - - // keep track of all of the still live active regions - if ( ! killed ) stillLive.add(read); - } - myReads.addAll(stillLive); - - if ( logger.isDebugEnabled() ) { - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive() ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReadSpanLoc()); - } - - if ( LOG_READ_CARRYING ) - logger.info(String.format("Processing region %20s span=%3d active?=%5b with %4d reads. Overall max reads carried is %s", - activeRegion.getLocation(), activeRegion.getLocation().size(), activeRegion.isActive(), activeRegion.size(), maxReadsInMemory)); - - // prepare the RefMetaDataTracker information - final GenomeLoc loc = activeRegion.getLocation(); - // get all of the RODs that cover the active region (without extension) - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataForInterval(loc); - // trim away all of the features that occurred before this location, as we will not need them in the future - referenceOrderedDataView.trimCurrentFeaturesToLoc(loc); - - return new MapData(activeRegion, tracker); - } - - private class TraverseActiveRegionMap implements NSMapFunction { - @Override - public M apply(final MapData mapData) { - if ( DEBUG ) logger.info("Executing walker.map for " + mapData.activeRegion + " in thread " + Thread.currentThread().getName()); - return walker.map(mapData.activeRegion, mapData.tracker); - } - } - - private class TraverseActiveRegionReduce implements NSReduceFunction { - @Override - public T apply(M one, T sum) { - return walker.reduce(one, sum); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java deleted file 
mode 100644 index 6cffe9427..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseDuplicates.java +++ /dev/null @@ -1,205 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.traversals; - -import htsjdk.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.providers.ReadView; -import org.broadinstitute.gatk.engine.iterators.PushbackIterator; -import org.broadinstitute.gatk.engine.walkers.DuplicateWalker; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.*; - -/** - * @author Mark DePristo - * @version 0.1 - *

- * Class TraverseDuplicates - *

- * This class handles traversing lists of duplicate reads in the new shardable style - */ -public class TraverseDuplicates extends TraversalEngine,ReadShardDataProvider> { - /** our log, which we want to capture anything from this class */ - protected static Logger logger = Logger.getLogger(TraverseDuplicates.class); - - /** Turn this to true to enable logger.debug output */ - private final boolean DEBUG = false; - - @Override - public String getTraversalUnits() { - return "dups"; - } - - private List readsAtLoc(final GATKSAMRecord read, PushbackIterator iter) { - GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read); - ArrayList l = new ArrayList(); - - l.add(read); - for (SAMRecord read2 : iter) { - GenomeLoc site2 = engine.getGenomeLocParser().createGenomeLoc(read2); - - // the next read starts too late - if (site2.getStart() != site.getStart()) { - iter.pushback(read2); - break; - } else { - l.add((GATKSAMRecord) read2); - } - } - - return l; - } - - /** - * Creates a set of lists of reads, where each list contains reads from the same underlying molecule according - * to their duplicate flag and their (and mate, if applicable) start/end positions. - * - * @param reads the list of reads to split into unique molecular samples - * @return - */ - protected Set> uniqueReadSets(List reads) { - Set> readSets = new LinkedHashSet>(); - - // for each read, find duplicates, and either add the read to its duplicate list or start a new one - for ( GATKSAMRecord read : reads ) { - List readSet = findDuplicateReads(read, readSets); - - if ( readSet == null ) { - readSets.add(new ArrayList(Arrays.asList(read))); // copy so I can add to the list - } else { - readSet.add(read); - } - } - - return readSets; - } - - /** - * Find duplicate reads for read in the set of unique reads. This is effective a duplicate marking algorithm, - * but it relies for safety's sake on the file itself being marked by a true duplicate marking algorithm. 
Pair - * and single-end read aware. - * - * @param read - * @param readSets - * @return The list of duplicate reads that read is a member of, or null if it's the only one of its kind - */ - protected List findDuplicateReads(GATKSAMRecord read, Set> readSets ) { - if ( read.getReadPairedFlag() ) { - // paired - final GenomeLoc readMateLoc = engine.getGenomeLocParser().createGenomeLoc(read.getMateReferenceName(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); - - for (List reads : readSets) { - GATKSAMRecord key = reads.get(0); - - // read and key start at the same place, and either the this read and the key - // share a mate location or the read is flagged as a duplicate - if ( read.getAlignmentStart() == key.getAlignmentStart() && key.getReadPairedFlag() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) ) { - // at least one has to be marked as a duplicate - final GenomeLoc keyMateLoc = engine.getGenomeLocParser().createGenomeLoc(key.getMateReferenceName(), key.getMateAlignmentStart(), key.getMateAlignmentStart()); - if ( readMateLoc.compareTo(keyMateLoc) == 0 ) { - // we are at the same position as the dup and have the same mat pos, it's a dup - if (DEBUG) logger.debug(String.format(" => Adding read to dups list: %s %d %s vs. %s", read, reads.size(), readMateLoc, keyMateLoc)); - return reads; - } - } - } - } else { - for (List reads : readSets) { - GATKSAMRecord key = reads.get(0); - boolean v = (! 
key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength(); - //System.out.printf("%s %s %b %b %d %d %d %d => %b%n", - // read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(), - // read.getAlignmentStart(), key.getAlignmentStart(), read.getReadLength(), key.getReadLength(), v); - if ( v ) { - //System.out.printf("Returning reads...%n"); - return reads; - } - } - } - - return null; - } - - // -------------------------------------------------------------------------------------------------------------- - // - // new style interface to the system - // - // -------------------------------------------------------------------------------------------------------------- - - /** - * Traverse by reads, given the data and the walker - * - * @param walker the walker to execute over - * @param sum of type T, the return from the walker - * - * @return the result type T, the product of all the reduce calls - */ - public T traverse(DuplicateWalker walker, - ReadShardDataProvider dataProvider, - T sum) { - PushbackIterator iter = new PushbackIterator(new ReadView(dataProvider).iterator()); - - /** - * while we still have more reads: - * ok, here's the idea. 
We get all the reads that start at the same position in the genome - * We then split the list of reads into sublists of reads: - * -> those with the same mate pair position, for paired reads - * -> those flagged as unpaired and duplicated but having the same start and end - */ - boolean done = walker.isDone(); - for (SAMRecord read : iter) { - if ( done ) break; - // get the genome loc from the read - GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read); - - Set> readSets = uniqueReadSets(readsAtLoc((GATKSAMRecord) read, iter)); - if ( DEBUG ) logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size())); - - // Jump forward in the reference to this locus location - AlignmentContext locus = new AlignmentContext(site, new ReadBackedPileupImpl(site)); - - // update the number of duplicate sets we've seen - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // actually call filter and map, accumulating sum - final boolean keepMeP = walker.filter(site, locus, readSets); - if (keepMeP) { - M x = walker.map(site, locus, readSets); - sum = walker.reduce(x, sum); - } - - printProgress(site.getStopLocation()); - done = walker.isDone(); - } - - return sum; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java deleted file mode 100644 index 02c1a7e7f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseLociNano.java +++ /dev/null @@ -1,304 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* 
copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.traversals; - -import org.broadinstitute.gatk.engine.WalkerManager; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.providers.*; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.DataSource; -import org.broadinstitute.gatk.engine.walkers.LocusWalker; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; - -import java.util.Iterator; - -/** - * A simple solution to iterating over all reference positions over a series of genomic locations. 
- */ -public class TraverseLociNano extends TraversalEngine,LocusShardDataProvider> { - /** our log, which we want to capture anything from this class */ - private static final boolean DEBUG = false; - - final NanoScheduler nanoScheduler; - - public TraverseLociNano(int nThreads) { - nanoScheduler = new NanoScheduler(nThreads); - nanoScheduler.setProgressFunction(new TraverseLociProgress()); - } - - @Override - public final String getTraversalUnits() { - return "sites"; - } - - protected static class TraverseResults { - final int numIterations; - final T reduceResult; - - public TraverseResults(int numIterations, T reduceResult) { - this.numIterations = numIterations; - this.reduceResult = reduceResult; - } - } - - @Override - public T traverse( LocusWalker walker, - LocusShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); - - final LocusView locusView = getLocusView( walker, dataProvider ); - - if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); - ReferenceOrderedView referenceOrderedDataView = null; - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); - else - referenceOrderedDataView = (RodLocusView)locusView; - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); - sum = result.reduceResult; - dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); - } - - // We have a final map call to execute here to clean up the skipped based from the - // last position in the ROD to that in the interval - if ( WalkerManager.getWalkerDataSource(walker) == 
DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { - // only do this if the walker isn't done! - final RodLocusView rodLocusView = (RodLocusView)locusView; - final long nSkipped = rodLocusView.getLastSkippedBases(); - if ( nSkipped > 0 ) { - final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); - final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); - final M x = walker.map(null, null, ac); - sum = walker.reduce(x, sum); - } - } - - return sum; - } - - /** - * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' - * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype - * that comes along. - * @param walker walker to interrogate. - * @param dataProvider Data which which to drive the locus view. - * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. - */ - private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { - final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); - if( dataSource == DataSource.READS ) - return new CoveredLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE ) //|| ! 
GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) - return new AllLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) - return new RodLocusView(dataProvider); - else - throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); - } - - protected TraverseResults traverse(final LocusWalker walker, - final LocusView locusView, - final LocusReferenceView referenceView, - final ReferenceOrderedView referenceOrderedDataView, - final T sum) { - nanoScheduler.setDebug(DEBUG); - final TraverseLociMap myMap = new TraverseLociMap(walker); - final TraverseLociReduce myReduce = new TraverseLociReduce(walker); - - final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); - final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); - - return new TraverseResults(inputIterator.numIterations, result); - } - - /** - * Create iterator that provides inputs for all map calls into MapData, to be provided - * to NanoScheduler for Map/Reduce - */ - private class MapDataIterator implements Iterator { - final LocusView locusView; - final LocusReferenceView referenceView; - final ReferenceOrderedView referenceOrderedDataView; - int numIterations = 0; - - private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) { - this.locusView = locusView; - this.referenceView = referenceView; - this.referenceOrderedDataView = referenceOrderedDataView; - } - - @Override - public boolean hasNext() { - return locusView.hasNext() && ! engine.exceedsRuntimeLimit(); - } - - @Override - public MapData next() { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - //logger.info("Pulling data from MapDataIterator at " + location); - - // create reference context. 
Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location); - - numIterations++; - return new MapData(locus, refContext, tracker); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator"); - } - } - - @Override - public void shutdown() { - nanoScheduler.shutdown(); - } - - /** - * The input data needed for each map call. The read, the reference, and the RODs - */ - private class MapData { - final AlignmentContext alignmentContext; - final ReferenceContext refContext; - final RefMetaDataTracker tracker; - - private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) { - this.alignmentContext = alignmentContext; - this.refContext = refContext; - this.tracker = tracker; - } - - @Override - public String toString() { - return "MapData " + alignmentContext.getLocation(); - } - } - - /** - * Contains the results of a map call, indicating whether the call was good, filtered, or done - */ - private class MapResult { - final M value; - final boolean reduceMe; - - /** - * Create a MapResult with value that should be reduced - * - * @param value the value to reduce - */ - private MapResult(final M value) { - this.value = value; - this.reduceMe = true; - } - - /** - * Create a MapResult that shouldn't be reduced - */ - private MapResult() { - this.value = null; - this.reduceMe = false; - } - } - - /** - * A static object that tells reduce that the result of map should be skipped (filtered or done) - */ - private final MapResult SKIP_REDUCE = new MapResult(); - - /** - * 
MapFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Applies walker.map to MapData, returning a MapResult object containing the result - */ - private class TraverseLociMap implements NSMapFunction { - final LocusWalker walker; - - private TraverseLociMap(LocusWalker walker) { - this.walker = walker; - } - - @Override - public MapResult apply(final MapData data) { - if ( ! walker.isDone() ) { - final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext); - if (keepMeP) { - final M x = walker.map(data.tracker, data.refContext, data.alignmentContext); - return new MapResult(x); - } - } - return SKIP_REDUCE; - } - } - - /** - * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable - */ - private class TraverseLociReduce implements NSReduceFunction { - final LocusWalker walker; - - private TraverseLociReduce(LocusWalker walker) { - this.walker = walker; - } - - @Override - public T apply(MapResult one, T sum) { - if ( one.reduceMe ) - // only run reduce on values that aren't DONE or FAILED - return walker.reduce(one.value, sum); - else - return sum; - } - } - - private class TraverseLociProgress implements NSProgressFunction { - @Override - public void progress(MapData lastProcessedMap) { - if (lastProcessedMap.alignmentContext != null) - printProgress(lastProcessedMap.alignmentContext.getLocation()); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java deleted file mode 100644 index 2ce752b1f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsNano.java +++ /dev/null @@ -1,256 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* 
Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.traversals; - -import htsjdk.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.providers.ReadBasedReferenceOrderedView; -import org.broadinstitute.gatk.engine.datasources.providers.ReadReferenceView; -import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.providers.ReadView; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.ReadWalker; -import org.broadinstitute.gatk.utils.nanoScheduler.NSMapFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSProgressFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NSReduceFunction; -import org.broadinstitute.gatk.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.Iterator; -import java.util.LinkedList; - -/** - * A nano-scheduling version of TraverseReads. - * - * Implements the traversal of a walker that accepts individual reads, the reference, and - * RODs per map call. 
Directly supports shared memory parallelism via NanoScheduler - * - * @author depristo - * @version 1.0 - * @date 9/2/2012 - */ -public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { - /** our log, which we want to capture anything from this class */ - private final static boolean PRE_READ_ALL_MAP_DATA = true; - protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); - private static final boolean DEBUG = false; - final NanoScheduler nanoScheduler; - - public TraverseReadsNano(int nThreads) { - nanoScheduler = new NanoScheduler(nThreads); - nanoScheduler.setProgressFunction(new NSProgressFunction() { - @Override - public void progress(MapData lastProcessedMap) { - if ( lastProcessedMap.refContext != null ) - // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon - printProgress(lastProcessedMap.refContext.getLocus().getStopLocation()); - } - }); - } - - @Override - public String getTraversalUnits() { - return "reads"; - } - - /** - * Traverse by reads, given the data and the walker - * - * @param walker the walker to traverse with - * @param dataProvider the provider of the reads data - * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function - * @return the reduce variable of the read walker - */ - public T traverse(ReadWalker walker, - ReadShardDataProvider dataProvider, - T sum) { - if ( logger.isDebugEnabled() ) - logger.debug(String.format("TraverseReadsNano.traverse Covered dataset is %s", dataProvider)); - - if( !dataProvider.hasReads() ) - throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - - nanoScheduler.setDebug(DEBUG); - final TraverseReadsMap myMap = new TraverseReadsMap(walker); - final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); - - final Iterator aggregatedInputs = aggregateMapData(dataProvider); - final T result = nanoScheduler.execute(aggregatedInputs, myMap, 
sum, myReduce); - - return result; - } - - /** - * Aggregate all of the inputs for all map calls into MapData, to be provided - * to NanoScheduler for Map/Reduce - * - * @param dataProvider the source of our data - * @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce - * should execute - */ - private Iterator aggregateMapData(final ReadShardDataProvider dataProvider) { - final Iterator it = makeDataIterator(dataProvider); - if ( PRE_READ_ALL_MAP_DATA ) { - final LinkedList l = new LinkedList(); - while ( it.hasNext() ) l.add(it.next()); - return l.iterator(); - } else { - return it; - } - } - - - private Iterator makeDataIterator(final ReadShardDataProvider dataProvider) { - return new Iterator () { - final ReadView reads = new ReadView(dataProvider); - final ReadReferenceView reference = new ReadReferenceView(dataProvider); - final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - final Iterator readIterator = reads.iterator(); - - @Override public boolean hasNext() { return ! engine.exceedsRuntimeLimit() && readIterator.hasNext(); } - - @Override - public MapData next() { - final SAMRecord read = readIterator.next(); - final ReferenceContext refContext = ! read.getReadUnmappedFlag() - ? reference.getReferenceContext(read) - : null; - - // if the read is mapped, create a metadata tracker - final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 - ? rodView.getReferenceOrderedDataForRead(read) - : null; - - // update the number of reads we've seen - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - return new MapData((GATKSAMRecord)read, refContext, tracker); - } - - @Override public void remove() { - throw new UnsupportedOperationException("Remove not supported"); - } - }; - } - - @Override - public void shutdown() { - nanoScheduler.shutdown(); - } - - /** - * The input data needed for each map call. 
The read, the reference, and the RODs - */ - private class MapData { - final GATKSAMRecord read; - final ReferenceContext refContext; - final RefMetaDataTracker tracker; - - private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) { - this.read = read; - this.refContext = refContext; - this.tracker = tracker; - } - } - - /** - * Contains the results of a map call, indicating whether the call was good, filtered, or done - */ - private class MapResult { - final M value; - final boolean reduceMe; - - /** - * Create a MapResult with value that should be reduced - * - * @param value the value to reduce - */ - private MapResult(final M value) { - this.value = value; - this.reduceMe = true; - } - - /** - * Create a MapResult that shouldn't be reduced - */ - private MapResult() { - this.value = null; - this.reduceMe = false; - } - } - - /** - * A static object that tells reduce that the result of map should be skipped (filtered or done) - */ - private final MapResult SKIP_REDUCE = new MapResult(); - - /** - * MapFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Applies walker.map to MapData, returning a MapResult object containing the result - */ - private class TraverseReadsMap implements NSMapFunction { - final ReadWalker walker; - - private TraverseReadsMap(ReadWalker walker) { - this.walker = walker; - } - - @Override - public MapResult apply(final MapData data) { - if ( ! 
walker.isDone() ) { - final boolean keepMeP = walker.filter(data.refContext, data.read); - if (keepMeP) - return new MapResult(walker.map(data.refContext, data.read, data.tracker)); - } - - return SKIP_REDUCE; - } - } - - /** - * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements - * - * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable - */ - private class TraverseReadsReduce implements NSReduceFunction { - final ReadWalker walker; - - private TraverseReadsReduce(ReadWalker walker) { - this.walker = walker; - } - - @Override - public T apply(MapResult one, T sum) { - if ( one.reduceMe ) - // only run reduce on values that aren't DONE or FAILED - return walker.reduce(one.value, sum); - else - return sum; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java deleted file mode 100644 index 9ff68bc9e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ActiveRegionWalker.java +++ /dev/null @@ -1,196 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import com.google.java.contract.Ensures; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.filters.*; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.interval.IntervalSetRule; -import org.broadinstitute.gatk.utils.interval.IntervalUtils; - -import java.io.PrintStream; -import java.util.*; - -/** - * Base class for all the Active Region Walkers. 
- * User: rpoplin - * Date: 12/7/11 - */ - -@By(DataSource.READS) -@Requires({DataSource.READS, DataSource.REFERENCE}) -@PartitionBy(PartitionType.READ) -@ActiveRegionTraversalParameters(extension=50,maxRegion=1500) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) -@Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 1000) -@RemoveProgramRecords -public abstract class ActiveRegionWalker extends Walker { - /** - * If provided, this walker will write out its activity profile (per bp probabilities of being active) - * to this file in the IGV formatted TAB deliminated output: - * - * http://www.broadinstitute.org/software/igv/IGV - * - * Intended to make debugging the activity profile calculations easier - */ - @Output(fullName="activityProfileOut", shortName="APO", doc="Output the raw activity profile results in IGV format", required = false, defaultToStdout = false) - public PrintStream activityProfileOutStream = null; - - /** - * If provided, this walker will write out its active and inactive regions - * to this file in the IGV formatted TAB deliminated output: - * - * http://www.broadinstitute.org/software/igv/IGV - * - * Intended to make debugging the active region calculations easier - */ - @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this IGV formatted file", required = false, defaultToStdout = false) - public PrintStream activeRegionOutStream = null; - - @Advanced - @Input(fullName="activeRegionIn", shortName="AR", doc="Use this interval list file as the active regions to process", required = false) - protected List> activeRegionBindings = null; - - @Advanced - @Argument(fullName="activeRegionExtension", shortName="activeRegionExtension", doc="The active region extension; if not provided defaults to Walker annotated default", required = false) - public Integer activeRegionExtension = null; - 
- /** - * For the active region walker to treat all bases as active. Useful for debugging when you want to force something like - * the HaplotypeCaller to process a specific interval you provide the GATK - */ - @Advanced - @Argument(fullName="forceActive", shortName="forceActive", doc="If provided, all bases will be tagged as active", required = false) - public boolean forceActive = false; - - @Advanced - @Argument(fullName="activeRegionMaxSize", shortName="activeRegionMaxSize", doc="The active region maximum size; if not provided defaults to Walker annotated default", required = false) - public Integer activeRegionMaxSize = null; - - @Advanced - @Argument(fullName="bandPassSigma", shortName="bandPassSigma", doc="The sigma of the band pass filter Gaussian kernel; if not provided defaults to Walker annotated default", required = false) - public Double bandPassSigma = null; - - /* - * For active region limits in ActivityProfile -* */ - @Hidden - @Argument(fullName = "maxProbPropagationDistance", shortName = "maxProbPropDist", minValue = 0, doc="Region probability propagation distance beyond it's maximum size.", required = false) - public Integer maxProbPropagationDistance = 50; - - @Advanced - @Argument(fullName = "activeProbabilityThreshold", shortName = "ActProbThresh", minValue = 0.0, maxValue = 1.0, doc="Threshold for the probability of a profile state being active.", required = false) - public Double activeProbThreshold = 0.002; - - private GenomeLocSortedSet presetActiveRegions = null; - - @Override - public void initialize() { - if( activeRegionBindings == null ) { return; } - List allIntervals = new ArrayList(0); - for ( IntervalBinding intervalBinding : activeRegionBindings ) { - List intervals = intervalBinding.getIntervals(this.getToolkit()); - - if ( intervals.isEmpty() ) { - logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); - } - - allIntervals = 
IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, IntervalSetRule.UNION); - } - - presetActiveRegions = IntervalUtils.sortAndMergeIntervals(this.getToolkit().getGenomeLocParser(), allIntervals, IntervalMergingRule.ALL); - } - - /** - * Does this walker want us to use a set of preset action regions instead of dynamically using the result of isActive? - * @return true if yes, false if no - */ - public boolean hasPresetActiveRegions() { - return presetActiveRegions != null; - } - - /** - * Get the set of preset active regions, or null if none were provided - * @return a set of genome locs specifying fixed active regions requested by the walker, or null if none exist - */ - public GenomeLocSortedSet getPresetActiveRegions() { - return presetActiveRegions; - } - - // Do we actually want to operate on the context? - public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { - return true; // We are keeping all the reads - } - - public EnumSet desiredReadStates() { - return EnumSet.of(ActiveRegionReadState.PRIMARY); - } - - public final boolean wantsNonPrimaryReads() { - return desiredReadStates().contains(ActiveRegionReadState.NONPRIMARY); - } - - public boolean wantsExtendedReads() { - return desiredReadStates().contains(ActiveRegionReadState.EXTENDED); - } - - public boolean wantsUnmappedReads() { - return desiredReadStates().contains(ActiveRegionReadState.UNMAPPED); - } - - // Determine probability of active status over the AlignmentContext - @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) - public abstract ActivityProfileState isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); - - // Map over the ActiveRegion - public abstract MapType map(final ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker); - - public final GenomeLocSortedSet extendIntervals( final GenomeLocSortedSet intervals, final GenomeLocParser 
genomeLocParser, IndexedFastaSequenceFile reference ) { - final int activeRegionExtension = this.getClass().getAnnotation(ActiveRegionTraversalParameters.class).extension(); - final List allIntervals = new ArrayList(); - for( final GenomeLoc interval : intervals.toList() ) { - final int start = Math.max( 1, interval.getStart() - activeRegionExtension ); - final int stop = Math.min( reference.getSequenceDictionary().getSequence(interval.getContig()).getSequenceLength(), interval.getStop() + activeRegionExtension ); - allIntervals.add( genomeLocParser.createGenomeLoc(interval.getContig(), start, stop) ); - } - return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL); - } - - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java deleted file mode 100644 index c112d7d26..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Downsample.java +++ /dev/null @@ -1,47 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; - -import java.lang.annotation.*; - -/** - * Specifies a method for downsampling the reads passed to a given - * walker based on the input from that walker. - * - * @author hanna - * @version 0.1 - */ -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface Downsample { - DownsampleType by(); - int toCoverage() default -1; - double toFraction() default -1.0F; -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java deleted file mode 100644 index 96d2d5dad..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/DuplicateWalker.java +++ /dev/null @@ -1,57 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.filters.NotPrimaryAlignmentFilter; -import org.broadinstitute.gatk.engine.filters.UnmappedReadFilter; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.List; -import java.util.Set; - -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 2:52:28 PM - * To change this template use File | Settings | File Templates. - */ -@Requires({DataSource.READS,DataSource.REFERENCE}) -@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class}) -public abstract class DuplicateWalker extends Walker { - // Do we actually want to operate on the context? 
- public boolean filter(GenomeLoc loc, AlignmentContext context, Set> readSets ) { - return true; // We are keeping all the reads - } - - public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set> readSets ); - - // Given result of map function - public abstract ReduceType reduceInit(); - public abstract ReduceType reduce(MapType value, ReduceType sum); -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java deleted file mode 100644 index 1e7b0e54c..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/LocusWalker.java +++ /dev/null @@ -1,58 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.walkers; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.filters.DuplicateReadFilter; -import org.broadinstitute.gatk.engine.filters.FailsVendorQualityCheckFilter; -import org.broadinstitute.gatk.engine.filters.NotPrimaryAlignmentFilter; -import org.broadinstitute.gatk.engine.filters.UnmappedReadFilter; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; - -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 2:52:28 PM - * To change this template use File | Settings | File Templates. - */ -@By(DataSource.READS) -@Requires({DataSource.READS,DataSource.REFERENCE}) -@PartitionBy(PartitionType.LOCUS) -@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) -@Downsample(by = DownsampleType.BY_SAMPLE, toCoverage = 1000) -@RemoveProgramRecords -public abstract class LocusWalker extends Walker { - // Do we actually want to operate on the context? 
- public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return true; // We are keeping all the reads - } - - // Map over the org.broadinstitute.gatk.engine.contexts.AlignmentContext - public abstract MapType map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplex.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplex.java deleted file mode 100644 index e771d1ed8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplex.java +++ /dev/null @@ -1,44 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.walkers; - -import java.lang.annotation.*; - -/** - * Indicates that the class should be multiplexed according to the rules - * specified in the multiplexer. - * - * @author mhanna - * @version 0.1 - */ -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) -@Target({ElementType.FIELD}) -public @interface Multiplex { - public Class value(); - public String[] arguments() default {}; -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplexer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplexer.java deleted file mode 100644 index 969e288a5..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Multiplexer.java +++ /dev/null @@ -1,52 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.walkers; - -import java.util.Collection; - -/** - * An interface for multiplexing output streams. - * - * @author mhanna - * @version 0.1 - */ -public interface Multiplexer { - /** - * Generate a list of the potential outputs that can be created as a function of the other - * command-line arguments in this class. - * @return A collection of unique identifiers for the file multiplex. - */ - public Collection multiplex(); - - /** - * Transform the given command-line argument into a suitable form specific to this filename. - * @param multiplexedEntry Identifies the individual component of the multiplex. Will be a value in the collection - * passed back by multiplex(). - * @param argument The actual command-line argument, supplied for transformation. - * @return A transformed representation of the command-line argument. - */ - public String transformArgument(final T multiplexedEntry, final String argument); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java deleted file mode 100644 index 9528cf18e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/ReadWalker.java +++ /dev/null @@ -1,55 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers; - -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: mdepristo - * Date: Feb 22, 2009 - * Time: 2:52:28 PM - * To change this template use File | Settings | File Templates. - */ -@Requires({DataSource.READS, DataSource.REFERENCE}) -@PartitionBy(PartitionType.READ) -public abstract class ReadWalker extends Walker { - public boolean requiresOrderedReads() { return false; } - - // Do we actually want to operate on the context? - /** Must return true for reads that need to be processed. Reads, for which this method return false will - * be skipped by the engine and never passed to the walker. 
- */ - public boolean filter(ReferenceContext ref, GATKSAMRecord read) { - // We are keeping all the reads - return true; - } - - // Map over the org.broadinstitute.gatk.engine.contexts.AlignmentContext - public abstract MapType map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java deleted file mode 100644 index 31472fdfd..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/Walker.java +++ /dev/null @@ -1,177 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.walkers; - -import htsjdk.samtools.SAMSequenceDictionary; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.filters.MalformedReadFilter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.samples.Sample; -import org.broadinstitute.gatk.engine.samples.SampleDB; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.baq.BAQ; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; -import org.broadinstitute.gatk.utils.recalibration.BQSRMode; - -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: hanna - * Date: Mar 17, 2009 - * Time: 1:53:31 PM - * To change this template use File | Settings | File Templates. - */ -@ReadFilters(MalformedReadFilter.class) -@PartitionBy(PartitionType.NONE) -@Downsample(by = DownsampleType.NONE) -@BAQMode(QualityMode = BAQ.QualityMode.OVERWRITE_QUALS, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) -@BQSRMode(ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) -@DocumentedGATKFeature(groupName = "Uncategorized", extraDocs = {CommandLineGATK.class}) -public abstract class Walker { - final protected static Logger logger = Logger.getLogger(Walker.class); - private GenomeAnalysisEngine toolkit; - - protected Walker() { - } - - /** - * Set the toolkit, for peering into internal structures that can't - * otherwise be read. - * @param toolkit The genome analysis toolkit. - */ - public void setToolkit(GenomeAnalysisEngine toolkit) { - this.toolkit = toolkit; - } - - /** - * Retrieve the toolkit, for peering into internal structures that can't - * otherwise be read. 
Use sparingly, and discuss uses with software engineering - * team. - * @return The genome analysis toolkit. - */ - protected GenomeAnalysisEngine getToolkit() { - return toolkit; - } - - /** - * Gets the master sequence dictionary for this walker - * @link GenomeAnalysisEngine.getMasterSequenceDictionary - * @return - */ - protected SAMSequenceDictionary getMasterSequenceDictionary() { - return getToolkit().getMasterSequenceDictionary(); - } - - public SampleDB getSampleDB() { - return getToolkit().getSampleDB(); - } - - protected Sample getSample(final String id) { - return getToolkit().getSampleDB().getSample(id); - } - - /** - * (conceptual static) method that states whether you want to see reads piling up at a locus - * that contain a deletion at the locus. - * - * ref: ATCTGA - * read1: ATCTGA - * read2: AT--GA - * - * Normally, the locus iterator only returns a list of read1 at this locus at position 3, but - * if this function returns true, then the system will return (read1, read2) with offsets - * of (3, -1). The -1 offset indicates a deletion in the read. - * - * @return false if you don't want to see deletions, or true if you do - */ - public boolean includeReadsWithDeletionAtLoci() { - return false; - } - - public void initialize() { } - - /** - * A function for overloading in subclasses providing a mechanism to abort early from a walker. - * - * If this ever returns true, then the Traversal engine will stop executing map calls - * and start the process of shutting down the walker in an orderly fashion. - * @return - */ - public boolean isDone() { - return false; - } - - /** - * Provide an initial value for reduce computations. - * @return Initial value of reduce. - */ - public abstract ReduceType reduceInit(); - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. 
- */ - public abstract ReduceType reduce(MapType value, ReduceType sum); - - public void onTraversalDone(ReduceType result) { - logger.info("[REDUCE RESULT] Traversal result is: " + result); - } - - /** - * General interval reduce routine called after all of the traversals are done - * @param results interval reduce results - */ - public void onTraversalDone(List> results) { - for ( Pair result : results ) { - logger.info(String.format("[INTERVAL REDUCE RESULT] at %s ", result.getFirst())); - this.onTraversalDone(result.getSecond()); - } - } - - /** - * Return true if your walker wants to reduce each interval separately. Default is false. - * - * If you set this flag, several things will happen. - * - * The system will invoke reduceInit() once for each interval being processed, starting a fresh reduce - * Reduce will accumulate normally at each map unit in the interval - * However, onTraversalDone(reduce) will be called after each interval is processed. - * The system will call onTraversalDone( GenomeLoc -> reduce ), after all reductions are done, - * which is overloaded here to call onTraversalDone(reduce) for each location - * - * @return true if your walker wants to reduce each interval separately. 
- */ - public boolean isReduceByInterval() { - return false; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/BAMDiffableReader.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/BAMDiffableReader.java deleted file mode 100644 index 2c8cc7ae1..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/BAMDiffableReader.java +++ /dev/null @@ -1,119 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import htsjdk.samtools.SAMFileReader; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMRecordIterator; -import htsjdk.samtools.ValidationStringency; -import htsjdk.samtools.util.BlockCompressedInputStream; - -import java.io.*; -import java.util.Arrays; - - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 1:09 PM - * - * Class implementing diffnode reader for VCF - */ -public class BAMDiffableReader implements DiffableReader { - @Override - public String getName() { return "BAM"; } - - @Override - public DiffElement readFromFile(File file, int maxElementsToRead) { - final SAMFileReader reader = new SAMFileReader(file, null); // null because we don't want it to look for the index - reader.setValidationStringency(ValidationStringency.SILENT); - - DiffNode root = DiffNode.rooted(file.getName()); - SAMRecordIterator iterator = reader.iterator(); - - int count = 0; - while ( iterator.hasNext() ) { - final SAMRecord record = iterator.next(); - - // name is the read name + first of pair - String name = record.getReadName().replace('.', '_'); - if ( record.getReadPairedFlag() ) { - name += record.getFirstOfPairFlag() ? 
"_1" : "_2"; - } - - DiffNode readRoot = DiffNode.empty(name, root); - - // add fields - readRoot.add("NAME", record.getReadName()); - readRoot.add("FLAGS", record.getFlags()); - readRoot.add("RNAME", record.getReferenceName()); - readRoot.add("POS", record.getAlignmentStart()); - readRoot.add("MAPQ", record.getMappingQuality()); - readRoot.add("CIGAR", record.getCigarString()); - readRoot.add("RNEXT", record.getMateReferenceName()); - readRoot.add("PNEXT", record.getMateAlignmentStart()); - readRoot.add("TLEN", record.getInferredInsertSize()); - readRoot.add("SEQ", record.getReadString()); - readRoot.add("QUAL", record.getBaseQualityString()); - - for ( SAMRecord.SAMTagAndValue xt : record.getAttributes() ) { - readRoot.add(xt.tag, xt.value); - } - - // add record to root - if ( ! root.hasElement(name) ) - // protect ourselves from malformed files - root.add(readRoot); - count += readRoot.size(); - if ( count > maxElementsToRead && maxElementsToRead != -1) - break; - } - - reader.close(); - - return root.getBinding(); - } - - @Override - public boolean canRead(File file) { - final byte[] BAM_MAGIC = "BAM\1".getBytes(); - final byte[] buffer = new byte[BAM_MAGIC.length]; - try { - InputStream fstream = new BufferedInputStream(new FileInputStream(file)); - if ( !BlockCompressedInputStream.isValidFile(fstream) ) - return false; - final BlockCompressedInputStream BCIS = new BlockCompressedInputStream(fstream); - BCIS.read(buffer, 0, BAM_MAGIC.length); - BCIS.close(); - return Arrays.equals(buffer, BAM_MAGIC); - } catch ( IOException e ) { - return false; - } catch ( htsjdk.samtools.FileTruncatedException e ) { - return false; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffElement.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffElement.java deleted file mode 100644 index ebed91470..000000000 --- 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffElement.java +++ /dev/null @@ -1,125 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -/** - * Created by IntelliJ IDEA. 
- * User: depristo - * Date: 7/4/11 - * Time: 12:55 PM - * - * An interface that must be implemented to allow us to calculate differences - * between structured objects - */ -@Invariant({ - "name != null", - "value != null", - "parent != null || name.equals(\"ROOT\")", - "value == null || value.getBinding() == this"}) -public class DiffElement { - public final static DiffElement ROOT = new DiffElement(); - - final private String name; - final private DiffElement parent; - final private DiffValue value; - - /** - * For ROOT only - */ - private DiffElement() { - this.name = "ROOT"; - this.parent = null; - this.value = new DiffValue(this, "ROOT"); - } - - @Requires({"name != null", "parent != null", "value != null"}) - public DiffElement(String name, DiffElement parent, DiffValue value) { - if ( name.equals("ROOT") ) throw new IllegalArgumentException("Cannot use reserved name ROOT"); - this.name = name; - this.parent = parent; - this.value = value; - this.value.setBinding(this); - } - - @Ensures({"result != null"}) - public String getName() { - return name; - } - - public DiffElement getParent() { - return parent; - } - - @Ensures({"result != null"}) - public DiffValue getValue() { - return value; - } - - public boolean isRoot() { return this == ROOT; } - - @Ensures({"result != null"}) - @Override - public String toString() { - return getName() + "=" + getValue().toString(); - } - - public String toString(int offset) { - return (offset > 0 ? Utils.dupString(' ', offset) : 0) + getName() + "=" + getValue().toString(offset); - } - - @Ensures({"result != null"}) - public final String fullyQualifiedName() { - if ( isRoot() ) - return ""; - else if ( parent.isRoot() ) - return name; - else - return parent.fullyQualifiedName() + "." 
+ name; - } - - @Ensures({"result != null"}) - public String toOneLineString() { - return getName() + "=" + getValue().toOneLineString(); - } - - @Ensures({"result != null"}) - public DiffNode getValueAsNode() { - if ( getValue().isCompound() ) - return (DiffNode)getValue(); - else - throw new ReviewedGATKException("Illegal request conversion of a DiffValue into a DiffNode: " + this); - } - - public int size() { - return 1 + getValue().size(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngine.java deleted file mode 100644 index d10cfea8a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffEngine.java +++ /dev/null @@ -1,437 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.classloader.PluginManager; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.PrintStream; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 12:51 PM - * A generic engine for comparing tree-structured objects - * - */ -public class DiffEngine { - final protected static Logger logger = Logger.getLogger(DiffEngine.class); - - private final Map readers = new HashMap(); - - public DiffEngine() { - loadDiffableReaders(); - } - - // -------------------------------------------------------------------------------- - // - // difference calculation - // - // -------------------------------------------------------------------------------- - - public List diff(DiffElement master, DiffElement test) { - DiffValue masterValue = master.getValue(); - DiffValue testValue = test.getValue(); - - if ( masterValue.isCompound() && masterValue.isCompound() ) { - return diff(master.getValueAsNode(), test.getValueAsNode()); - } else if ( masterValue.isAtomic() && testValue.isAtomic() ) { - return diff(masterValue, testValue); - } else { - // structural difference in types. 
one is node, other is leaf - return Arrays.asList(new Difference(master, test)); - } - } - - public List diff(DiffNode master, DiffNode test) { - Set allNames = new HashSet(master.getElementNames()); - allNames.addAll(test.getElementNames()); - List diffs = new ArrayList(); - - for ( String name : allNames ) { - DiffElement masterElt = master.getElement(name); - DiffElement testElt = test.getElement(name); - if ( masterElt == null && testElt == null ) { - throw new ReviewedGATKException("BUG: unexpectedly got two null elements for field: " + name); - } else if ( masterElt == null || testElt == null ) { // if either is null, we are missing a value - // todo -- should one of these be a special MISSING item? - diffs.add(new Difference(masterElt, testElt)); - } else { - diffs.addAll(diff(masterElt, testElt)); - } - } - - return diffs; - } - - public List diff(DiffValue master, DiffValue test) { - if ( master.getValue().equals(test.getValue()) ) { - return Collections.emptyList(); - } else { - return Arrays.asList(new Difference(master.getBinding(), test.getBinding())); - } - } - - // -------------------------------------------------------------------------------- - // - // Summarizing differences - // - // -------------------------------------------------------------------------------- - - /** - * Emits a summary of the diffs to out. Suppose you have the following three differences: - * - * A.X.Z:1!=2 - * A.Y.Z:3!=4 - * B.X.Z:5!=6 - * - * The above is the itemized list of the differences. The summary looks for common differences - * in the name hierarchy, counts those shared elements, and emits the differences that occur - * in order of decreasing counts. - * - * So, in the above example, what are the shared elements? 
- * - * A.X.Z and B.X.Z share X.Z, so there's a *.X.Z with count 2 - * A.X.Z, A.Y.Z, and B.X.Z all share *.*.Z, with count 3 - * Each of A.X.Z, A.Y.Z, and B.X.Z are individually unique, with count 1 - * - * So we would emit the following summary: - * - * *.*.Z: 3 - * *.X.Z: 2 - * A.X.Z: 1 [specific difference: 1!=2] - * A.Y.Z: 1 [specific difference: 3!=4] - * B.X.Z: 1 [specific difference: 5!=6] - * - * The algorithm to accomplish this calculation is relatively simple. Start with all of the - * concrete differences. For each pair of differences A1.A2....AN and B1.B2....BN: - * - * find the longest common subsequence Si.Si+1...SN where Ai = Bi = Si - * If i == 0, then there's no shared substructure - * If i > 0, then generate the summarized value X = *.*...Si.Si+1...SN - * if X is a known summary, increment it's count, otherwise set its count to 1 - * - * Not that only pairs of the same length are considered as potentially equivalent - * - * @param params determines how we display the items - * @param diffs the list of differences to summarize - */ - public void reportSummarizedDifferences(List diffs, SummaryReportParams params ) { - printSummaryReport(summarizedDifferencesOfPaths(diffs, params.doPairwise, params.maxRawDiffsToSummarize), params ); - } - - final protected static String[] diffNameToPath(String diffName) { - return diffName.split("\\."); - } - - protected List summarizedDifferencesOfPathsFromString(List singletonDiffs) { - List diffs = new ArrayList(); - - for ( String diff : singletonDiffs ) { - diffs.add(new Difference(diff)); - } - - return summarizedDifferencesOfPaths(diffs, true, -1); - } - - /** - * Computes a minimum set of potential differences between all singleton differences - * in singletonDiffs. Employs an expensive pairwise O(n^2) algorithm. 
- * - * @param singletonDiffs - * @param maxRawDiffsToSummarize - * @return - */ - private Map initialPairwiseSummaries(final List singletonDiffs, - final int maxRawDiffsToSummarize) { - Map summaries = new HashMap(); - - // create the initial set of differences - for ( int i = 0; i < singletonDiffs.size(); i++ ) { - for ( int j = 0; j <= i; j++ ) { - Difference diffPath1 = singletonDiffs.get(i); - Difference diffPath2 = singletonDiffs.get(j); - if ( diffPath1.length() == diffPath2.length() ) { - int lcp = longestCommonPostfix(diffPath1.getParts(), diffPath2.getParts()); - String path = diffPath2.getPath(); - if ( lcp != 0 && lcp != diffPath1.length() ) - path = summarizedPath(diffPath2.getParts(), lcp); - Difference sumDiff = new Difference(path, diffPath2.getMaster(), diffPath2.getTest()); - sumDiff.setCount(0); - addSummaryIfMissing(summaries, sumDiff); - - if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize) - return summaries; - } - } - } - - return summaries; - } - - /** - * Computes the possible leaf differences among the singleton diffs. 
- * - * The leaf differences are all of the form *.*...*.X where all internal - * differences are wildcards and the only summarized difference considered - * interesting to compute is - * - * @param singletonDiffs - * @param maxRawDiffsToSummarize - * @return - */ - private Map initialLeafSummaries(final List singletonDiffs, - final int maxRawDiffsToSummarize) { - Map summaries = new HashMap(); - - // create the initial set of differences - for ( final Difference d : singletonDiffs ) { - final String path = summarizedPath(d.getParts(), 1); - Difference sumDiff = new Difference(path, d.getMaster(), d.getTest()); - sumDiff.setCount(0); - addSummaryIfMissing(summaries, sumDiff); - - if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize) - return summaries; - } - - return summaries; - } - - protected List summarizedDifferencesOfPaths(final List singletonDiffs, - final boolean doPairwise, - final int maxRawDiffsToSummarize) { - final Map summaries = doPairwise - ? initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize) - : initialLeafSummaries(singletonDiffs, maxRawDiffsToSummarize); - - // count differences - for ( Difference diffPath : singletonDiffs ) { - for ( Difference sumDiff : summaries.values() ) { - if ( sumDiff.matches(diffPath.getParts()) ) - sumDiff.incCount(); - } - } - - List sortedSummaries = new ArrayList(summaries.values()); - Collections.sort(sortedSummaries); - return sortedSummaries; - } - - protected void addSummaryIfMissing(Map summaries, Difference diff) { - if ( ! 
summaries.containsKey(diff.getPath()) ) { - summaries.put(diff.getPath(), diff); - } - } - - protected void printSummaryReport(List sortedSummaries, SummaryReportParams params ) { - List toShow = new ArrayList(); - int count = 0, count1 = 0; - for ( Difference diff : sortedSummaries ) { - if ( diff.getCount() < params.minSumDiffToShow ) - // in order, so break as soon as the count is too low - break; - - if ( params.maxItemsToDisplay != 0 && count++ > params.maxItemsToDisplay ) - break; - - if ( diff.getCount() == 1 ) { - count1++; - if ( params.maxCountOneItems != 0 && count1 > params.maxCountOneItems ) - break; - } - - toShow.add(diff); - } - - // if we want it in descending order, reverse the list - if ( ! params.descending ) { - Collections.reverse(toShow); - } - - // now that we have a specific list of values we want to show, display them - GATKReport report = new GATKReport(); - final String tableName = "differences"; - report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gatk/guide/article?id=1299 for more information", 3); - final GATKReportTable table = report.getTable(tableName); - table.addColumn("Difference"); - table.addColumn("NumberOfOccurrences"); - table.addColumn("ExampleDifference"); - for ( final Difference diff : toShow ) { - final String key = diff.getPath(); - table.addRowID(key, true); - table.set(key, "NumberOfOccurrences", diff.getCount()); - table.set(key, "ExampleDifference", diff.valueDiffString()); - } - GATKReport output = new GATKReport(table); - output.print(params.out); - } - - protected static int longestCommonPostfix(String[] diffPath1, String[] diffPath2) { - int i = 0; - for ( ; i < diffPath1.length; i++ ) { - int j = diffPath1.length - i - 1; - if ( ! 
diffPath1[j].equals(diffPath2[j]) ) - break; - } - return i; - } - - /** - * parts is [A B C D] - * commonPostfixLength: how many parts are shared at the end, suppose its 2 - * We want to create a string *.*.C.D - * - * @param parts the separated path values [above without .] - * @param commonPostfixLength - * @return - */ - protected static String summarizedPath(String[] parts, int commonPostfixLength) { - int stop = parts.length - commonPostfixLength; - if ( stop > 0 ) parts = parts.clone(); - for ( int i = 0; i < stop; i++ ) { - parts[i] = "*"; - } - return Utils.join(".", parts); - } - - // -------------------------------------------------------------------------------- - // - // plugin manager - // - // -------------------------------------------------------------------------------- - - public void loadDiffableReaders() { - List> drClasses = new PluginManager( DiffableReader.class ).getPlugins(); - - logger.info("Loading diffable modules:"); - for (Class drClass : drClasses ) { - logger.info("\t" + drClass.getSimpleName()); - - try { - DiffableReader dr = drClass.newInstance(); - readers.put(dr.getName(), dr); - } catch (InstantiationException e) { - throw new ReviewedGATKException("Unable to instantiate module '" + drClass.getSimpleName() + "'"); - } catch (IllegalAccessException e) { - throw new ReviewedGATKException("Illegal access error when trying to instantiate '" + drClass.getSimpleName() + "'"); - } - } - } - - protected Map getReaders() { - return readers; - } - - protected DiffableReader getReader(String name) { - return readers.get(name); - } - - /** - * Returns a reader appropriate for this file, or null if no such reader exists - * @param file - * @return - */ - public DiffableReader findReaderForFile(File file) { - for ( DiffableReader reader : readers.values() ) - if (reader.canRead(file) ) - return reader; - - return null; - } - - /** - * Returns true if reader appropriate for this file, or false if no such reader exists - * @param file - * 
@return - */ - public boolean canRead(File file) { - return findReaderForFile(file) != null; - } - - - public DiffElement createDiffableFromFile(File file) { - return createDiffableFromFile(file, -1); - } - - public DiffElement createDiffableFromFile(File file, int maxElementsToRead) { - DiffableReader reader = findReaderForFile(file); - if ( reader == null ) - throw new UserException("Unsupported file type: " + file); - else - return reader.readFromFile(file, maxElementsToRead); - } - - public static boolean simpleDiffFiles(File masterFile, File testFile, int maxElementsToRead, DiffEngine.SummaryReportParams params) { - DiffEngine diffEngine = new DiffEngine(); - - if ( diffEngine.canRead(masterFile) && diffEngine.canRead(testFile) ) { - DiffElement master = diffEngine.createDiffableFromFile(masterFile, maxElementsToRead); - DiffElement test = diffEngine.createDiffableFromFile(testFile, maxElementsToRead); - List diffs = diffEngine.diff(master, test); - diffEngine.reportSummarizedDifferences(diffs, params); - return true; - } else { - return false; - } - } - - public static class SummaryReportParams { - final PrintStream out; - final int maxItemsToDisplay; - final int maxCountOneItems; - final int minSumDiffToShow; - final int maxRawDiffsToSummarize; - final boolean doPairwise; - boolean descending = true; - - public SummaryReportParams(PrintStream out, - int maxItemsToDisplay, - int maxCountOneItems, - int minSumDiffToShow, - int maxRawDiffsToSummarize, - final boolean doPairwise) { - this.out = out; - this.maxItemsToDisplay = maxItemsToDisplay; - this.maxCountOneItems = maxCountOneItems; - this.minSumDiffToShow = minSumDiffToShow; - this.maxRawDiffsToSummarize = maxRawDiffsToSummarize; - this.doPairwise = doPairwise; - } - - public void setDescending(boolean descending) { - this.descending = descending; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNode.java 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNode.java deleted file mode 100644 index dde9ca50d..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffNode.java +++ /dev/null @@ -1,249 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import com.google.java.contract.Requires; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. 
- * User: depristo - * Date: 7/4/11 - * Time: 12:55 PM - * - * An interface that must be implemented to allow us to calculate differences - * between structured objects - */ -public class DiffNode extends DiffValue { - private Map getElementMap() { - return (Map)super.getValue(); - } - private static Map emptyElements() { return new HashMap(); } - - private DiffNode(Map elements) { - super(elements); - } - - private DiffNode(DiffElement binding, Map elements) { - super(binding, elements); - } - - // --------------------------------------------------------------------------- - // - // constructors - // - // --------------------------------------------------------------------------- - - public static DiffNode rooted(String name) { - return empty(name, DiffElement.ROOT); - } - - public static DiffNode empty(String name, DiffElement parent) { - DiffNode df = new DiffNode(emptyElements()); - DiffElement elt = new DiffElement(name, parent, df); - df.setBinding(elt); - return df; - } - - public static DiffNode empty(String name, DiffValue parent) { - return empty(name, parent.getBinding()); - } - - // --------------------------------------------------------------------------- - // - // accessors - // - // --------------------------------------------------------------------------- - - @Override - public boolean isAtomic() { return false; } - - public Collection getElementNames() { - return getElementMap().keySet(); - } - - public Collection getElements() { - return getElementMap().values(); - } - - private Collection getElements(boolean atomicOnly) { - List elts = new ArrayList(); - for ( DiffElement elt : getElements() ) - if ( (atomicOnly && elt.getValue().isAtomic()) || (! 
atomicOnly && elt.getValue().isCompound())) - elts.add(elt); - return elts; - } - - public Collection getAtomicElements() { - return getElements(true); - } - - public Collection getCompoundElements() { - return getElements(false); - } - - /** - * Returns the element bound to name, or null if no such binding exists - * @param name - * @return - */ - public DiffElement getElement(String name) { - return getElementMap().get(name); - } - - /** - * Returns true if name is bound in this node - * @param name - * @return - */ - public boolean hasElement(String name) { - return getElement(name) != null; - } - - // --------------------------------------------------------------------------- - // - // add - // - // --------------------------------------------------------------------------- - - @Requires("elt != null") - public void add(DiffElement elt) { - if ( getElementMap().containsKey(elt.getName()) ) - throw new IllegalArgumentException("Attempting to rebind already existing binding: " + elt + " node=" + this); - getElementMap().put(elt.getName(), elt); - } - - @Requires("elt != null") - public void add(DiffValue elt) { - add(elt.getBinding()); - } - - @Requires("elts != null") - public void add(Collection elts) { - for ( DiffElement e : elts ) - add(e); - } - - public void add(String name, Object value) { - add(new DiffElement(name, this.getBinding(), new DiffValue(value))); - } - - public int size() { - int count = 0; - for ( DiffElement value : getElements() ) - count += value.size(); - return count; - } - - // --------------------------------------------------------------------------- - // - // toString - // - // --------------------------------------------------------------------------- - - @Override - public String toString() { - return toString(0); - } - - @Override - public String toString(int offset) { - String off = offset > 0 ? 
Utils.dupString(' ', offset) : ""; - StringBuilder b = new StringBuilder(); - - b.append("(").append("\n"); - Collection atomicElts = getAtomicElements(); - for ( DiffElement elt : atomicElts ) { - b.append(elt.toString(offset + 2)).append('\n'); - } - - for ( DiffElement elt : getCompoundElements() ) { - b.append(elt.toString(offset + 4)).append('\n'); - } - b.append(off).append(")").append("\n"); - - return b.toString(); - } - - @Override - public String toOneLineString() { - StringBuilder b = new StringBuilder(); - - b.append('('); - List parts = new ArrayList(); - for ( DiffElement elt : getElements() ) - parts.add(elt.toOneLineString()); - b.append(Utils.join(" ", parts)); - b.append(')'); - - return b.toString(); - } - - // -------------------------------------------------------------------------------- - // - // fromString and toOneLineString - // - // -------------------------------------------------------------------------------- - - public static DiffElement fromString(String tree) { - return fromString(tree, DiffElement.ROOT); - } - - /** - * Doesn't support full tree structure parsing - * @param tree - * @param parent - * @return - */ - private static DiffElement fromString(String tree, DiffElement parent) { - // X=(A=A B=B C=(D=D)) - String[] parts = tree.split("=", 2); - if ( parts.length != 2 ) - throw new ReviewedGATKException("Unexpected tree structure: " + tree); - String name = parts[0]; - String value = parts[1]; - - if ( value.length() == 0 ) - throw new ReviewedGATKException("Illegal tree structure: " + value + " at " + tree); - - if ( value.charAt(0) == '(' ) { - if ( ! value.endsWith(")") ) - throw new ReviewedGATKException("Illegal tree structure. 
Missing ): " + value + " at " + tree); - String subtree = value.substring(1, value.length()-1); - DiffNode rec = DiffNode.empty(name, parent); - String[] subParts = subtree.split(" "); - for ( String subPart : subParts ) { - rec.add(fromString(subPart, rec.getBinding())); - } - return rec.getBinding(); - } else { - return new DiffValue(name, parent, value).getBinding(); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjects.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjects.java deleted file mode 100644 index c622e24f1..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffObjects.java +++ /dev/null @@ -1,276 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.commandline.Input; -import org.broadinstitute.gatk.utils.commandline.Output; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.RodWalker; -import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; -import org.broadinstitute.gatk.utils.help.HelpConstants; - -import java.io.File; -import java.io.PrintStream; -import java.util.List; - -/** - * A generic engine for comparing tree-structured objects - * - *

- * Compares two record-oriented files, itemizing specific difference between equivalent - * records in the two files. Reports both itemized and summarized differences. - *

- * - *

What are the summarized differences and the DiffObjectsWalker?

- * - *

- * The GATK contains a summarizing difference engine that compares hierarchical data structures to emit: - *

    - *
  • A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differences from the value in field A in record 1 in file G. - *
  • A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed. - *
- *

- * - *

- * The GATK contains a private walker DiffObjects that allows you access to the DiffEngine capabilities on the command line. Simply provide the walker with the master and test files and it will emit summarized differences for you. - *

- * - *

Why?

- * - *

- * The reason for this system is that it allows you to compare two structured files -- such as BAMs and VCFs -- for common differences among them. This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others. - *

- * - *

Input

- *

- * The DiffObjectsWalker works with BAM or VCF files. - *

- * - *

Output

- *

- * The DiffEngine system compares to two hierarchical data structures for specific differences in the values of named - * nodes. Suppose I have two trees: - *

- *     Tree1=(A=1 B=(C=2 D=3))
- *     Tree2=(A=1 B=(C=3 D=3 E=4))
- *     Tree3=(A=1 B=(C=4 D=3 E=4))
- * 
- *

- * where every node in the tree is named, or is a raw value (here all leaf values are integers). The DiffEngine - * traverses these data structures by name, identifies equivalent nodes by fully qualified names - * (Tree1.A is distinct from Tree2.A, and determines where their values are equal (Tree1.A=1, Tree2.A=1, so they are). - * These itemized differences are listed as: - *

- *     Tree1.B.C=2 != Tree2.B.C=3
- *     Tree1.B.C=2 != Tree3.B.C=4
- *     Tree2.B.C=3 != Tree3.B.C=4
- *     Tree1.B.E=MISSING != Tree2.B.E=4
- * 
- * - *

- * This conceptually very similar to the output of the unix command line tool diff. What's nice about DiffEngine though - * is that it computes similarity among the itemized differences and displays the count of differences names - * in the system. In the above example, the field C is not equal three times, while the missing E in Tree1 occurs - * only once. So the summary is: - * - *

- *     *.B.C : 3
- *     *.B.E : 1
- * 
- * - *

- * where the * operator indicates that any named field matches. This output is sorted by counts, and provides an - * immediate picture of the commonly occurring differences among the files. - *

- * Below is a detailed example of two VCF fields that differ because of a bug in the AC, AF, and AN counting routines, - * detected by the integrationtest integration (more below). You can see that in the although there are many specific - * instances of these differences between the two files, the summarized differences provide an immediate picture that - * the AC, AF, and AN fields are the major causes of the differences. - *

- * - *

- [testng] path                                                             count
- [testng] *.*.*.AC                                                         6
- [testng] *.*.*.AF                                                         6
- [testng] *.*.*.AN                                                         6
- [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC  1
- [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF  1
- [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN  1
- [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC  1
- [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF  1
- [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN  1
- [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC  1
- [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF  1
- [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN  1
- [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC  1
- 
- * - *

Caveat

- *

Because this is a walker, it requires that you pass a reference file. However the reference is not actually used, so it does not matter what you pass as reference.

- * - * - * @author Mark DePristo - * @since 7/4/11 - */ -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) -public class DiffObjects extends RodWalker { - /** - * Writes out a file of the DiffEngine format: - * - * See http://www.broadinstitute.org/gatk/guide/article?id=1299 for details. - */ - @Output(doc="File to which results should be written") - protected PrintStream out; - - /** - * The master file against which we will compare test. This is one of the two required - * files to do the comparison. Conceptually master is the original file contained the expected - * results, but this doesn't currently have an impact on the calculations, but might in the future. - */ - @Input(fullName="master", shortName="m", doc="Master file: expected results", required=true) - File masterFile; - - /** - * The test file against which we will compare to the master. This is one of the two required - * files to do the comparison. Conceptually test is the derived file from master, but this - * doesn't currently have an impact on the calculations, but might in the future. - */ - @Input(fullName="test", shortName="t", doc="Test file: new results to compare to the master file", required=true) - File testFile; - - /** - * The engine will read at most this number of objects from each of master and test files. This reduces - * the memory requirements for DiffObjects but does limit you to comparing at most this number of objects - */ - @Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) - int MAX_OBJECTS_TO_READ = -1; - - @Argument(fullName="maxRawDiffsToSummarize", shortName="maxRawDiffsToSummarize", doc="Max. number of differences to include in the summary. 
-1 [default] means unlimited", required=false) - int maxRawDiffsToSummary = -1; - - @Argument(fullName="doPairwise", shortName="doPairwise", doc="If provided, we will compute the minimum pairwise differences to summary, which can be extremely expensive", required=false) - boolean doPairwise = false; - - /** - * The max number of differences to display when summarizing. For example, if there are 10M differences, but - * maxDiffs is 10, then the comparison aborts after first ten summarized differences are shown. Note that - * the system shows differences sorted by frequency, so these 10 would be the most common between the two files. - * A value of 0 means show all possible differences. - */ - @Argument(fullName="maxDiffs", shortName="M", doc="Max. number of diffs to process", required=false) - int MAX_DIFFS = 0; - - /** - * The maximum number of singleton (occurs exactly once between the two files) to display when writing out - * the summary. Only applies if maxDiffs hasn't been exceeded. For example, if maxDiffs is 10 and maxCount1Diffs - * is 2 and there are 20 diffs with count > 1, then only 10 are shown, all of which have count above 1. - */ - @Argument(fullName="maxCount1Diffs", shortName="M1", doc="Max. number of diffs occuring exactly once in the file to process", required=false) - int MAX_COUNT1_DIFFS = 0; - - /** - * Only differences that occur more than minCountForDiff are displayed. For example, if minCountForDiff is 10, then - * a difference must occur at least 10 times between the two files to be shown. - */ - @Argument(fullName="minCountForDiff", shortName="MCFD", doc="Min number of observations for a records to display", required=false) - int minCountForDiff = 1; - - /** - * If provided, the system will write out the summarized, individual differences. May lead to enormous outputs, - * depending on how many differences are found. 
Note these are not sorted in any way, so if you have 10M - * common differences in the files, you will see 10M records, whereas the final summarize will just list the - * difference and its count of 10M. - */ - @Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false) - boolean showItemizedDifferences = false; - - @Argument(fullName="iterations", doc="Number of iterations to perform, should be 1 unless you are doing memory testing", required=false) - int iterations = 1; - - DiffEngine diffEngine; - - @Override - public void initialize() { - this.diffEngine = new DiffEngine(); - } - - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - @Override - public void onTraversalDone(Integer sum) { - if ( iterations > 1 ) { - for ( int i = 0; i < iterations; i++ ) { - DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, 20, 10, 0, -1, false); - boolean success = DiffEngine.simpleDiffFiles(masterFile, testFile, MAX_OBJECTS_TO_READ, params); - logger.info("Iteration " + i + " success " + success); - } - } else { - //out.printf("Reading master file %s%n", masterFile); - DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ); - logger.info(String.format("Read %d objects", master.size())); - //out.printf("Reading test file %s%n", testFile); - DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ); - logger.info(String.format("Read %d objects", test.size())); - -// out.printf("Master diff objects%n"); -// out.println(master.toString()); -// out.printf("Test diff objects%n"); -// out.println(test.toString()); - - List diffs = diffEngine.diff(master, test); - 
logger.info(String.format("Done computing diff with %d differences found", diffs.size())); - if ( showItemizedDifferences ) { - out.printf("Itemized results%n"); - for ( Difference diff : diffs ) - out.printf("DIFF: %s%n", diff.toString()); - } - - DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, - MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, - maxRawDiffsToSummary, doPairwise); - params.setDescending(false); - diffEngine.reportSummarizedDifferences(diffs, params); - logger.info(String.format("Done summarizing differences")); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffValue.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffValue.java deleted file mode 100644 index acec38356..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffValue.java +++ /dev/null @@ -1,90 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 12:55 PM - * - * An interface that must be implemented to allow us to calculate differences - * between structured objects - */ -public class DiffValue { - private DiffElement binding = null; - final private Object value; - - public DiffValue(Object value) { - this.value = value; - } - - public DiffValue(DiffElement binding, Object value) { - this.binding = binding; - this.value = value; - } - - public DiffValue(DiffValue parent, Object value) { - this(parent.getBinding(), value); - } - - public DiffValue(String name, DiffElement parent, Object value) { - this.binding = new DiffElement(name, parent, this); - this.value = value; - } - - public DiffValue(String name, DiffValue parent, Object value) { - this(name, parent.getBinding(), value); - } - - public DiffElement getBinding() { - return binding; - } - - protected void setBinding(DiffElement binding) { - this.binding = binding; - } - - public Object getValue() { - return value; - } - - public String toString() { - return getValue().toString(); - } - - public String toString(int offset) { - return toString(); - } - - public String toOneLineString() { - return getValue().toString(); - } - - public boolean isAtomic() { return true; } - public boolean isCompound() { return ! 
isAtomic(); } - public int size() { return 1; } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReader.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReader.java deleted file mode 100644 index 903a073e0..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/DiffableReader.java +++ /dev/null @@ -1,66 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.io.File; - -/** - * Created by IntelliJ IDEA. 
- * User: depristo - * Date: 7/4/11 - * Time: 1:09 PM - * - * Interface for readers creating diffable objects from a file - */ -public interface DiffableReader { - @Ensures("result != null") - /** - * Return the name of this DiffableReader type. For example, the VCF reader returns 'VCF' and the - * bam reader 'BAM' - */ - public String getName(); - - @Ensures("result != null") - @Requires("file != null") - /** - * Read up to maxElementsToRead DiffElements from file, and return them. - */ - public DiffElement readFromFile(File file, int maxElementsToRead); - - /** - * Return true if the file can be read into DiffElement objects with this reader. This should - * be uniquely true/false for all readers, as the system will use the first reader that can read the - * file. This routine should never throw an exception. The VCF reader, for example, looks at the - * first line of the file for the ##format=VCF4.1 header, and the BAM reader for the BAM_MAGIC value - * @param file - * @return - */ - @Requires("file != null") - public boolean canRead(File file); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/Difference.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/Difference.java deleted file mode 100644 index c8794a703..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/Difference.java +++ /dev/null @@ -1,137 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* 
The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -public class Difference implements Comparable { - final String path; // X.Y.Z - final String[] parts; - int count = 1; - DiffElement master = null , test = null; - - public Difference(String path) { - this.path = path; - this.parts = DiffEngine.diffNameToPath(path); - } - - public Difference(DiffElement master, DiffElement test) { - this(createPath(master, test), master, test); - } - - public Difference(String path, DiffElement master, DiffElement test) { - this(path); - this.master = master; - this.test = test; - } - - public String[] getParts() { - return parts; - } - - public void incCount() { count++; } - - public int getCount() { - return count; - } - - public void setCount(int count) { - this.count = count; - } - - /** - * The fully qualified path object A.B.C etc - * @return - */ - public String getPath() { - return path; - } - - /** - * @return the length of the parts of this summary - */ - public int length() { - return this.parts.length; - } - - /** - * Returns true if the string parts matches this summary. Matches are - * must be equal() everywhere where this summary isn't *. 
- * @param otherParts - * @return - */ - public boolean matches(String[] otherParts) { - if ( otherParts.length != length() ) - return false; - - // TODO optimization: can start at right most non-star element - for ( int i = 0; i < length(); i++ ) { - String part = parts[i]; - if ( ! part.equals("*") && ! part.equals(otherParts[i]) ) - return false; - } - - return true; - } - - @Override - public String toString() { - return String.format("%s:%d:%s", getPath(), getCount(), valueDiffString()); - } - - @Override - public int compareTo(Difference other) { - // sort first highest to lowest count, then by lowest to highest path - int countCmp = Integer.valueOf(count).compareTo(other.count); - return countCmp != 0 ? -1 * countCmp : path.compareTo(other.path); - } - - public String valueDiffString() { - if ( hasSpecificDifference() ) { - return String.format("%s!=%s", getOneLineString(master), getOneLineString(test)); - } else { - return "N/A"; - } - } - - private static String createPath(DiffElement master, DiffElement test) { - return (master == null ? test : master).fullyQualifiedName(); - } - - private static String getOneLineString(DiffElement elt) { - return elt == null ? 
"MISSING" : elt.getValue().toOneLineString(); - } - - public boolean hasSpecificDifference() { - return master != null || test != null; - } - - public DiffElement getMaster() { - return master; - } - - public DiffElement getTest() { - return test; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/GATKReportDiffableReader.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/GATKReportDiffableReader.java deleted file mode 100644 index 4a78448b6..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/GATKReportDiffableReader.java +++ /dev/null @@ -1,104 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportColumn; -import org.broadinstitute.gatk.engine.report.GATKReportTable; - -import java.io.File; -import java.io.FileReader; -import java.io.IOException; - - -/** - * Class implementing diffnode reader for GATKReports - */ - -// TODO Version check to be added at the report level - -public class GATKReportDiffableReader implements DiffableReader { - @Override - public String getName() { - return "GATKReport"; - } - - @Override - public DiffElement readFromFile(File file, int maxElementsToRead) { - DiffNode root = DiffNode.rooted(file.getName()); - try { - // one line reads the whole thing into memory - GATKReport report = new GATKReport(file); - - for (GATKReportTable table : report.getTables()) { - root.add(tableToNode(table, root)); - } - - return root.getBinding(); - } catch (Exception e) { - return null; - } - } - - private DiffNode tableToNode(GATKReportTable table, DiffNode root) { - DiffNode tableRoot = DiffNode.empty(table.getTableName(), root); - - tableRoot.add("Description", table.getTableDescription()); - tableRoot.add("NumberOfRows", table.getNumRows()); - - for ( GATKReportColumn column : table.getColumnInfo() ) { - DiffNode columnRoot = DiffNode.empty(column.getColumnName(), tableRoot); - - columnRoot.add("Width", column.getColumnFormat().getWidth()); - // NOTE: as the values are trimmed during parsing left/right alignment is not currently preserved - columnRoot.add("Displayable", true); - - for ( int i = 0; i < table.getNumRows(); i++ ) { - String name = column.getColumnName() + (i+1); - columnRoot.add(name, table.get(i, column.getColumnName()).toString()); - } - - tableRoot.add(columnRoot); - } - - return tableRoot; - } - - @Override - public boolean canRead(File file) { - try { - final String HEADER = GATKReport.GATKREPORT_HEADER_PREFIX; - final char[] buff = new 
char[HEADER.length()]; - final FileReader FR = new FileReader(file); - FR.read(buff, 0, HEADER.length()); - FR.close(); - String firstLine = new String(buff); - return firstLine.startsWith(HEADER); - } catch (IOException e) { - return false; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/VCFDiffableReader.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/VCFDiffableReader.java deleted file mode 100644 index 23b213e91..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/engine/walkers/diffengine/VCFDiffableReader.java +++ /dev/null @@ -1,145 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.walkers.diffengine; - -import org.apache.log4j.Logger; -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.FeatureReader; -import org.broadinstitute.gatk.utils.Utils; -import htsjdk.variant.vcf.*; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.VariantContext; - -import java.io.*; -import java.util.Iterator; -import java.util.Map; - - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/4/11 - * Time: 1:09 PM - * - * Class implementing diffnode reader for VCF - */ -public class VCFDiffableReader implements DiffableReader { - private static Logger logger = Logger.getLogger(VCFDiffableReader.class); - - @Override - public String getName() { return "VCF"; } - - @Override - public DiffElement readFromFile(File file, int maxElementsToRead) { - DiffNode root = DiffNode.rooted(file.getName()); - try { - // read the version line from the file - BufferedReader br = new BufferedReader(new FileReader(file)); - final String version = br.readLine(); - root.add("VERSION", version); - br.close(); - - final VCFCodec vcfCodec = new VCFCodec(); - vcfCodec.disableOnTheFlyModifications(); // must be read as state is stored in reader itself - - FeatureReader reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), vcfCodec, false); - VCFHeader header = (VCFHeader)reader.getHeader(); - for ( VCFHeaderLine headerLine : header.getMetaDataInInputOrder() ) { - String key = headerLine.getKey(); - if ( headerLine instanceof VCFIDHeaderLine) - key += "_" + ((VCFIDHeaderLine) headerLine).getID(); - if ( root.hasElement(key) ) - logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString()); - else - root.add(key, headerLine.toString()); - } - - int count = 0, nRecordsAtPos = 1; - String prevName = ""; - Iterator it = reader.iterator(); - while ( it.hasNext() ) { - VariantContext vc = it.next(); - String name = vc.getChr() + ":" + 
vc.getStart(); - if ( name.equals(prevName) ) { - name += "_" + ++nRecordsAtPos; - } else { - prevName = name; - } - DiffNode vcRoot = DiffNode.empty(name, root); - - // add fields - vcRoot.add("CHROM", vc.getChr()); - vcRoot.add("POS", vc.getStart()); - vcRoot.add("ID", vc.getID()); - vcRoot.add("REF", vc.getReference()); - vcRoot.add("ALT", vc.getAlternateAlleles()); - vcRoot.add("QUAL", vc.hasLog10PError() ? vc.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4); - vcRoot.add("FILTER", ! vc.filtersWereApplied() // needs null to differentiate between PASS and . - ? VCFConstants.MISSING_VALUE_v4 - : ( vc.getFilters().isEmpty() ? VCFConstants.PASSES_FILTERS_v4 : vc.getFilters()) ); - - // add info fields - for (Map.Entry attribute : vc.getAttributes().entrySet()) { - if ( ! attribute.getKey().startsWith("_") ) - vcRoot.add(attribute.getKey(), attribute.getValue()); - } - - for (Genotype g : vc.getGenotypes() ) { - DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot); - gRoot.add("GT", g.getGenotypeString()); - if ( g.hasGQ() ) gRoot.add("GQ", g.getGQ() ); - if ( g.hasDP() ) gRoot.add("DP", g.getDP() ); - if ( g.hasAD() ) gRoot.add("AD", Utils.join(",", g.getAD())); - if ( g.hasPL() ) gRoot.add("PL", Utils.join(",", g.getPL())); - if ( g.getFilters() != null ) gRoot.add("FT", g.getFilters()); - - for (Map.Entry attribute : g.getExtendedAttributes().entrySet()) { - if ( ! 
attribute.getKey().startsWith("_") ) - gRoot.add(attribute.getKey(), attribute.getValue()); - } - - vcRoot.add(gRoot); - } - - root.add(vcRoot); - count += vcRoot.size(); - if ( count > maxElementsToRead && maxElementsToRead != -1) - break; - } - - reader.close(); - } catch ( IOException e ) { - return null; - } - - return root.getBinding(); - } - - @Override - public boolean canRead(File file) { - return AbstractVCFCodec.canDecodeFile(file.getPath(), VCFCodec.VCF4_MAGIC_HEADER); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java index 43403ab79..99ce0af6d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/CatVariants.java @@ -40,7 +40,7 @@ import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.text.XReadLines; import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.bcf2.BCF2Codec; import org.broadinstitute.gatk.utils.collections.Pair; import htsjdk.variant.vcf.VCFCodec; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/ListAnnotations.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/ListAnnotations.java index 72d30defd..4bc91d6e3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/ListAnnotations.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/ListAnnotations.java @@ -27,9 +27,9 @@ package org.broadinstitute.gatk.tools; import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; import 
org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotationHelpUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.help.HelpUtils; /** * Utility program to print a list of available annotations @@ -66,7 +66,7 @@ public class ListAnnotations extends CommandLineProgram { @Override protected int execute() throws Exception { - HelpUtils.listAnnotations(); + AnnotationHelpUtils.listAnnotations(); return 0; } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java index 9127b5ee2..20240897c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalance.java @@ -32,9 +32,9 @@ import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.MathUtils; diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java index 9f5ee9c55..c5670068d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AlleleBalanceBySample.java @@ -32,9 +32,9 @@ import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineType; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java index 44579f9ba..0fefa4350 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/BaseCounts.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; 
-import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCountConstants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCountConstants.java deleted file mode 100644 index 67fc0a406..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCountConstants.java +++ /dev/null @@ -1,44 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.tools.walkers.annotator; - -import htsjdk.variant.vcf.VCFConstants; -import htsjdk.variant.vcf.VCFInfoHeaderLine; -import htsjdk.variant.vcf.VCFStandardHeaderLines; - - -/** - * Keys and descriptions for the common chromosome count annotations - */ -public class ChromosomeCountConstants { - - public static final String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; - - public static final VCFInfoHeaderLine[] descriptions = { - VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY), - VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY), - VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY) }; -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java index 05054a3f3..c76b2917d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/LowMQ.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import 
org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java index cf1323fe6..2fdfdfe6d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/MappingQualityZeroBySample.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java index 544feb784..9cb6eeeaa 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/NBaseCount.java @@ -25,9 +25,9 @@ package 
org.broadinstitute.gatk.tools.walkers.annotator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java index f514f67f0..e41f9d4b5 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/SnpEff.java @@ -28,15 +28,15 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; import 
org.broadinstitute.gatk.tools.walkers.annotator.interfaces.RodRequiringAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.exceptions.UserException; import htsjdk.variant.variantcontext.VariantContext; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java index 042ba487f..c0371647a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotator.java @@ -30,17 +30,17 @@ import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContextUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContextUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotationHelpUtils; +import 
org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.help.HelpUtils; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.VariantContext; @@ -206,7 +206,7 @@ public class VariantAnnotator extends RodWalker implements Ann public void initialize() { if ( LIST ) { - HelpUtils.listAnnotations(); + AnnotationHelpUtils.listAnnotations(); System.exit(0); } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java index 60c882417..dac0d7174 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java @@ -30,9 +30,9 @@ import com.google.java.contract.Requires; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import 
org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.commandline.RodBinding; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotator.java index 03f707f9d..e7e0b5431 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotator.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantOverlapAnnotator.java @@ -26,7 +26,7 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.GenomeLocParser; import htsjdk.variant.variantcontext.Allele; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/AnnotationHelpUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/AnnotationHelpUtils.java new file mode 100644 index 000000000..d8e3c8c3c --- /dev/null +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/AnnotationHelpUtils.java @@ -0,0 +1,56 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons 
to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.tools.walkers.annotator.interfaces; + +import org.broadinstitute.gatk.utils.classloader.PluginManager; + +import java.util.List; + +public class AnnotationHelpUtils { + + /** + * Simple method to print a list of available annotations. + */ + public static void listAnnotations() { + System.out.println("\nThis is a list of available Variant Annotations for use with tools such as UnifiedGenotyper, HaplotypeCaller and VariantAnnotator. Please see the Technical Documentation for more details about these annotations:"); + System.out.println("http://www.broadinstitute.org/gatk/tooldocs/"); + System.out.println("\nStandard annotations in the list below are marked with a '*'."); + List> infoAnnotationClasses = new PluginManager(InfoFieldAnnotation.class).getPlugins(); + System.out.println("\nAvailable annotations for the VCF INFO field:"); + for (int i = 0; i < infoAnnotationClasses.size(); i++) + System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(infoAnnotationClasses.get(i)) ? 
"*" : "") + infoAnnotationClasses.get(i).getSimpleName()); + System.out.println(); + List> genotypeAnnotationClasses = new PluginManager(GenotypeAnnotation.class).getPlugins(); + System.out.println("\nAvailable annotations for the VCF FORMAT field:"); + for (int i = 0; i < genotypeAnnotationClasses.size(); i++) + System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(genotypeAnnotationClasses.get(i)) ? "*" : "") + genotypeAnnotationClasses.get(i).getSimpleName()); + System.out.println(); + System.out.println("\nAvailable classes/groups of annotations:"); + for ( Class c : new PluginManager(AnnotationType.class).getInterfaces() ) + System.out.println("\t" + c.getSimpleName()); + System.out.println(); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/GenotypeAnnotation.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/GenotypeAnnotation.java index a6a81d758..41c2650ca 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/GenotypeAnnotation.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/GenotypeAnnotation.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator.interfaces; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import htsjdk.variant.vcf.VCFFormatHeaderLine; import htsjdk.variant.variantcontext.Genotype; diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/InfoFieldAnnotation.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/InfoFieldAnnotation.java index 55a30d8be..571055524 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/InfoFieldAnnotation.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/InfoFieldAnnotation.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.annotator.interfaces; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.variantcontext.VariantContext; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/BeagleOutputToVCF.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/BeagleOutputToVCF.java index 726ea9ba4..1c59dc51d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/BeagleOutputToVCF.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/BeagleOutputToVCF.java @@ -28,15 +28,15 @@ package org.broadinstitute.gatk.tools.walkers.beagle; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import 
org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.codecs.beagle.BeagleFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.*; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/ProduceBeagleInput.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/ProduceBeagleInput.java index dab5d160e..7e08defd4 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/ProduceBeagleInput.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/ProduceBeagleInput.java @@ -25,21 +25,21 @@ package org.broadinstitute.gatk.tools.walkers.beagle; +import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import 
org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.samples.Gender; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.tools.walkers.variantrecalibration.VQSRCalibrationCurve; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.VCFFilterHeaderLine; import htsjdk.variant.vcf.VCFHeader; @@ -278,7 +278,7 @@ public class ProduceBeagleInput extends RodWalker { log10Likelihoods = genotype.getLikelihoods().getAsVector(); // see if we need to randomly mask out genotype in this position. - if ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() <= insertedNoCallRate ) { + if ( Utils.getRandomGenerator().nextDouble() <= insertedNoCallRate ) { // we are masking out this genotype log10Likelihoods = isMaleOnChrX ? 
HAPLOID_FLAT_LOG10_LIKELIHOODS : DIPLOID_FLAT_LOG10_LIKELIHOODS; } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/VariantsToBeagleUnphased.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/VariantsToBeagleUnphased.java index c45ceb269..939a52ad0 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/VariantsToBeagleUnphased.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/beagle/VariantsToBeagleUnphased.java @@ -30,17 +30,17 @@ import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.Allele; diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java index 1757e7bc0..332486b1a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CallableLoci.java @@ -29,9 +29,9 @@ import org.broadinstitute.gatk.utils.commandline.Advanced; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.By; import org.broadinstitute.gatk.engine.walkers.DataSource; import org.broadinstitute.gatk.engine.walkers.LocusWalker; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CompareCallableLoci.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CompareCallableLoci.java index 9ab8555cd..0c94747b3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CompareCallableLoci.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CompareCallableLoci.java @@ -31,9 +31,9 @@ import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.utils.commandline.RodBinding; import 
org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.exceptions.UserException; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CoverageUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CoverageUtils.java index 7514fa5a8..e71860794 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CoverageUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/CoverageUtils.java @@ -27,7 +27,7 @@ package org.broadinstitute.gatk.tools.walkers.coverage; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java index 3fc2e597a..20c480901 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java +++ 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/DepthOfCoverage.java @@ -27,23 +27,20 @@ package org.broadinstitute.gatk.tools.walkers.coverage; import htsjdk.samtools.SAMReadGroupRecord; import org.broadinstitute.gatk.engine.walkers.*; -import org.broadinstitute.gatk.utils.commandline.Advanced; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.SeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrack; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.SeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrack; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; import org.broadinstitute.gatk.utils.BaseUtils; import 
org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.SampleUtils; import org.broadinstitute.gatk.utils.codecs.refseq.RefSeqCodec; import org.broadinstitute.gatk.utils.codecs.refseq.RefSeqFeature; import org.broadinstitute.gatk.utils.collections.Pair; @@ -51,6 +48,7 @@ import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; +import org.broadinstitute.gatk.utils.sam.ReadUtils; import java.io.File; import java.io.PrintStream; @@ -336,7 +334,7 @@ public class DepthOfCoverage extends LocusWalker getSamplesFromToolKit(DoCOutputType.Partition type) { HashSet partition = new HashSet(); if ( type == DoCOutputType.Partition.sample ) { - partition.addAll(SampleUtils.getSAMFileSamples(getToolkit())); + partition.addAll(ReadUtils.getSAMFileSamples(getToolkit().getSAMFileHeader())); } else if ( type == DoCOutputType.Partition.readgroup ) { for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { partition.add(rg.getSample()+"_rg_"+rg.getReadGroupId()); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/GCContentByInterval.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/GCContentByInterval.java index a23cfe36b..a793ed07d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/GCContentByInterval.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/coverage/GCContentByInterval.java @@ -28,9 +28,9 @@ package org.broadinstitute.gatk.tools.walkers.coverage; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import 
org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.collections.Pair; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/CoveredByNSamplesSites.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/CoveredByNSamplesSites.java index 370cea281..e596cdd70 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/CoveredByNSamplesSites.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/CoveredByNSamplesSites.java @@ -35,9 +35,9 @@ import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycle.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycle.java index 910afa4d6..b0b18abc6 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycle.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ErrorRatePerCycle.java @@ -28,11 +28,11 @@ package org.broadinstitute.gatk.tools.walkers.diagnostics; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; import org.broadinstitute.gatk.engine.walkers.LocusWalker; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.QualityUtils; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadGroupProperties.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadGroupProperties.java index e9856de0d..ba4542768 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadGroupProperties.java +++ 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadGroupProperties.java @@ -29,10 +29,10 @@ import htsjdk.samtools.SAMReadGroupRecord; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import org.broadinstitute.gatk.utils.Median; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java index a632f252e..438b38e36 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/diagnostics/ReadLengthDistribution.java @@ -28,10 +28,10 @@ package org.broadinstitute.gatk.tools.walkers.diagnostics; import htsjdk.samtools.SAMReadGroupRecord; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.report.GATKReport; -import 
org.broadinstitute.gatk.engine.report.GATKReportTable; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/examples/GATKDocsExample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/examples/GATKDocsExample.java index 80fca6787..e026b286b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/examples/GATKDocsExample.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/examples/GATKDocsExample.java @@ -27,9 +27,9 @@ package org.broadinstitute.gatk.tools.walkers.examples; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Hidden; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; /** diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/examples/GATKPaperGenotyper.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/examples/GATKPaperGenotyper.java index 8cfc14e41..bdddc5040 100644 --- 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/examples/GATKPaperGenotyper.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/examples/GATKPaperGenotyper.java @@ -28,9 +28,9 @@ package org.broadinstitute.gatk.tools.walkers.examples; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.LocusWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; import org.broadinstitute.gatk.utils.genotyper.DiploidGenotype; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java index 8fd41524f..c6e451cd2 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaAlternateReferenceMaker.java @@ -31,13 +31,13 @@ import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; 
-import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java index 8459506ba..562f00bf4 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaReferenceMaker.java @@ -28,9 +28,9 @@ package org.broadinstitute.gatk.tools.walkers.fasta; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RefWalker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.collections.Pair; diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java index 22a592b77..e5178dd74 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/fasta/FastaStats.java @@ -27,9 +27,9 @@ package org.broadinstitute.gatk.tools.walkers.fasta; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RefWalker; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/FiltrationContext.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/FiltrationContext.java index 513763b68..c5302b9a2 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/FiltrationContext.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/FiltrationContext.java @@ -25,7 +25,7 @@ package org.broadinstitute.gatk.tools.walkers.filters; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import htsjdk.variant.variantcontext.VariantContext; diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java index 3988498c0..33009b5b6 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/filters/VariantFiltration.java @@ -29,16 +29,16 @@ import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.Window; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleList.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleList.java 
deleted file mode 100644 index 3b0adeda4..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleList.java +++ /dev/null @@ -1,41 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.tools.walkers.genotyper; - -import htsjdk.variant.variantcontext.Allele; - -/** - * Created by valentin on 5/12/14. 
- */ -public interface AlleleList { - - public int alleleCount(); - - public int alleleIndex(final A allele); - - public A alleleAt(final int index); - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListPermutation.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListPermutation.java deleted file mode 100644 index 8d95fa43e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListPermutation.java +++ /dev/null @@ -1,35 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.tools.walkers.genotyper; - -import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.utils.collections.Permutation; - -/** - * Marks allele list permutation implementation classes. 
- */ -public interface AlleleListPermutation extends Permutation, AlleleList { -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUtils.java deleted file mode 100644 index 4f40f51ab..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/AlleleListUtils.java +++ /dev/null @@ -1,334 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.tools.walkers.genotyper; - -import htsjdk.variant.variantcontext.Allele; - -import java.util.AbstractList; -import java.util.List; - -/** - * Utils operations on {@link AlleleList} instances. 
- * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class AlleleListUtils { - - @SuppressWarnings("unchecked") - private static final AlleleList EMPTY_LIST = new AlleleList() { - @Override - public int alleleCount() { - return 0; - } - - @Override - public int alleleIndex(final Allele allele) { - return -1; - } - - @Override - public Allele alleleAt(final int index) { - throw new IllegalArgumentException("allele index is out of range"); - } - }; - - /** - * Checks whether two allele lists are in fact the same. - * @param first one list to compare. - * @param second another list to compare. - * - * @throws IllegalArgumentException if if either list is {@code null}. - * - * @return {@code true} iff both list are equal. - */ - public static boolean equals(final AlleleList first, final AlleleList second) { - if (first == null || second == null) - throw new IllegalArgumentException("no null list allowed"); - final int alleleCount = first.alleleCount(); - if (alleleCount != second.alleleCount()) - return false; - - for (int i = 0; i < alleleCount; i++) { - final A firstSample = first.alleleAt(i); - if (firstSample == null) - throw new IllegalStateException("no null samples allowed in sample-lists: first list at " + i); - final A secondSample = second.alleleAt(i); - if (secondSample == null) - throw new IllegalArgumentException("no null samples allowed in sample-list: second list at " + i); - if (!firstSample.equals(secondSample)) - return false; - } - - return true; - } - - /** - * Resolves the index of the reference allele in an allele-list. - * - *

- * If there is no reference allele, it returns -1. If there is more than one reference allele, - * it returns the first occurrence (lowest index). - *

- * - * @param list the search allele-list. - * @param
allele component type. - * - * @throws IllegalArgumentException if {@code list} is {@code null}. - * - * @return -1 if there is no reference allele, or a values in [0,{@code list.alleleCount()}). - */ - public static int indexOfReference(final AlleleList list) { - if (list == null) - throw new IllegalArgumentException("the input list cannot be null"); - final int alleleCount = list.alleleCount(); - for (int i = 0; i < alleleCount; i++) - if (list.alleleAt(i).isReference()) - return i; - return -1; - } - - - /** - * Returns a {@link java.util.List} unmodifiable view of a allele-list - * @param list the sample-list to wrap. - * - * @throws IllegalArgumentException if {@code list} is {@code null}. - * - * @return never {@code null}. - */ - public static List asList(final AlleleList list) { - if (list == null) - throw new IllegalArgumentException("the list cannot be null"); - return new AsList(list); - } - - /** - * Returns an unmodifiable empty allele-list. - * @param the allele class. - * @return never {@code null}. - */ - @SuppressWarnings("unchecked") - public static final AlleleList emptyList() { - return EMPTY_LIST; - } - - /** - * Simple list view of a sample-list. - */ - private static class AsList extends AbstractList { - - private final AlleleList list; - - private AsList(final AlleleList list) { - this.list = list; - - } - - @Override - public A get(int index) { - return list.alleleAt(index); - } - - @Override - public int size() { - return list.alleleCount(); - } - } - - - /** - * Returns a permutation between two allele lists. - * @param original the original allele list. - * @param target the target allele list. - * @param the allele type. 
- * - * @throws IllegalArgumentException if {@code original} or {@code target} is {@code null}, or - * elements in {@code target} is not contained in {@code original} - * - * @return never {@code null} - */ - public static AlleleListPermutation permutation(final AlleleList original, final AlleleList target) { - if (equals(original,target)) - return new NonPermutation<>(original); - else - return new ActualPermutation<>(original,target); - } - - private static class NonPermutation implements AlleleListPermutation { - - private final AlleleList list; - - public NonPermutation(final AlleleList original) { - list = original; - } - - @Override - public boolean isPartial() { - return false; - } - - @Override - public boolean isNonPermuted() { - return true; - } - - @Override - public int toIndex(int fromIndex) { - return fromIndex; - } - - @Override - public int fromIndex(int toIndex) { - return toIndex; - } - - @Override - public int fromSize() { - return list.alleleCount(); - } - - @Override - public int toSize() { - return list.alleleCount(); - } - - @Override - public List fromList() { - return asList(list); - } - - @Override - public java.util.List toList() { - return asList(list); - } - - - @Override - public int alleleCount() { - return list.alleleCount(); - } - - @Override - public int alleleIndex(final A allele) { - return list.alleleIndex(allele); - } - - @Override - public A alleleAt(final int index) { - return list.alleleAt(index); - } - } - - private static class ActualPermutation implements AlleleListPermutation { - - private final AlleleList from; - - private final AlleleList to; - - private final int[] fromIndex; - - private final boolean nonPermuted; - - private final boolean isPartial; - - private ActualPermutation(final AlleleList original, final AlleleList target) { - this.from = original; - this.to = target; - final int toSize = target.alleleCount(); - final int fromSize = original.alleleCount(); - if (fromSize < toSize) - throw new 
IllegalArgumentException("target allele list is not a permutation of the original allele list"); - - fromIndex = new int[toSize]; - boolean nonPermuted = fromSize == toSize; - this.isPartial = !nonPermuted; - for (int i = 0; i < toSize; i++) { - final int originalIndex = original.alleleIndex(target.alleleAt(i)); - if (originalIndex < 0) - throw new IllegalArgumentException("target allele list is not a permutation of the original allele list"); - fromIndex[i] = originalIndex; - nonPermuted &= originalIndex == i; - } - - this.nonPermuted = nonPermuted; - } - - @Override - public boolean isPartial() { - return isPartial; - } - - @Override - public boolean isNonPermuted() { - return nonPermuted; - } - - @Override - public int toIndex(int fromIndex) { - return to.alleleIndex(from.alleleAt(fromIndex)); - } - - @Override - public int fromIndex(int toIndex) { - return fromIndex[toIndex]; - } - - @Override - public int fromSize() { - return from.alleleCount(); - } - - @Override - public int toSize() { - return to.alleleCount(); - } - - @Override - public List fromList() { - return asList(from); - } - - @Override - public List toList() { - return asList(to); - } - - @Override - public int alleleCount() { - return to.alleleCount(); - } - - @Override - public int alleleIndex(final A allele) { - return to.alleleIndex(allele); - } - - @Override - public A alleleAt(final int index) { - return to.alleleAt(index); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedAlleleList.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedAlleleList.java deleted file mode 100644 index 9238af7f6..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedAlleleList.java +++ /dev/null @@ -1,95 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy 
of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.tools.walkers.genotyper; - -import htsjdk.variant.variantcontext.Allele; -import org.broadinstitute.gatk.utils.collections.IndexedSet; - -import java.util.Collection; - -/** - * Allele list implementation using and indexed-set. - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class IndexedAlleleList implements AlleleList { - - private final IndexedSet alleles; - - /** - * Constructs a new empty allele-list - */ - public IndexedAlleleList() { - alleles = new IndexedSet<>(); - } - - /** - * Constructs a new allele-list from an array of alleles. - * - *

- * Repeats in the input array will be ignored (keeping the first one). The order of alleles in the - * resulting list is the same as in the natural traversal of the input collection. - * - *

- * @param alleles the original allele array - * - * @throws java.lang.IllegalArgumentException if {@code alleles} is {@code null} or contains {@code null}s. - */ - public IndexedAlleleList(final A ... alleles) { - this.alleles = new IndexedSet<>(alleles); - } - - /** - * Constructs a new allele-list from a collection of alleles. - * - *

- * Repeats in the input collection will be ignored (keeping the first one). The order of alleles in the - * resulting list is the same as in the natural traversal of the input collection. - * - *

- * @param alleles the original allele collection - * - * @throws java.lang.IllegalArgumentException if {@code alleles} is {@code null} or contains {@code null}s. - */ - public IndexedAlleleList(final Collection
alleles) { - this.alleles = new IndexedSet<>(alleles); - } - - @Override - public int alleleCount() { - return alleles.size(); - } - - @Override - public int alleleIndex(final A allele) { - return alleles.indexOf(allele); - } - - @Override - public A alleleAt(final int index) { - return alleles.get(index); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedSampleList.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedSampleList.java deleted file mode 100644 index 94022c827..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/IndexedSampleList.java +++ /dev/null @@ -1,96 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.tools.walkers.genotyper; - -import org.broadinstitute.gatk.utils.collections.IndexedSet; - -import java.util.Collection; - -/** - * Simple implementation of a sample-list using and indexed-set. - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class IndexedSampleList implements SampleList { - - private final IndexedSet samples; - - /** - * Constructs an empty sample-list. - */ - public IndexedSampleList() { - samples = new IndexedSet<>(0); - } - - /** - * Constructs a sample-list from a collection of samples. - * - *

- * Repeats in the input collection are ignored (just the first occurrence is kept). - * Sample names will be sorted based on the traversal order - * of the original collection. - *

- * - * @param samples input sample collection. - * - * @throws IllegalArgumentException if {@code samples} is {@code null} or it contains {@code nulls}. - */ - public IndexedSampleList(final Collection samples) { - this.samples = new IndexedSet<>(samples); - } - - /** - * Constructs a sample-list from an array of samples. - * - *

- * Repeats in the input array are ignored (just the first occurrence is kept). - * Sample names will be sorted based on the traversal order - * of the original array. - *

- * - * @param samples input sample array. - * - * @throws IllegalArgumentException if {@code samples} is {@code null} or it contains {@code nulls}. - */ - public IndexedSampleList(final String ... samples) { - this.samples = new IndexedSet<>(samples); - } - - @Override - public int sampleCount() { - return samples.size(); - } - - @Override - public int sampleIndex(final String sample) { - return samples.indexOf(sample); - } - - @Override - public String sampleAt(int sampleIndex) { - return samples.get(sampleIndex); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleList.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleList.java deleted file mode 100644 index 29cb4287c..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleList.java +++ /dev/null @@ -1,42 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.tools.walkers.genotyper; - -/** - * A indexed set of samples. - * - *

- * Implementing classes must guarantee that the sample list will remain constant through the life of the object. - *

- */ -public interface SampleList { - - public int sampleCount(); - - public int sampleIndex(final String sample); - - public String sampleAt(final int sampleIndex); -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUtils.java deleted file mode 100644 index 2071f5de0..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/SampleListUtils.java +++ /dev/null @@ -1,224 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.tools.walkers.genotyper; - -import java.util.*; - -/** - * Some utility operations on sample lists. 
- * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class SampleListUtils { - - private static final SampleList EMPTY_LIST = new SampleList() { - - @Override - public int sampleCount() { - return 0; - } - - @Override - public int sampleIndex(String sample) { - return -1; - } - - @Override - public String sampleAt(final int sampleIndex) { - throw new IllegalArgumentException("index is out of valid range"); - } - }; - - /** - * Empty list. - * - * @return never {@code null} - */ - public static SampleList emptyList() { - return EMPTY_LIST; - } - - /** - * Checks whether two sample lists are in fact the same. - * @param first one list to compare. - * @param second another list to compare. - * - * @throws IllegalArgumentException if if either list is {@code null}. - * - * @return {@code true} iff both list are equal. - */ - public static boolean equals(final SampleList first, final SampleList second) { - if (first == null || second == null) - throw new IllegalArgumentException("no null list allowed"); - final int sampleCount = first.sampleCount(); - if (sampleCount != second.sampleCount()) - return false; - - for (int i = 0; i < sampleCount; i++) { - final String firstSample = first.sampleAt(i); - if (firstSample == null) - throw new IllegalStateException("no null samples allowed in sample-lists: first list at " + i); - final String secondSample = second.sampleAt(i); - if (secondSample == null) - throw new IllegalArgumentException("no null samples allowed in sample-list: second list at " + i); - if (!firstSample.equals(secondSample)) - return false; - } - return true; - } - - /** - * Returns a {@link List} unmodifiable view of a sample-list - * @param list the sample-list to wrap. - * - * @throws IllegalArgumentException if {@code list} is {@code null}. - * - * @return never {@code null}. 
- */ - public static List asList(final SampleList list) { - if (list == null) - throw new IllegalArgumentException("the list cannot be null"); - return new AsList(list); - } - - /** - * Returns a {@link Set} unmodifiable view of the sample-list - * - * @param list the sample-list to wrap. - * - * @throws IllegalArgumentException if {@code list} is {@code null} - */ - public static Set asSet(final SampleList list) { - if (list == null) - throw new IllegalArgumentException("the list cannot be null"); - return new AsSet(list); - } - - /** - * Creates a list with a single sample. - * - * @param sampleName the sample name. - * @return never {@code sampleName} - */ - public static SampleList singletonList(final String sampleName) { - if (sampleName == null) - throw new IllegalArgumentException("the sample name cannot be null"); - return new SampleList() { - - @Override - public int sampleCount() { - return 1; - } - - @Override - public int sampleIndex(final String sample) { - return sampleName.equals(sample) ? 0 : -1; - } - - @Override - public String sampleAt(int sampleIndex) { - if (sampleIndex == 0) - return sampleName; - throw new IllegalArgumentException("index is out of bounds"); - } - }; - } - - /** - * Simple list view of a sample-list. 
- */ - private static class AsList extends AbstractList { - - private final SampleList list; - - private AsList(final SampleList list) { - this.list = list; - - } - - @Override - public String get(int index) { - return list.sampleAt(index); - } - - @Override - public int size() { - return list.sampleCount(); - } - } - - /** - * Simple set view of a sample-list - */ - private static class AsSet extends AbstractSet { - - private final SampleList list; - - private AsSet(final SampleList list) { - this.list = list; - - } - - @Override - public Iterator iterator() { - return new Iterator() { - private int index = 0; - - @Override - public boolean hasNext() { - return index < list.sampleCount(); - } - - @Override - public String next() { - if (index >= list.sampleCount()) - throw new NoSuchElementException("iterating beyond sample list end"); - return list.sampleAt(index++); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("unsupported operation exception"); - } - }; - } - - @Override - public int size() { - return list.sampleCount(); - } - - @Override - public boolean contains(final Object obj) { - if (obj == null) - return false; - else if (obj instanceof String) - return list.sampleIndex(((String)obj)) >= 0; - else - return false; - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDoclet.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDoclet.java new file mode 100644 index 000000000..831655f56 --- /dev/null +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDoclet.java @@ -0,0 +1,64 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* 
copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.tools.walkers.help; + +import com.sun.javadoc.RootDoc; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.tools.walkers.qc.DocumentationTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.help.DocumentedGATKFeatureHandler; +import org.broadinstitute.gatk.utils.help.GATKDoclet; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +/** + * GATKDocs for walkers. + * Specifically, allows testing of documentation. + */ +public class WalkerDoclet extends GATKDoclet { + /** + * Any class that's in this list will be included in the documentation + * when the -test argument is provided. Useful for debugging. 
+ */ + private static final List> testOnlyKeepers = Arrays.asList( + DocumentationTest.class, CommandLineGATK.class, UserException.class); + + @Override + protected List> getTestOnlyKeepers() { + return testOnlyKeepers; + } + + @Override + protected DocumentedGATKFeatureHandler createDocumentedGATKFeatureHandler() { + return new WalkerDocumentationHandler(); + } + + public static boolean start(RootDoc rootDoc) throws IOException { + return new WalkerDoclet().startProcessDocs(rootDoc); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDocumentationHandler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDocumentationHandler.java new file mode 100644 index 000000000..bd31f09d8 --- /dev/null +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/help/WalkerDocumentationHandler.java @@ -0,0 +1,345 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.tools.walkers.help; + +import org.apache.commons.lang.StringUtils; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.engine.walkers.*; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; +import org.broadinstitute.gatk.utils.help.GATKDocUtils; +import org.broadinstitute.gatk.utils.help.GenericDocumentationHandler; +import org.broadinstitute.gatk.utils.help.HelpConstants; + +import java.lang.annotation.Annotation; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; + +public class WalkerDocumentationHandler extends GenericDocumentationHandler { + private final static String CMDLINE_GATK_URL = HelpConstants.GATK_DOCS_URL + "org_broadinstitute_gatk_engine_CommandLineGATK.php"; + + @Override + protected CommandLineProgram createCommandLineProgram() { + return new CommandLineGATK(); + } + + /** + * Umbrella function that groups the collection of values for specific annotations applied to an + * instance of class c. Lists of collected values are added directly to the "toProcess" object. + * Requires being able to instantiate the class. 
+ * + * @param classToProcess the object to instantiate and query for the annotation + * @param root the root of the document handler, to which we'll store collected annotations + */ + @Override + protected void getClazzAnnotations(Class classToProcess, Map root) { + // + // attempt to instantiate the class + final Object instance = makeInstanceIfPossible(classToProcess); + if (instance != null) { + final Class myClass = instance.getClass(); + // Get parallelism options + final HashSet> parallelOptions = getParallelism(myClass, new HashSet>()); + root.put("parallel", parallelOptions); + // Get annotation info (what type of annotation, standard etc.) + final HashSet annotInfo = getAnnotInfo(myClass, new HashSet()); + root.put("annotinfo", StringUtils.join(annotInfo, ", ")); + // Get annotation field (whether it goes in INFO or FORMAT) + root.put("annotfield", getAnnotField(myClass)); + // Get walker type if applicable + root.put("walkertype", getWalkerType(myClass)); + // Get partition type if applicable + root.put("partitiontype", getPartitionType(myClass)); + // Get read filter annotations (ReadFilters) if applicable + final HashSet> bucket= getReadFilters(myClass, new HashSet>()); + root.put("readfilters", bucket); + // Get default downsampling settings + final HashMap dsSettings = getDownSamplingSettings(myClass, new HashMap()); + root.put("downsampling", dsSettings); + // Get reference window size settings + final HashMap refwindow = getRefWindow(myClass, new HashMap()); + root.put("refwindow", refwindow); + // Get ActiveRegion size settings + final HashMap activeRegion = getActiveRegion(myClass, new HashMap()); + root.put("activeregion", activeRegion); + // Get annotation header line description if applicable + final Object annotDescriptLines = getAnnotDescript(instance, myClass); + root.put("annotdescript", annotDescriptLines); + + // anything else? 
+ } else { + // put empty items to avoid blowups + root.put("parallel", new HashSet()); + root.put("annotinfo", ""); + root.put("annotfield", ""); + root.put("walkertype", ""); + root.put("partitiontype", ""); + root.put("readfilters", new HashSet>()); + root.put("downsampling", new HashMap()); + root.put("refwindow", new HashMap()); + root.put("activeregion", new HashMap()); + root.put("annotdescript", new ArrayList>()); + } + } + + /** + * Utility function that looks up annotation descriptions if applicable. + * + * @param myClass the class to query + * @return a hash map of descriptions, otherwise an empty map + */ + private Object getAnnotDescript(Object instance, Class myClass) { + // + // Check if the class has the method we want + for (Method classMethod : myClass.getMethods()) { + if (classMethod.toString().contains("getDescriptions") && classMethod.toString().contains("annotator")) { + try { + return classMethod.invoke(instance); + } catch (IllegalArgumentException e) { + } catch (IllegalAccessException e) { + } catch (InvocationTargetException e) { + } + } + } + return null; + } + + /** + * Utility function that checks which parallelism options are available for an instance of class c. 
+ * + * @param myClass the class to query for the interfaces + * @param parallelOptions an empty HashSet in which to collect the info + * @return a hash set of parallelism options, otherwise an empty set + */ + private HashSet> getParallelism(Class myClass, HashSet> parallelOptions) { + // + // Retrieve interfaces + Class[] implementedInterfaces = myClass.getInterfaces(); + for (Class intfClass : implementedInterfaces) { + final HashMap nugget = new HashMap(); + if (intfClass.getSimpleName().equals("TreeReducible")) { + nugget.put("name", intfClass.getSimpleName()); + nugget.put("arg", HelpConstants.ARG_TREEREDUCIBLE); + nugget.put("link", CMDLINE_GATK_URL + "#" + HelpConstants.ARG_TREEREDUCIBLE); + } else if (intfClass.getSimpleName().equals("NanoSchedulable")) { + nugget.put("name", intfClass.getSimpleName()); + nugget.put("arg", HelpConstants.ARG_NANOSCHEDULABLE); + nugget.put("link", CMDLINE_GATK_URL + "#" + HelpConstants.ARG_NANOSCHEDULABLE); + } else { + continue; + } + parallelOptions.add(nugget); + } + // Look up superclasses recursively + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Object")) { + return parallelOptions; + } + return getParallelism(mySuperClass, parallelOptions); + } + + /** + * Utility function that looks up whether the annotation goes in INFO or FORMAT field. 
+ * + * @param myClass the class to query for the interfaces + * @return a String specifying the annotation field + */ + private final String getAnnotField(Class myClass) { + // + // Look up superclasses recursively until we find either + // GenotypeAnnotation or InfoFieldAnnotation + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass == InfoFieldAnnotation.class) { + return "INFO (variant-level)"; + } else if (mySuperClass == GenotypeAnnotation.class) { + return "FORMAT (sample genotype-level)"; + } else if (mySuperClass.getSimpleName().equals("Object")) { + return ""; + } + return getAnnotField(mySuperClass); + } + + /** + * Utility function that determines the annotation type for an instance of class c. + * + * @param myClass the class to query for the interfaces + * @param annotInfo an empty HashSet in which to collect the info + * @return a hash set of the annotation types, otherwise an empty set + */ + private HashSet getAnnotInfo(Class myClass, HashSet annotInfo) { + // + // Retrieve interfaces + Class[] implementedInterfaces = myClass.getInterfaces(); + for (Class intfClass : implementedInterfaces) { + if (intfClass.getName().contains("Annotation")) { + annotInfo.add(intfClass.getSimpleName()); + } + } + // Look up superclasses recursively + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Object")) { + return annotInfo; + } + return getAnnotInfo(mySuperClass, annotInfo); + } + + /** + * Utility function that determines the default downsampling settings for an instance of class c. 
+ * + * @param myClass the class to query for the settings + * @param dsSettings an empty HashMap in which to collect the info + * @return a hash set of the downsampling settings, otherwise an empty set + */ + private HashMap getDownSamplingSettings(Class myClass, HashMap dsSettings) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(Downsample.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(Downsample.class); + if(thisAnnotation instanceof Downsample) { + final Downsample dsAnnotation = (Downsample) thisAnnotation; + dsSettings.put("by", dsAnnotation.by().toString()); + dsSettings.put("to_cov", dsAnnotation.toCoverage()); + } + } + return dsSettings; + } + + /** + * Utility function that determines the reference window size for an instance of class c. + * + * @param myClass the class to query for the settings + * @param refWindow an empty HashMap in which to collect the info + * @return a HashMap of the window start and stop, otherwise an empty HashMap + */ + private HashMap getRefWindow(Class myClass, HashMap refWindow) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(Reference.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(Reference.class); + if(thisAnnotation instanceof Reference) { + final Reference refAnnotation = (Reference) thisAnnotation; + refWindow.put("start", refAnnotation.window().start()); + refWindow.put("stop", refAnnotation.window().stop()); + } + } + return refWindow; + } + + /** + * Utility function that determines the ActiveRegion settings for an instance of class c. 
+ * + * @param myClass the class to query for the settings + * @param activeRegion an empty HashMap in which to collect the info + * @return a HashMap of the ActiveRegion parameters, otherwise an empty HashMap + */ + private HashMap getActiveRegion(Class myClass, HashMap activeRegion) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(ActiveRegionTraversalParameters.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(ActiveRegionTraversalParameters.class); + if(thisAnnotation instanceof ActiveRegionTraversalParameters) { + final ActiveRegionTraversalParameters arAnnotation = (ActiveRegionTraversalParameters) thisAnnotation; + activeRegion.put("ext", arAnnotation.extension()); + activeRegion.put("max", arAnnotation.maxRegion()); + activeRegion.put("min", arAnnotation.minRegion()); + } + } + return activeRegion; + } + + /** + * Utility function that determines the partition type of an instance of class c. + * + * @param myClass the class to query for the annotation + * @return the partition type if applicable, otherwise an empty string + */ + private String getPartitionType(Class myClass) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(PartitionBy.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(PartitionBy.class); + if(thisAnnotation instanceof PartitionBy) { + final PartitionBy partAnnotation = (PartitionBy) thisAnnotation; + return partAnnotation.value().toString(); + } + } + return ""; + } + + /** + * Utility function that determines the type of walker subclassed by an instance of class c. 
+ * + * @param myClass the class to query for the annotation + * @return the type of walker if applicable, otherwise an empty string + */ + private String getWalkerType(Class myClass) { + // + // Look up superclasses recursively until we find either Walker or Object + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Walker")) { + return myClass.getSimpleName(); + } else if (mySuperClass.getSimpleName().equals("Object")) { + return ""; + } + return getWalkerType(mySuperClass); + } + + /** + * Utility function that finds the values of ReadFilters annotation applied to an instance of class c. + * + * @param myClass the class to query for the annotation + * @param bucket a container in which we store the annotations collected + * @return a hash set of values, otherwise an empty set + */ + private HashSet> getReadFilters(Class myClass, HashSet> bucket) { + // + // Retrieve annotation + if (myClass.isAnnotationPresent(ReadFilters.class)) { + final Annotation thisAnnotation = myClass.getAnnotation(ReadFilters.class); + if(thisAnnotation instanceof ReadFilters) { + final ReadFilters rfAnnotation = (ReadFilters) thisAnnotation; + for (Class filter : rfAnnotation.value()) { + // make hashmap of simplename and url + final HashMap nugget = new HashMap(); + nugget.put("name", filter.getSimpleName()); + nugget.put("filename", GATKDocUtils.phpFilenameForClass(filter)); + bucket.add(nugget); + } + } + } + // Look up superclasses recursively + final Class mySuperClass = myClass.getSuperclass(); + if (mySuperClass.getSimpleName().equals("Object")) { + return bucket; + } + return getReadFilters(mySuperClass, bucket); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java index a3848dccc..fd876991f 100644 --- 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CheckPileup.java @@ -30,9 +30,9 @@ import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.DataSource; import org.broadinstitute.gatk.engine.walkers.LocusWalker; import org.broadinstitute.gatk.engine.walkers.Requires; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java index 1500ce37d..8ba387ca5 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountBases.java @@ -26,8 +26,8 @@ package org.broadinstitute.gatk.tools.walkers.qc; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.DataSource; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import 
org.broadinstitute.gatk.engine.walkers.Requires; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java index 221c4a9a2..096ce70dc 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountIntervals.java @@ -31,9 +31,9 @@ import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RefWalker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.collections.Pair; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java index 51c161731..5987199b1 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountLoci.java @@ -27,9 +27,9 @@ package org.broadinstitute.gatk.tools.walkers.qc; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; 
-import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.LocusWalker; import org.broadinstitute.gatk.engine.walkers.NanoSchedulable; import org.broadinstitute.gatk.engine.walkers.TreeReducible; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java index d665e618e..293cfd0ed 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountMales.java @@ -27,8 +27,8 @@ package org.broadinstitute.gatk.tools.walkers.qc; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.samples.Gender; import org.broadinstitute.gatk.engine.samples.Sample; import org.broadinstitute.gatk.engine.walkers.DataSource; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java index e068ff7ad..3e9e9db39 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java +++ 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODs.java @@ -33,10 +33,10 @@ import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; import org.broadinstitute.gatk.engine.walkers.NanoSchedulable; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java index 40471b50b..8161d4387 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountRODsByRef.java @@ -30,9 +30,9 @@ import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.RodBinding; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import 
org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RefWalker; import org.broadinstitute.gatk.utils.collections.ExpandingArrayList; import org.broadinstitute.gatk.utils.collections.Pair; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java index 998448ecf..ccb714b45 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReadEvents.java @@ -28,9 +28,9 @@ package org.broadinstitute.gatk.tools.walkers.qc; import htsjdk.samtools.CigarOperator; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.report.GATKReport; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.report.GATKReport; import org.broadinstitute.gatk.engine.walkers.DataSource; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import org.broadinstitute.gatk.engine.walkers.Requires; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java index 33d22cab1..6503766b6 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java +++ 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountReads.java @@ -26,8 +26,8 @@ package org.broadinstitute.gatk.tools.walkers.qc; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.DataSource; import org.broadinstitute.gatk.engine.walkers.NanoSchedulable; import org.broadinstitute.gatk.engine.walkers.ReadWalker; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java index 90a131c2a..10094ac6a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/CountTerminusEvent.java @@ -29,8 +29,8 @@ import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.DataSource; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import org.broadinstitute.gatk.engine.walkers.Requires; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/DocumentationTest.java 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/DocumentationTest.java index b5a1e742d..9679baac3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/DocumentationTest.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/DocumentationTest.java @@ -29,9 +29,9 @@ import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.VariantContext; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ErrorThrowing.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ErrorThrowing.java index 6e872e3d4..7def2e0e3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ErrorThrowing.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ErrorThrowing.java @@ -25,17 +25,16 @@ package org.broadinstitute.gatk.tools.walkers.qc; +import org.broadinstitute.gatk.engine.walkers.FailMethod; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.engine.CommandLineGATK; -import 
org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.NanoSchedulable; import org.broadinstitute.gatk.engine.walkers.RefWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; @@ -50,12 +49,6 @@ public class ErrorThrowing extends RefWalker implements TreeRed @Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false) public FailMethod failMethod = FailMethod.MAP; - public enum FailMethod { - MAP, - REDUCE, - TREE_REDUCE - } - // // Template code to allow us to build the walker, doesn't actually do anything // @@ -65,7 +58,7 @@ public class ErrorThrowing extends RefWalker implements TreeRed return null; if ( failMethod == FailMethod.MAP ) - fail(); + FailMethod.fail(exceptionToThrow); return 0; } @@ -78,33 +71,13 @@ public class ErrorThrowing extends RefWalker implements TreeRed @Override public Integer reduce(Integer value, Integer sum) { if ( value != null && failMethod == FailMethod.REDUCE ) - fail(); + FailMethod.fail(exceptionToThrow); return sum; } public Integer treeReduce(final Integer lhs, final Integer rhs) { if ( failMethod == FailMethod.TREE_REDUCE ) - fail(); + FailMethod.fail(exceptionToThrow); return rhs; } - - private void fail() { - if ( exceptionToThrow.equals("UserException") ) { - throw new UserException("UserException"); - } else if ( 
exceptionToThrow.equals("NullPointerException") ) { - throw new NullPointerException(); - } else if ( exceptionToThrow.equals("ReviewedGATKException") ) { - throw new ReviewedGATKException("ReviewedGATKException"); - } else if ( exceptionToThrow.equals("SamError1") ) { - throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); - } else if ( exceptionToThrow.equals("SamError2") ) { - throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); - } else if ( exceptionToThrow.equals("NoSpace1") ) { - throw new htsjdk.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); - } else if ( exceptionToThrow.equals("NoSpace2") ) { - throw new htsjdk.samtools.SAMException("Exception writing BAM index file", new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); - } else { - throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); - } - } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java index 83c2cc48b..7bd51249a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/FlagStat.java @@ -27,8 +27,8 @@ package org.broadinstitute.gatk.tools.walkers.qc; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.DataSource; 
import org.broadinstitute.gatk.engine.walkers.NanoSchedulable; import org.broadinstitute.gatk.engine.walkers.ReadWalker; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java index 322cea6d9..8b59812bb 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/Pileup.java @@ -28,9 +28,9 @@ package org.broadinstitute.gatk.tools.walkers.qc; import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.LocusWalker; import org.broadinstitute.gatk.engine.walkers.NanoSchedulable; import org.broadinstitute.gatk.engine.walkers.TreeReducible; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java index 22ab7d1a7..ca9a76ab8 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/PrintRODs.java @@ -30,9 +30,9 @@ import org.broadinstitute.gatk.utils.commandline.Input; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.utils.commandline.RodBinding; import 
org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java index ee8b68fbf..0b325e6a4 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/QCRef.java @@ -29,9 +29,9 @@ import htsjdk.samtools.reference.IndexedFastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequence; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RefWalker; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.exceptions.GATKException; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java index 14a1de5f5..d6e2bbe0b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/ReadClippingStats.java @@ -31,8 +31,8 @@ import org.broadinstitute.gatk.utils.commandline.Advanced; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.DataSource; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import org.broadinstitute.gatk.engine.walkers.Requires; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/RodSystemValidation.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/RodSystemValidation.java index 5f1d39693..4e4d131a2 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/RodSystemValidation.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/qc/RodSystemValidation.java @@ -26,11 +26,11 @@ package org.broadinstitute.gatk.tools.walkers.qc; import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import 
org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.Window; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java index de3c0dcec..1bbc3a2d6 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ClipReads.java @@ -34,9 +34,9 @@ import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Hidden; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.DataSource; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import org.broadinstitute.gatk.engine.walkers.Requires; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java index f271fe900..008a14842 100644 --- 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReads.java @@ -28,18 +28,19 @@ package org.broadinstitute.gatk.tools.walkers.readutils; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMReadGroupRecord; +import org.broadinstitute.gatk.engine.io.NWaySAMFileWriter; import org.broadinstitute.gatk.engine.walkers.*; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Hidden; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.sam.GATKSAMFileWriter; import org.broadinstitute.gatk.engine.iterators.ReadTransformer; import org.broadinstitute.gatk.engine.iterators.ReadTransformersMode; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.baq.BAQ; import org.broadinstitute.gatk.utils.exceptions.UserException; @@ -181,7 +182,7 @@ public class PrintReads extends ReadWalker impleme if (!sampleNames.isEmpty()) samplesToChoose.addAll(sampleNames); - random = GenomeAnalysisEngine.getRandomGenerator(); + random = Utils.getRandomGenerator(); if (toolkit != null) { final SAMFileHeader outputHeader = toolkit.getSAMFileHeader().clone(); @@ -193,7 +194,7 @@ public class PrintReads extends ReadWalker impleme //Add the program 
record (if appropriate) and set up the writer final boolean preSorted = true; if (toolkit.getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { - Utils.setupWriter(out, toolkit, outputHeader, preSorted, this, PROGRAM_RECORD_NAME); + NWaySAMFileWriter.setupWriter(out, toolkit, outputHeader, preSorted, this, PROGRAM_RECORD_NAME); } else { out.writeHeader(outputHeader); out.setPresorted(preSorted); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java index 6f0ee85ff..7e05a10c4 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/ReadAdaptorTrimmer.java @@ -38,8 +38,8 @@ import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.Hidden; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java index f4ee4a499..3ef03c3d6 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java +++ 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/readutils/SplitSamFile.java @@ -30,10 +30,11 @@ import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.DataSource; import org.broadinstitute.gatk.engine.walkers.ReadWalker; import org.broadinstitute.gatk.engine.walkers.Requires; @@ -41,7 +42,6 @@ import org.broadinstitute.gatk.engine.walkers.WalkerName; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; import java.util.ArrayList; import java.util.HashMap; @@ -109,7 +109,7 @@ public class SplitSamFile extends ReadWalker implements TreeRedu public final Map> createIntervalTreeByContig(final IntervalBinding intervals) { final Map> byContig = new HashMap>(); - final List locs = intervals.getIntervals(getToolkit()); + final List locs = intervals.getIntervals(getToolkit().getGenomeLocParser()); // set up the map from contig -> interval tree for ( final String contig : getContigNames() ) diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalReportWriter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalReportWriter.java index 7244a9417..281d8d346 100644 --- 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalReportWriter.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/VariantEvalReportWriter.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; import org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.VariantEvaluator; import org.broadinstitute.gatk.tools.walkers.varianteval.stratifications.VariantStratifier; import org.broadinstitute.gatk.tools.walkers.varianteval.stratifications.manager.StratificationManager; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/CompOverlap.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/CompOverlap.java index 7ebf96e7c..1732b6d8d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/CompOverlap.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/CompOverlap.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis; import org.broadinstitute.gatk.tools.walkers.varianteval.util.DataPoint; 
import htsjdk.variant.variantcontext.Allele; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/CountVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/CountVariants.java index 89b37f0f2..3ef087e51 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/CountVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/CountVariants.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis; import org.broadinstitute.gatk.tools.walkers.varianteval.util.DataPoint; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/IndelLengthHistogram.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/IndelLengthHistogram.java index 8e202a74a..b0d1bbeee 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/IndelLengthHistogram.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/IndelLengthHistogram.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; 
-import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Molten; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/IndelSummary.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/IndelSummary.java index 484541e88..6d2982c1c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/IndelSummary.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/IndelSummary.java @@ -26,9 +26,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis; import org.broadinstitute.gatk.tools.walkers.varianteval.util.DataPoint; import org.broadinstitute.gatk.utils.Utils; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/MendelianViolationEvaluator.java 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/MendelianViolationEvaluator.java index c01aae1cc..7adcc05c8 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/MendelianViolationEvaluator.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/MendelianViolationEvaluator.java @@ -25,14 +25,14 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.samples.Sample; import org.broadinstitute.gatk.tools.walkers.varianteval.VariantEval; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis; import org.broadinstitute.gatk.tools.walkers.varianteval.util.DataPoint; -import org.broadinstitute.gatk.utils.MendelianViolation; +import org.broadinstitute.gatk.engine.samples.MendelianViolation; import htsjdk.variant.variantcontext.VariantContext; import java.util.Map; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/MultiallelicSummary.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/MultiallelicSummary.java index 88543b505..e52923db6 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/MultiallelicSummary.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/MultiallelicSummary.java @@ -26,9 +26,9 @@ package 
org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis; import org.broadinstitute.gatk.tools.walkers.varianteval.util.DataPoint; import org.broadinstitute.gatk.utils.Utils; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/PrintMissingComp.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/PrintMissingComp.java index 0d3d4cf27..8beda5c56 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/PrintMissingComp.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/PrintMissingComp.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis; import org.broadinstitute.gatk.tools.walkers.varianteval.util.DataPoint; import htsjdk.variant.variantcontext.VariantContext; diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/ThetaVariantEvaluator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/ThetaVariantEvaluator.java index 60a488117..c972d9a91 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/ThetaVariantEvaluator.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/ThetaVariantEvaluator.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis; import org.broadinstitute.gatk.tools.walkers.varianteval.util.DataPoint; import htsjdk.variant.variantcontext.Allele; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index 1919c5f5e..c5fa247ec 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import 
org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis; import org.broadinstitute.gatk.tools.walkers.varianteval.util.DataPoint; import org.broadinstitute.gatk.utils.BaseUtils; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/ValidationReport.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/ValidationReport.java index 664e5f245..337056fb0 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/ValidationReport.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis; import org.broadinstitute.gatk.tools.walkers.varianteval.util.DataPoint; import htsjdk.variant.vcf.VCFConstants; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/VariantEvaluator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/VariantEvaluator.java index 
0984a2e73..a1c5b21f3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/VariantEvaluator.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/VariantEvaluator.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.VariantEval; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import htsjdk.variant.variantcontext.VariantContext; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/VariantSummary.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/VariantSummary.java index 91473306a..3309b815c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/VariantSummary.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/evaluators/VariantSummary.java @@ -27,9 +27,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.evaluators; import htsjdk.samtools.util.IntervalTree; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; 
+import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.VariantEval; import org.broadinstitute.gatk.tools.walkers.varianteval.util.Analysis; import org.broadinstitute.gatk.tools.walkers.varianteval.util.DataPoint; @@ -164,7 +164,7 @@ public class VariantSummary extends VariantEvaluator implements StandardEval { if ( walker.knownCNVsFile != null ) { knownCNVs = walker.createIntervalTreeByContig(walker.knownCNVsFile); - final List locs = walker.knownCNVsFile.getIntervals(walker.getToolkit()); + final List locs = walker.knownCNVsFile.getIntervals(walker.getToolkit().getGenomeLocParser()); logger.info(String.format("Creating known CNV list %s containing %d intervals covering %d bp", walker.knownCNVsFile.getSource(), locs.size(), IntervalUtils.intervalSize(locs))); } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/AlleleCount.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/AlleleCount.java index 1f7ed14fc..d7173d921 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/AlleleCount.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.VariantEvaluator; import org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.VariantSummary; import htsjdk.variant.vcf.VCFConstants; diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/AlleleFrequency.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/AlleleFrequency.java index 349979ada..b0108aa58 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/AlleleFrequency.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/AlleleFrequency.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.MathUtils; import htsjdk.variant.variantcontext.VariantContext; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/CompRod.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/CompRod.java index f131ca751..72bfa4b8c 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/CompRod.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/CompRod.java @@ -26,8 +26,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import 
htsjdk.variant.variantcontext.VariantContext; import java.util.Collections; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Contig.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Contig.java index f90e7c5bc..cb64651ef 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Contig.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Contig.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import htsjdk.variant.variantcontext.VariantContext; import java.util.Arrays; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/CpG.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/CpG.java index 97e5e7222..72ea18718 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/CpG.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/CpG.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import htsjdk.variant.variantcontext.VariantContext; import java.util.ArrayList; diff 
--git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Degeneracy.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Degeneracy.java index 03cba8cfc..2cf50e38f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Degeneracy.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Degeneracy.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import htsjdk.variant.variantcontext.VariantContext; import java.util.ArrayList; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/EvalRod.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/EvalRod.java index 8fdd00726..1f01fc611 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/EvalRod.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/EvalRod.java @@ -26,8 +26,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import htsjdk.variant.variantcontext.VariantContext; import 
java.util.Arrays; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Filter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Filter.java index c37f0031c..1ee8d0294 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Filter.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Filter.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import htsjdk.variant.variantcontext.VariantContext; import java.util.ArrayList; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/FunctionalClass.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/FunctionalClass.java index 08ff9d4f5..abb517a01 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/FunctionalClass.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.SnpEff; import 
htsjdk.variant.variantcontext.VariantContext; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/IndelSize.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/IndelSize.java index e5cb24077..6efc332a8 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/IndelSize.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/IndelSize.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import htsjdk.variant.variantcontext.VariantContext; import java.util.Collections; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/IntervalStratification.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/IntervalStratification.java index 8ee4e79b2..5b6a3e078 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/IntervalStratification.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/IntervalStratification.java @@ -27,8 +27,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; import htsjdk.samtools.util.IntervalTree; import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import 
org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.GenomeLoc; import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.interval.IntervalUtils; @@ -62,7 +62,7 @@ public class IntervalStratification extends VariantStratifier { if ( getVariantEvalWalker().intervalsFile == null ) throw new UserException.MissingArgument("stratIntervals", "Must be provided when IntervalStratification is enabled"); - final List locs = getVariantEvalWalker().intervalsFile.getIntervals(getVariantEvalWalker().getToolkit()); + final List locs = getVariantEvalWalker().intervalsFile.getIntervals(getVariantEvalWalker().getToolkit().getGenomeLocParser()); if ( locs.isEmpty() ) throw new UserException.BadArgumentValue("stratIntervals", "Contains no intervals. Perhaps the file is malformed or empty?"); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/JexlExpression.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/JexlExpression.java index 00fec2a81..746e4967d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/JexlExpression.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/JexlExpression.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.util.SortableJexlVCMatchExp; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextUtils; diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Novelty.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Novelty.java index 0114bf24d..349535dc8 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Novelty.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Novelty.java @@ -26,8 +26,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import htsjdk.variant.variantcontext.VariantContext; import java.util.*; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/OneBPIndel.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/OneBPIndel.java index 7ad45e040..9884952fa 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/OneBPIndel.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/OneBPIndel.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import htsjdk.variant.variantcontext.VariantContext; import java.util.Arrays; diff 
--git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Sample.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Sample.java index bd0b6f1e2..90a6ece92 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Sample.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/Sample.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.VariantEvaluator; import org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.VariantSummary; import htsjdk.variant.variantcontext.VariantContext; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/SnpEffPositionModifier.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/SnpEffPositionModifier.java index c2ddd803f..549738ad7 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/SnpEffPositionModifier.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/SnpEffPositionModifier.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import 
org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.annotator.SnpEff; import org.broadinstitute.gatk.tools.walkers.annotator.SnpEff.EffectType; import org.broadinstitute.gatk.tools.walkers.annotator.SnpEff.InfoFieldKey; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/TandemRepeat.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/TandemRepeat.java index 6eba4b450..493f4fe36 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/TandemRepeat.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/TandemRepeat.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.variantcontext.VariantContext; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/VariantStratifier.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/VariantStratifier.java index 0832ebd13..a7a0543ce 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/VariantStratifier.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/VariantStratifier.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import 
org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.VariantEval; import org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.VariantEvaluator; import org.broadinstitute.gatk.tools.walkers.varianteval.stratifications.manager.Stratifier; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/VariantType.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/VariantType.java index 0ba5b6077..46a9ab97b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/VariantType.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/stratifications/VariantType.java @@ -25,8 +25,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.stratifications; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import htsjdk.variant.variantcontext.VariantContext; import java.util.Collections; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/util/EvaluationContext.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/util/EvaluationContext.java index 7e9f2da36..e8b7fe2ec 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/util/EvaluationContext.java +++ 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/util/EvaluationContext.java @@ -25,9 +25,9 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.util; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.VariantEval; import org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.VariantEvaluator; import org.broadinstitute.gatk.tools.walkers.varianteval.stratifications.manager.StratificationManager; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/util/VariantEvalUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/util/VariantEvalUtils.java index 6f623d48b..53b143b80 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/varianteval/util/VariantEvalUtils.java @@ -27,8 +27,8 @@ package org.broadinstitute.gatk.tools.walkers.varianteval.util; import org.apache.log4j.Logger; import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.tools.walkers.varianteval.VariantEval; import org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.StandardEval; import 
org.broadinstitute.gatk.tools.walkers.varianteval.evaluators.VariantEvaluator; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java index aa69693b8..b6ca7a087 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java @@ -27,18 +27,18 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.engine.io.stubs.VariantContextWriterStub; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; import org.broadinstitute.gatk.engine.walkers.Window; -import org.broadinstitute.gatk.tools.walkers.annotator.ChromosomeCountConstants; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.utils.variant.ChromosomeCountConstants; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.*; 
import org.broadinstitute.gatk.utils.exceptions.UserException; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java index 73995bf93..6e8a9e7e9 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/FilterLiftedVariants.java @@ -32,14 +32,14 @@ import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.VariantContext; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java index d2f251a42..e7a8cd23b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeConcordance.java @@ -27,16 +27,16 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.report.GATKReport; -import org.broadinstitute.gatk.engine.report.GATKReportTable; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFHeader; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java index 641047800..b80c1c4d9 100644 --- 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LeftAlignAndTrimVariants.java @@ -35,19 +35,19 @@ import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.Window; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.collections.Pair; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import htsjdk.variant.variantcontext.*; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java index 
710aad576..4448226fc 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/LiftoverVariants.java @@ -34,13 +34,13 @@ import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.vcf.*; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java index d50b4f20d..b14d2f5b3 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/RandomlySplitVariants.java @@ -25,19 +25,19 @@ package 
org.broadinstitute.gatk.tools.walkers.variantutils; +import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.commandline.Argument; import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLine; import org.broadinstitute.gatk.utils.exceptions.UserException; @@ -134,7 +134,7 @@ public class RandomlySplitVariants extends RodWalker { final Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); for ( final VariantContext vc : vcs ) { - final double random = GenomeAnalysisEngine.getRandomGenerator().nextDouble(); + final double random = Utils.getRandomGenerator().nextDouble(); if(splitToMany){ final int index = (int)(numOfFiles * random); writers[index].add(vc); diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java index c9842c8bc..75f297c10 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectHeaders.java @@ -30,16 +30,16 @@ import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; import org.broadinstitute.gatk.utils.interval.IntervalSetRule; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java index db9d082b7..72dbcdf18 100644 --- 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/SelectVariants.java @@ -27,19 +27,18 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.TreeReducible; -import org.broadinstitute.gatk.tools.walkers.annotator.ChromosomeCountConstants; -import org.broadinstitute.gatk.utils.MendelianViolation; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.utils.variant.ChromosomeCountConstants; +import org.broadinstitute.gatk.engine.samples.MendelianViolation; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.exceptions.UserException; @@ -528,7 +527,7 @@ public class SelectVariants extends RodWalker implements TreeR } if ( !failedJexlMatch && !justRead && - ( !SELECT_RANDOM_FRACTION || 
GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) { + ( !SELECT_RANDOM_FRACTION || Utils.getRandomGenerator().nextDouble() < fractionRandom ) ) { vcfWriter.add(sub); } } diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java index 6b6e6ca82..6142bc08d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/ValidateVariants.java @@ -31,9 +31,9 @@ import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.Window; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java index 9031bf7c1..c52c408a2 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java +++ 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantValidationAssessor.java @@ -31,13 +31,13 @@ import org.broadinstitute.gatk.engine.walkers.Window; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java index 1f7b20ca3..b9954221f 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToAllelicPrimitives.java @@ -30,14 +30,14 @@ import org.broadinstitute.gatk.utils.commandline.ArgumentCollection; import org.broadinstitute.gatk.utils.commandline.Output; import 
org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java index b51349afe..d7090235d 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToBinaryPed.java @@ -30,15 +30,15 @@ import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import 
org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.Window; import org.broadinstitute.gatk.utils.help.HelpConstants; import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import htsjdk.variant.vcf.VCFHeader; import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; import org.broadinstitute.gatk.utils.exceptions.UserException; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java index 9a65a7062..62845501b 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToTable.java @@ -27,18 +27,18 @@ package org.broadinstitute.gatk.tools.walkers.variantutils; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; import 
org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.VariantContext; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.utils.Utils; import org.broadinstitute.gatk.utils.exceptions.UserException; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java index 3e0e2ab5c..2e5b9a7b7 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/VariantsToVCF.java @@ -30,22 +30,22 @@ import htsjdk.tribble.Feature; import org.broadinstitute.gatk.utils.commandline.*; import org.broadinstitute.gatk.engine.CommandLineGATK; import org.broadinstitute.gatk.engine.arguments.DbsnpArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.VariantContextAdaptors; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import 
org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.refdata.VariantContextAdaptors; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; import org.broadinstitute.gatk.engine.walkers.Reference; import org.broadinstitute.gatk.engine.walkers.RodWalker; import org.broadinstitute.gatk.engine.walkers.Window; import org.broadinstitute.gatk.tools.walkers.annotator.VariantOverlapAnnotator; import org.broadinstitute.gatk.utils.BaseUtils; import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.SampleUtils; +import org.broadinstitute.gatk.engine.SampleUtils; import org.broadinstitute.gatk.utils.codecs.hapmap.RawHapMapFeature; import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; +import org.broadinstitute.gatk.engine.GATKVCFUtils; import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; import htsjdk.variant.vcf.*; import org.broadinstitute.gatk.utils.exceptions.UserException; diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/BaseUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/BaseUtils.java deleted file mode 100644 index 194db6829..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/BaseUtils.java +++ /dev/null @@ -1,672 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright 
notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils; - -import htsjdk.samtools.util.StringUtil; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.Arrays; -import java.util.Comparator; -import java.util.Random; - -/** - * BaseUtils contains some basic utilities for manipulating nucleotides. - */ -public class BaseUtils { - - public enum Base { - A ('A'), - C ('C'), - G ('G'), - T ('T'), - N ('N'), - D ('D'); - - public byte base; - - private Base(final char base) { - this.base = (byte)base; - } - } - - // todo -- add this to the generalized base abstraction using the Base enum. 
- public final static byte[] BASES = {'A', 'C', 'G', 'T'}; - public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; - - static private final int[] baseIndexMap = new int[256]; - static { - Arrays.fill(baseIndexMap, -1); - baseIndexMap['A'] = Base.A.ordinal(); - baseIndexMap['a'] = Base.A.ordinal(); - baseIndexMap['*'] = Base.A.ordinal(); // the wildcard character counts as an A - baseIndexMap['C'] = Base.C.ordinal(); - baseIndexMap['c'] = Base.C.ordinal(); - baseIndexMap['G'] = Base.G.ordinal(); - baseIndexMap['g'] = Base.G.ordinal(); - baseIndexMap['T'] = Base.T.ordinal(); - baseIndexMap['t'] = Base.T.ordinal(); - } - - static private final int[] baseIndexWithIupacMap = baseIndexMap.clone(); - static { - baseIndexWithIupacMap['*'] = -1; // the wildcard character is bad - baseIndexWithIupacMap['N'] = Base.N.ordinal(); - baseIndexWithIupacMap['n'] = Base.N.ordinal(); - baseIndexWithIupacMap['R'] = Base.N.ordinal(); - baseIndexWithIupacMap['r'] = Base.N.ordinal(); - baseIndexWithIupacMap['Y'] = Base.N.ordinal(); - baseIndexWithIupacMap['y'] = Base.N.ordinal(); - baseIndexWithIupacMap['M'] = Base.N.ordinal(); - baseIndexWithIupacMap['m'] = Base.N.ordinal(); - baseIndexWithIupacMap['K'] = Base.N.ordinal(); - baseIndexWithIupacMap['k'] = Base.N.ordinal(); - baseIndexWithIupacMap['W'] = Base.N.ordinal(); - baseIndexWithIupacMap['w'] = Base.N.ordinal(); - baseIndexWithIupacMap['S'] = Base.N.ordinal(); - baseIndexWithIupacMap['s'] = Base.N.ordinal(); - baseIndexWithIupacMap['B'] = Base.N.ordinal(); - baseIndexWithIupacMap['b'] = Base.N.ordinal(); - baseIndexWithIupacMap['D'] = Base.N.ordinal(); - baseIndexWithIupacMap['d'] = Base.N.ordinal(); - baseIndexWithIupacMap['H'] = Base.N.ordinal(); - baseIndexWithIupacMap['h'] = Base.N.ordinal(); - baseIndexWithIupacMap['V'] = Base.N.ordinal(); - baseIndexWithIupacMap['v'] = Base.N.ordinal(); - } - - /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or - // a 
pyrimidine to another pyrimidine nucleotide (C <-> T). - // Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions. - public enum BaseSubstitutionType { - TRANSITION, // A <-> G or C <-> T - TRANSVERSION - } - - /** - * Returns the base substitution type of the 2 state SNP - * - * @param base1 - * @param base2 - * @return - */ - public static BaseSubstitutionType SNPSubstitutionType(byte base1, byte base2) { - BaseSubstitutionType t = isTransition(base1, base2) ? BaseSubstitutionType.TRANSITION : BaseSubstitutionType.TRANSVERSION; - //System.out.printf("SNPSubstitutionType( char %c, char %c ) => %s%n", base1, base2, t); - return t; - } - - public static boolean isTransition(byte base1, byte base2) { - final int b1 = simpleBaseToBaseIndex(base1); - final int b2 = simpleBaseToBaseIndex(base2); - return b1 == Base.A.ordinal() && b2 == Base.G.ordinal() || b1 == Base.G.ordinal() && b2 == Base.A.ordinal() || - b1 == Base.C.ordinal() && b2 == Base.T.ordinal() || b1 == Base.T.ordinal() && b2 == Base.C.ordinal(); - } - - public static boolean isTransversion(byte base1, byte base2) { - return !isTransition(base1, base2); - } - - /** - * Private constructor. No instantiating this class! - */ - private BaseUtils() {} - - static public boolean basesAreEqual(byte base1, byte base2) { - return simpleBaseToBaseIndex(base1) == simpleBaseToBaseIndex(base2); - } - - /** - * Checks whether to bases are the same in fact ignore ambiguous 'N' bases. - * - * @param base1 first base to compare. - * @param base2 second base to compare. - * @return true if {@code base1 == base2} or either is an 'N', false otherwise. - */ - static public boolean basesAreEqualIgnoreAmbiguous(final byte base1, final byte base2) { - if (base1 == base2) return true; - else if (base1 == 'n' || base1 == 'N' || base2 == 'N' || base2 == 'n') return true; - else return false; - } - - /** - * Compare to base arrays ranges checking whether they contain the same bases. - * - *

- * By default two array have equal bases, i.e. {@code length == 0} results results in {@code true}. - *

- * - * @param bases1 first base array to compare. - * @param offset1 position of the first base in bases1 to compare. - * @param bases2 second base array to compare. - * @param offset2 position of the first base in bases2 to compare. - * @param length number of bases to compare. - * - * @throws NullPointerException if {@code bases1} or {@code bases2} is {@code null}. - * @throws ArrayIndexOutOfBoundsException if: - *
    - *
  • {@code offset1} is not within the range [0,{@code bases1.length}) or
  • - *
  • {@code offset2} is not within the range [0,{@code bases2.length}) or
  • - *
  • {@code offset1 + length} is not within the range [0,{@code bases1.length}) or
  • - *
  • {@code offset2 + length} is not within the range [0,{@code bases2.length})
  • - *
- * @return - */ - static public boolean basesAreEqualIgnoreAmbiguous(final byte[] bases1, final int offset1, final byte[] bases2, final int offset2, final int length) { - for (int i = 0; i < length; i++) - if (!basesAreEqualIgnoreAmbiguous(bases1[offset1 + i],bases2[offset2 + i])) return false; - return true; - } - - static public boolean extendedBasesAreEqual(byte base1, byte base2) { - return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2); - } - - /** - * @return true iff the bases array contains at least one instance of base - */ - static public boolean containsBase(final byte[] bases, final byte base) { - for ( final byte b : bases ) { - if ( b == base ) - return true; - } - return false; - } - - public static boolean isUpperCase(final byte[] bases) { - for ( byte base : bases ) - if ( ! isUpperCase(base) ) - return false; - return true; - } - - public static boolean isUpperCase(final byte base) { - return base >= 'A' && base <= 'Z'; - } - - public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase, final boolean ignoreConversionOfFirstByte) { - final int length = bases.length; - final int start = ignoreConversionOfFirstByte ? 
1 : 0; - - for ( int i = start; i < length; i++ ) { - final int baseIndex = baseIndexWithIupacMap[bases[i]]; - if ( baseIndex == Base.N.ordinal() ) { - bases[i] = 'N'; - } else if ( errorOnBadReferenceBase && baseIndex == -1 ) { - throw new UserException.BadInput("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'"); - } - } - return bases; - } - - /** - * Converts a IUPAC nucleotide code to a pair of bases - * - * @param code - * @return 0, 1, 2, 3, or -1 if the base can't be understood - */ - @Deprecated - static public char[] iupacToBases(char code) { - char[] bases = new char[2]; - switch (code) { - case '*': // the wildcard character counts as an A - case 'A': - case 'a': - bases[0] = bases[1] = 'A'; - break; - case 'C': - case 'c': - bases[0] = bases[1] = 'C'; - break; - case 'G': - case 'g': - bases[0] = bases[1] = 'G'; - break; - case 'T': - case 't': - bases[0] = bases[1] = 'T'; - break; - case 'R': - case 'r': - bases[0] = 'A'; - bases[1] = 'G'; - break; - case 'Y': - case 'y': - bases[0] = 'C'; - bases[1] = 'T'; - break; - case 'S': - case 's': - bases[0] = 'G'; - bases[1] = 'C'; - break; - case 'W': - case 'w': - bases[0] = 'A'; - bases[1] = 'T'; - break; - case 'K': - case 'k': - bases[0] = 'G'; - bases[1] = 'T'; - break; - case 'M': - case 'm': - bases[0] = 'A'; - bases[1] = 'C'; - break; - default: - bases[0] = bases[1] = 'N'; - } - return bases; - } - - /** - * Converts a pair of bases to their IUPAC ambiguity code - * - * @param base1 1st base - * @param base2 2nd base - * @return byte - */ - static public byte basesToIUPAC(final byte base1, final byte base2) { - // ensure that the bases come in order - if ( base2 < base1 ) - return basesToIUPAC(base2, base1); - - // ensure that the bases are regular ones - if ( !isRegularBase(base1) || !isRegularBase(base2) ) - return Base.N.base; - - // IUPAC codes are not needed if the bases are identical - if ( basesAreEqual(base1, base2) ) - return base1; - - if ( base1 
== Base.A.base ) - return (byte)(base2 == Base.C.base ? 'M' : (base2 == Base.G.base ? 'R' : 'W')); - - if ( base1 == Base.C.base ) - return (byte)(base2 == Base.G.base ? 'S' : 'Y'); - - // the only possibility left is G/T - return 'K'; - } - - /** - * Converts a simple base to a base index - * - * @param base [AaCcGgTt] - * @return 0, 1, 2, 3, or -1 if the base can't be understood - */ - static public int simpleBaseToBaseIndex(final byte base) { - if ( base < 0 || base >= 256 ) - throw new UserException.BadInput("Non-standard bases were encountered in either the input reference or BAM file(s)"); - return baseIndexMap[base]; - } - - /** - * Converts a simple base to a base index - * - * @param base [AaCcGgTt] - * @return 0, 1, 2, 3, or -1 if the base can't be understood - */ - @Deprecated - static public int simpleBaseToBaseIndex(char base) { - return baseIndexMap[base]; - } - - static public int extendedBaseToBaseIndex(byte base) { - switch (base) { - case 'd': - case 'D': - return Base.D.ordinal(); - case 'n': - case 'N': - return Base.N.ordinal(); - - default: - return simpleBaseToBaseIndex(base); - } - } - - @Deprecated - static public boolean isRegularBase( final char base ) { - return simpleBaseToBaseIndex(base) != -1; - } - - static public boolean isRegularBase( final byte base ) { - return simpleBaseToBaseIndex(base) != -1; - } - - static public boolean isAllRegularBases( final byte[] bases ) { - for( final byte base : bases) { - if( !isRegularBase(base) ) { return false; } - } - return true; - } - - static public boolean isNBase(byte base) { - return base == 'N' || base == 'n'; - } - - /** - * Converts a base index to a simple base - * - * @param baseIndex 0, 1, 2, 3 - * @return A, C, G, T, or '.' 
if the index can't be understood - */ - static public byte baseIndexToSimpleBase(int baseIndex) { - switch (baseIndex) { - case 0: - return 'A'; - case 1: - return 'C'; - case 2: - return 'G'; - case 3: - return 'T'; - default: - return '.'; - } - } - - /** - * Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base). - * - * @param base the base [AaCcGgTt] - * @return the complementary base, or the input base if it's not one of the understood ones - */ - static public byte simpleComplement(byte base) { - switch (base) { - case 'A': - case 'a': - return 'T'; - case 'C': - case 'c': - return 'G'; - case 'G': - case 'g': - return 'C'; - case 'T': - case 't': - return 'A'; - default: - return base; - } - } - - @Deprecated - static private char simpleComplement(char base) { - return (char) simpleComplement((byte) base); - } - - /** - * Reverse complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form) - * - * @param bases the byte array of bases - * @return the reverse complement of the base byte array - */ - static public byte[] simpleReverseComplement(byte[] bases) { - byte[] rcbases = new byte[bases.length]; - - for (int i = 0; i < bases.length; i++) { - rcbases[i] = simpleComplement(bases[bases.length - 1 - i]); - } - - return rcbases; - } - - /** - * Reverse complement a char array of bases - * - * @param bases the char array of bases - * @return the reverse complement of the char byte array - */ - @Deprecated - static public char[] simpleReverseComplement(char[] bases) { - char[] rcbases = new char[bases.length]; - - for (int i = 0; i < bases.length; i++) { - rcbases[i] = simpleComplement(bases[bases.length - 1 - i]); - } - - return rcbases; - } - - /** - * Reverse complement a String of bases. Preserves ambiguous bases. 
- * - * @param bases the String of bases - * @return the reverse complement of the String - */ - @Deprecated - static public String simpleReverseComplement(String bases) { - return new String(simpleReverseComplement(bases.getBytes())); - } - - /** - * Returns the uppercased version of the bases - * - * @param bases the bases - * @return the upper cased version - */ - static public void convertToUpperCase(final byte[] bases) { - StringUtil.toUpperCase(bases); - } - - /** - * Returns the index of the most common base in the basecounts array. To be used with - * pileup.getBaseCounts. - * - * @param baseCounts counts of a,c,g,t in order. - * @return the index of the most common base - */ - static public int mostFrequentBaseIndex(int[] baseCounts) { - int mostFrequentBaseIndex = 0; - for (int baseIndex = 1; baseIndex < 4; baseIndex++) { - if (baseCounts[baseIndex] > baseCounts[mostFrequentBaseIndex]) { - mostFrequentBaseIndex = baseIndex; - } - } - return mostFrequentBaseIndex; - } - - static public int mostFrequentBaseIndexNotRef(int[] baseCounts, int refBaseIndex) { - int tmp = baseCounts[refBaseIndex]; - baseCounts[refBaseIndex] = -1; - int result = mostFrequentBaseIndex(baseCounts); - baseCounts[refBaseIndex] = tmp; - return result; - } - - static public int mostFrequentBaseIndexNotRef(int[] baseCounts, byte refSimpleBase) { - return mostFrequentBaseIndexNotRef(baseCounts, simpleBaseToBaseIndex(refSimpleBase)); - } - - /** - * Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts. - * - * @param baseCounts counts of a,c,g,t in order. - * @return the most common base - */ - static public byte mostFrequentSimpleBase(int[] baseCounts) { - return baseIndexToSimpleBase(mostFrequentBaseIndex(baseCounts)); - } - - /** - * For the most frequent base in the sequence, return the percentage of the read it constitutes. 
- * - * @param sequence the read sequence - * @return the percentage of the read that's made up of the most frequent base - */ - static public double mostFrequentBaseFraction(byte[] sequence) { - int[] baseCounts = new int[4]; - - for (byte base : sequence) { - int baseIndex = simpleBaseToBaseIndex(base); - - if (baseIndex >= 0) { - baseCounts[baseIndex]++; - } - } - - int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts); - - return ((double) baseCounts[mostFrequentBaseIndex]) / ((double) sequence.length); - } - - // -------------------------------------------------------------------------------- - // - // random bases - // - // -------------------------------------------------------------------------------- - - /** - * Return a random base index (A=0, C=1, G=2, T=3). - * - * @return a random base index (A=0, C=1, G=2, T=3) - */ - static public int getRandomBaseIndex() { - return getRandomBaseIndex(-1); - } - - /** - * Return random bases. - * - * @param length base count and length of returned array. - * - * @throws IllegalArgumentException if {@code length} is less than 0. - * - * @return never {@code null} - */ - @SuppressWarnings("unused") - public static byte[] getRandomBases(final int length) { - if (length < 0) - throw new IllegalArgumentException("length must zero or greater"); - final byte[] result = new byte[length]; - fillWithRandomBases(result); - return result; - } - - /** - * Fills an array with random bases. - * - * @param dest the array to fill. - * - * @throws IllegalArgumentException if {@code result} is {@code null}. - */ - public static void fillWithRandomBases(final byte[] dest) { - fillWithRandomBases(dest,0,dest.length); - } - - /** - * Fill an array section with random bases. - * - * @param dest array to fill. - * @param fromIndex first index to be filled (inclusive). - * @param toIndex index after last to be filled (exclusive). 
- * - * @throws IllegalArgumentException if {@code dest} is {@code null}, - * {@code fromIndex} or {@code toIndex} is negative, - * {@code fromIndex} or {@code toIndex} are greater than {@code dest} length, - * or {@code fromIndex} greater than {@code toIndex}. - */ - public static void fillWithRandomBases(final byte[] dest, final int fromIndex, final int toIndex) { - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - if (dest == null) - throw new IllegalArgumentException("the dest array cannot be null"); - if (fromIndex > toIndex) - throw new IllegalArgumentException("fromIndex cannot be larger than toIndex"); - if (fromIndex < 0) - throw new IllegalArgumentException("both indexes must be positive"); - if (toIndex > dest.length) - throw new IllegalArgumentException("both indexes must be less or equal to the destination array length"); - - for (int i = fromIndex; i < toIndex; i++) - dest[i] = baseIndexToSimpleBase(rnd.nextInt(4)); - } - - /** - * Return a random base index, excluding some base index. - * - * @param excludeBaseIndex the base index to exclude - * @return a random base index, excluding the one specified (A=0, C=1, G=2, T=3) - */ - static public int getRandomBaseIndex(int excludeBaseIndex) { - int randomBaseIndex = excludeBaseIndex; - - while (randomBaseIndex == excludeBaseIndex) { - randomBaseIndex = GenomeAnalysisEngine.getRandomGenerator().nextInt(4); - } - - return randomBaseIndex; - } - - public static byte getComplement(byte base) { - switch(base) { - case 'a': - case 'A': - return 'T'; - case 'c': - case 'C': - return 'G'; - case 'g': - case 'G': - return 'C'; - case 't': - case 'T': - return 'A'; - case 'n': - case 'N': - return 'N'; - default: - throw new ReviewedGATKException("base must be A, C, G or T. " + (char) base + " is not a valid base."); - } - } - - - /** - * Lexicographical sorting of base arrays {@link Comparator}. 
- */ - public static final Comparator BASES_COMPARATOR = new Comparator (){ - - @Override - public int compare(final byte[] o1,final byte[] o2) { - final int minLength = Math.min(o1.length,o2.length); - for (int i = 0; i < minLength; i++) { - final int cmp = Byte.compare(o1[i],o2[i]); - if (cmp != 0) return cmp; - } - if (o1.length == o2.length) - return 0; - else if (o1.length == minLength) - return -1; - else - return 1; - } - }; -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/IndelUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/IndelUtils.java deleted file mode 100644 index 81a2bdc52..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/IndelUtils.java +++ /dev/null @@ -1,262 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils; - -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import htsjdk.variant.variantcontext.VariantContext; - -import java.util.ArrayList; -import java.util.Arrays; - -/** - * Created by IntelliJ IDEA. - * User: delangel - * Date: Feb 3, 2011 - * Time: 2:44:22 PM - * To change this template use File | Settings | File Templates. - */ -public class IndelUtils { - protected final static String[] COLUMN_KEYS; - - - - static { - COLUMN_KEYS= new String[51]; - COLUMN_KEYS[0] = "Novel_A"; - COLUMN_KEYS[1] = "Novel_C"; - COLUMN_KEYS[2] = "Novel_G"; - COLUMN_KEYS[3] = "Novel_T"; - COLUMN_KEYS[4] = "NOVEL_1"; - COLUMN_KEYS[5] = "NOVEL_2"; - COLUMN_KEYS[6] = "NOVEL_3"; - COLUMN_KEYS[7] = "NOVEL_4"; - COLUMN_KEYS[8] = "NOVEL_5"; - COLUMN_KEYS[9] = "NOVEL_6"; - COLUMN_KEYS[10] = "NOVEL_7"; - COLUMN_KEYS[11] = "NOVEL_8"; - COLUMN_KEYS[12] = "NOVEL_9"; - COLUMN_KEYS[13] = "NOVEL_10orMore"; - COLUMN_KEYS[14] = "RepeatExpansion_A"; - COLUMN_KEYS[15] = "RepeatExpansion_C"; - COLUMN_KEYS[16] = "RepeatExpansion_G"; - COLUMN_KEYS[17] = "RepeatExpansion_T"; - COLUMN_KEYS[18] = "RepeatExpansion_AC"; - COLUMN_KEYS[19] = "RepeatExpansion_AG"; - COLUMN_KEYS[20] = "RepeatExpansion_AT"; - COLUMN_KEYS[21] = "RepeatExpansion_CA"; - COLUMN_KEYS[22] = "RepeatExpansion_CG"; - COLUMN_KEYS[23] = "RepeatExpansion_CT"; - COLUMN_KEYS[24] = "RepeatExpansion_GA"; - COLUMN_KEYS[25] = "RepeatExpansion_GC"; - COLUMN_KEYS[26] = "RepeatExpansion_GT"; - COLUMN_KEYS[27] = "RepeatExpansion_TA"; - COLUMN_KEYS[28] = "RepeatExpansion_TC"; - COLUMN_KEYS[29] = "RepeatExpansion_TG"; - COLUMN_KEYS[30] = "EventLength_1"; - COLUMN_KEYS[31] = "EventLength_2"; - COLUMN_KEYS[32] = "EventLength_3"; - COLUMN_KEYS[33] = "EventLength_4"; - COLUMN_KEYS[34] = "EventLength_5"; - COLUMN_KEYS[35] = "EventLength_6"; - COLUMN_KEYS[36] = "EventLength_7"; - COLUMN_KEYS[37] = "EventLength_8"; - 
COLUMN_KEYS[38] = "EventLength_9"; - COLUMN_KEYS[39] = "EventLength_10orMore"; - COLUMN_KEYS[40] = "NumRepetitions_1"; - COLUMN_KEYS[41] = "NumRepetitions_2"; - COLUMN_KEYS[42] = "NumRepetitions_3"; - COLUMN_KEYS[43] = "NumRepetitions_4"; - COLUMN_KEYS[44] = "NumRepetitions_5"; - COLUMN_KEYS[45] = "NumRepetitions_6"; - COLUMN_KEYS[46] = "NumRepetitions_7"; - COLUMN_KEYS[47] = "NumRepetitions_8"; - COLUMN_KEYS[48] = "NumRepetitions_9"; - COLUMN_KEYS[49] = "NumRepetitions_10orMore"; - COLUMN_KEYS[50] = "Other"; - - } - - private static final int START_IND_NOVEL = 4; - private static final int STOP_IND_NOVEL = 13; - private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14; - private static final int IND_FOR_REPEAT_EXPANSION_A = 14; - private static final int IND_FOR_REPEAT_EXPANSION_C = 15; - private static final int IND_FOR_REPEAT_EXPANSION_G = 16; - private static final int IND_FOR_REPEAT_EXPANSION_T = 17; - private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29; - private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30; - private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39; - private static final int START_IND_FOR_NUM_REPETITION_COUNTS = 40; - private static final int STOP_IND_FOR_NUM_REPETITION_COUNTS = 49; - private static final int IND_FOR_OTHER_EVENT = 50; - private static final int START_IND_NOVEL_PER_BASE = 0; - private static final int STOP_IND_NOVEL_PER_BASE = 3; - - private static String findMinimalEvent(String eventString) { - - // for each length up to given string length, see if event string is a repetition of units of size N - String minEvent = eventString; - for (int k=1; k < eventString.length(); k++) { - if (eventString.length() % k > 0) - continue; - String str = eventString.substring(0,k); - // now see if event string is a repetition of str - int numReps = eventString.length() / k; - String r = ""; - for (int j=0; j < numReps; j++) - r = r.concat(str); - - if (r.matches(eventString)) { - minEvent = str; - 
break; - } - - } - return minEvent; - } - - public static ArrayList findEventClassificationIndex(VariantContext vc, ReferenceContext ref) { - int eventLength; - - String indelAlleleString; - boolean done = false; - - ArrayList inds = new ArrayList(); - if ( vc.isSimpleInsertion() ) { - indelAlleleString = vc.getAlternateAllele(0).getDisplayString().substring(1); - } else if ( vc.isSimpleDeletion() ) { - indelAlleleString = vc.getReference().getDisplayString().substring(1); - } - else { - inds.add(IND_FOR_OTHER_EVENT); - return inds; - } - - byte[] refBases = ref.getBases(); - - indelAlleleString = findMinimalEvent(indelAlleleString); - eventLength = indelAlleleString.length(); - - // See first if indel is a repetition of bases before current - int indStart = refBases.length/2-eventLength+1; - - int numRepetitions = 0; - while (!done) { - if (indStart < 0) - done = true; - else { - String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); - if (refPiece.matches(indelAlleleString)) - { - numRepetitions++; - indStart = indStart - eventLength; - } - else - done = true; - - } - } - - // now do it forward - done = false; - indStart = refBases.length/2+1; - while (!done) { - if (indStart + eventLength >= refBases.length) - break; - else { - String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); - if (refPiece.matches(indelAlleleString)) - { - numRepetitions++; - indStart = indStart + eventLength; - } - else - done = true; - - } - } - - if (numRepetitions == 0) { - //unrepeated sequence from surroundings - int ind = START_IND_NOVEL + (eventLength-1); - if (ind > STOP_IND_NOVEL) - ind = STOP_IND_NOVEL; - inds.add(ind); - - if (eventLength == 1) { - // log single base indels additionally by base - String keyStr = "Novel_" + indelAlleleString; - int k; - for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) { - if (keyStr.matches(COLUMN_KEYS[k])) - break; - } - inds.add(k); - } - } - else { - // 
log number of repetition counts - int ind = START_IND_FOR_NUM_REPETITION_COUNTS + (numRepetitions-1); - if (ind > STOP_IND_FOR_NUM_REPETITION_COUNTS) - ind = STOP_IND_FOR_NUM_REPETITION_COUNTS; - inds.add(ind); - - ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (eventLength - 1); - if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS) - ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS; - inds.add(ind); - - // log event length - if (eventLength<=2) { - // for single or dinucleotide indels, we further log the base in which they occurred - String keyStr = "RepeatExpansion_" + indelAlleleString; - int k; - for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) { - if (keyStr.matches(COLUMN_KEYS[k])) - break; - } - // log now event - inds.add(k); - } - - - } - - return inds; - } - - public static String getIndelClassificationName(int k) { - if (k >=0 && k < COLUMN_KEYS.length) - return COLUMN_KEYS[k]; - else - throw new ReviewedGATKException("Invalid index when trying to get indel classification name"); - } - - public static boolean isInsideExtendedIndel(VariantContext vc, ReferenceContext ref) { - return (vc.getStart() != ref.getLocus().getStart()); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java deleted file mode 100644 index a918c0a0e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java +++ /dev/null @@ -1,508 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is 
furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils; - -import cern.jet.math.Arithmetic; -import cern.jet.random.Normal; -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.commons.math.MathException; -import org.apache.commons.math.distribution.NormalDistribution; -import org.apache.commons.math.distribution.NormalDistributionImpl; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.GATKException; - -import java.io.Serializable; -import java.util.Comparator; -import java.util.TreeSet; - -/** - * Created by IntelliJ IDEA. 
- * User: chartl - */ -public class MannWhitneyU { - - private static Normal STANDARD_NORMAL = new Normal(0.0,1.0,null); - private static NormalDistribution APACHE_NORMAL = new NormalDistributionImpl(0.0,1.0,1e-2); - private static double LNSQRT2PI = Math.log(Math.sqrt(2.0*Math.PI)); - - private TreeSet> observations; - private int sizeSet1; - private int sizeSet2; - private ExactMode exactMode; - - public MannWhitneyU(ExactMode mode, boolean dither) { - if ( dither ) - observations = new TreeSet>(new DitheringComparator()); - else - observations = new TreeSet>(new NumberedPairComparator()); - sizeSet1 = 0; - sizeSet2 = 0; - exactMode = mode; - } - - public MannWhitneyU() { - this(ExactMode.POINT,true); - } - - public MannWhitneyU(boolean dither) { - this(ExactMode.POINT,dither); - } - - public MannWhitneyU(ExactMode mode) { - this(mode,true); - } - - /** - * Add an observation into the observation tree - * @param n: the observation (a number) - * @param set: whether the observation comes from set 1 or set 2 - */ - public void add(Number n, USet set) { - observations.add(new Pair(n,set)); - if ( set == USet.SET1 ) { - ++sizeSet1; - } else { - ++sizeSet2; - } - } - - public Pair getR1R2() { - long u1 = calculateOneSidedU(observations,MannWhitneyU.USet.SET1); - long n1 = sizeSet1*(sizeSet1+1)/2; - long r1 = u1 + n1; - long n2 = sizeSet2*(sizeSet2+1)/2; - long u2 = n1*n2-u1; - long r2 = u2 + n2; - - return new Pair(r1,r2); - } - - /** - * Runs the one-sided test under the hypothesis that the data in set "lessThanOther" stochastically - * dominates the other set - * @param lessThanOther - either Set1 or Set2 - * @return - u-based z-approximation, and p-value associated with the test (p-value is exact for small n,m) - */ - @Requires({"lessThanOther != null"}) - @Ensures({"validateObservations(observations) || Double.isNaN(result.getFirst())","result != null", "! Double.isInfinite(result.getFirst())", "! 
Double.isInfinite(result.getSecond())"}) - public Pair runOneSidedTest(USet lessThanOther) { - long u = calculateOneSidedU(observations, lessThanOther); - int n = lessThanOther == USet.SET1 ? sizeSet1 : sizeSet2; - int m = lessThanOther == USet.SET1 ? sizeSet2 : sizeSet1; - if ( n == 0 || m == 0 ) { - // test is uninformative as one or both sets have no observations - return new Pair(Double.NaN,Double.NaN); - } - - // the null hypothesis is that {N} is stochastically less than {M}, so U has counted - // occurrences of {M}s before {N}s. We would expect that this should be less than (n*m+1)/2 under - // the null hypothesis, so we want to integrate from K=0 to K=U for cumulative cases. Always. - return calculateP(n, m, u, false, exactMode); - } - - /** - * Runs the standard two-sided test, - * returns the u-based z-approximate and p values. - * @return a pair holding the u and p-value. - */ - @Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) - //@Requires({"validateObservations(observations)"}) - public Pair runTwoSidedTest() { - Pair uPair = calculateTwoSidedU(observations); - long u = uPair.first; - int n = uPair.second == USet.SET1 ? sizeSet1 : sizeSet2; - int m = uPair.second == USet.SET1 ? 
sizeSet2 : sizeSet1; - if ( n == 0 || m == 0 ) { - // test is uninformative as one or both sets have no observations - return new Pair(Double.NaN,Double.NaN); - } - return calculateP(n, m, u, true, exactMode); - } - - /** - * Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate - * @param n - The number of entries in the stochastically smaller (dominant) set - * @param m - The number of entries in the stochastically larger (dominated) set - * @param u - the Mann-Whitney U value - * @param twoSided - is the test twosided - * @return the (possibly approximate) p-value associated with the MWU test, and the (possibly approximate) z-value associated with it - * todo -- there must be an approximation for small m and large n - */ - @Requires({"m > 0","n > 0"}) - @Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) - protected static Pair calculateP(int n, int m, long u, boolean twoSided, ExactMode exactMode) { - Pair zandP; - if ( n > 8 && m > 8 ) { - // large m and n - normal approx - zandP = calculatePNormalApproximation(n,m,u, twoSided); - } else if ( n > 5 && m > 7 ) { - // large m, small n - sum uniform approx - // todo -- find the appropriate regimes where this approximation is actually better enough to merit slowness - // pval = calculatePUniformApproximation(n,m,u); - zandP = calculatePNormalApproximation(n, m, u, twoSided); - } else if ( n > 8 || m > 8 ) { - zandP = calculatePFromTable(n, m, u, twoSided); - } else { - // small m and n - full approx - zandP = calculatePRecursively(n,m,u,twoSided,exactMode); - } - - return zandP; - } - - public static Pair calculatePFromTable(int n, int m, long u, boolean twoSided) { - // todo -- actually use a table for: - // todo - n large, m small - return calculatePNormalApproximation(n,m,u, twoSided); - } - - /** - * Uses a normal approximation to the U statistic in order to return a cdf p-value. 
See Mann, Whitney [1947] - * @param n - The number of entries in the stochastically smaller (dominant) set - * @param m - The number of entries in the stochastically larger (dominated) set - * @param u - the Mann-Whitney U value - * @param twoSided - whether the test should be two sided - * @return p-value associated with the normal approximation - */ - @Requires({"m > 0","n > 0"}) - @Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) - public static Pair calculatePNormalApproximation(int n,int m,long u, boolean twoSided) { - double z = getZApprox(n,m,u); - if ( twoSided ) { - return new Pair(z,2.0*(z < 0 ? STANDARD_NORMAL.cdf(z) : 1.0-STANDARD_NORMAL.cdf(z))); - } else { - return new Pair(z,STANDARD_NORMAL.cdf(z)); - } - } - - /** - * Calculates the Z-score approximation of the u-statistic - * @param n - The number of entries in the stochastically smaller (dominant) set - * @param m - The number of entries in the stochastically larger (dominated) set - * @param u - the Mann-Whitney U value - * @return the asymptotic z-approximation corresponding to the MWU p-value for n < m - */ - @Requires({"m > 0","n > 0"}) - @Ensures({"! Double.isNaN(result)", "! Double.isInfinite(result)"}) - private static double getZApprox(int n, int m, long u) { - double mean = ( ((long)m)*n+1.0)/2; - double var = (((long) n)*m*(n+m+1.0))/12; - double z = ( u - mean )/Math.sqrt(var); - return z; - } - - /** - * Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate - * p-value. 
See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens, MA [1966, biometrika] (sum of uniform CDF) - * @param n - The number of entries in the stochastically smaller (dominant) set - * @param m - The number of entries in the stochastically larger (dominated) set - * @param u - mann-whitney u value - * @return p-value according to sum of uniform approx - * todo -- this is currently not called due to not having a good characterization of where it is significantly more accurate than the - * todo -- normal approxmation (e.g. enough to merit the runtime hit) - */ - public static double calculatePUniformApproximation(int n, int m, long u) { - long R = u + (n*(n+1))/2; - double a = Math.sqrt(m*(n+m+1)); - double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m)); - double z = b + ((double)R)/a; - if ( z < 0 ) { return 1.0; } - else if ( z > n ) { return 0.0; } - else { - if ( z > ((double) n) /2 ) { - return 1.0-1/(Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); - } else { - return 1/(Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); - } - } - } - - /** - * Helper function for the sum of n uniform random variables - * @param z - value at which to compute the (un-normalized) cdf - * @param m - a cutoff integer (defined by m <= z < m + 1) - * @param n - the number of uniform random variables - * @param k - holder variable for the recursion (alternatively, the index of the term in the sequence) - * @return the (un-normalized) cdf for the sum of n random variables - */ - private static double uniformSumHelper(double z, int m, int n, int k) { - if ( k > m ) { return 0; } - int coef = (k % 2 == 0) ? 1 : -1; - return coef*Arithmetic.binomial(n,k)*Math.pow(z-k,n) + uniformSumHelper(z,m,n,k+1); - } - - /** - * Calculates the U-statistic associated with a two-sided test (e.g. 
the RV from which one set is drawn - * stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for - * later on simply by multiplying the p-value by 2. - * - * Recall: If X stochastically dominates Y, the test is for occurrences of Y before X, so the lower value of u is chosen - * @param observed - the observed data - * @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1) - */ - @Requires({"observed != null", "observed.size() > 0"}) - @Ensures({"result != null","result.first > 0"}) - public static Pair calculateTwoSidedU(TreeSet> observed) { - int set1SeenSoFar = 0; - int set2SeenSoFar = 0; - long uSet1DomSet2 = 0; - long uSet2DomSet1 = 0; - USet previous = null; - for ( Pair dataPoint : observed ) { - - if ( dataPoint.second == USet.SET1 ) { - ++set1SeenSoFar; - } else { - ++set2SeenSoFar; - } - - if ( previous != null ) { - if ( dataPoint.second == USet.SET1 ) { - uSet2DomSet1 += set2SeenSoFar; - } else { - uSet1DomSet2 += set1SeenSoFar; - } - } - - previous = dataPoint.second; - } - - return uSet1DomSet2 < uSet2DomSet1 ? new Pair(uSet1DomSet2,USet.SET1) : new Pair(uSet2DomSet1,USet.SET2); - } - - /** - * Calculates the U-statistic associated with the one-sided hypothesis that "dominator" stochastically dominates - * the other U-set. Note that if S1 dominates S2, we want to count the occurrences of points in S2 coming before points in S1. 
- * @param observed - the observed data points, tagged by each set - * @param dominator - the set that is hypothesized to be stochastically dominating - * @return the u-statistic associated with the hypothesis that dominator stochastically dominates the other set - */ - @Requires({"observed != null","dominator != null","observed.size() > 0"}) - @Ensures({"result >= 0"}) - public static long calculateOneSidedU(TreeSet> observed,USet dominator) { - long otherBeforeDominator = 0l; - int otherSeenSoFar = 0; - for ( Pair dataPoint : observed ) { - if ( dataPoint.second != dominator ) { - ++otherSeenSoFar; - } else { - otherBeforeDominator += otherSeenSoFar; - } - } - - return otherBeforeDominator; - } - - /** - * The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible - * binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor - * calls into that recursive calculation. - * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two) - * @param m: number of set-two entries - * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) - * @param twoSided: whether the test is two sided or not. The recursive formula is symmetric, multiply by two for two-sidedness. - * @param mode: whether the mode is a point probability, or a cumulative distribution - * @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times. - */ - @Requires({"m > 0","n > 0","u >= 0"}) - @Ensures({"result != null","! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) - public static Pair calculatePRecursively(int n, int m, long u, boolean twoSided, ExactMode mode) { - if ( m > 8 && n > 5 ) { throw new GATKException(String.format("Please use the appropriate (normal or sum of uniform) approximation. 
Values n: %d, m: %d",n,m)); } - double p = mode == ExactMode.POINT ? cpr(n,m,u) : cumulativeCPR(n,m,u); - //p *= twoSided ? 2.0 : 1.0; - double z; - try { - - if ( mode == ExactMode.CUMULATIVE ) { - z = APACHE_NORMAL.inverseCumulativeProbability(p); - } else { - double sd = Math.sqrt((1.0+1.0/(1+n+m))*(n*m)*(1.0+n+m)/12); // biased variance empirically better fit to distribution then asymptotic variance - //System.out.printf("SD is %f and Max is %f and prob is %f%n",sd,1.0/Math.sqrt(sd*sd*2.0*Math.PI),p); - if ( p > 1.0/Math.sqrt(sd*sd*2.0*Math.PI) ) { // possible for p-value to be outside the range of the normal. Happens at the mean, so z is 0. - z = 0.0; - } else { - if ( u >= n*m/2 ) { - z = Math.sqrt(-2.0*(Math.log(sd)+Math.log(p)+LNSQRT2PI)); - } else { - z = -Math.sqrt(-2.0*(Math.log(sd)+Math.log(p)+LNSQRT2PI)); - } - } - } - - } catch (MathException me) { - throw new GATKException("A math exception occurred in inverting the probability",me); - } - - return new Pair(z,(twoSided ? 2.0*p : p)); - } - - /** - * Hook into CPR with sufficient warning (for testing purposes) - * calls into that recursive calculation. - * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two) - * @param m: number of set-two entries - * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) - * @return same as cpr - */ - protected static double calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(int n, int m, long u) { - return cpr(n,m,u); - } - - /** - * For testing - * - * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two) - * @param m: number of set-two entries - * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) - */ - protected static long countSequences(int n, int m, long u) { - if ( u < 0 ) { return 0; } - if ( m == 0 || n == 0 ) { return u == 0 ? 
1 : 0; } - - return countSequences(n-1,m,u-m) + countSequences(n,m-1,u); - } - - /** - * : just a shorter name for calculatePRecursively. See Mann, Whitney, [1947] - * @param n: number of set-1 entries - * @param m: number of set-2 entries - * @param u: number of times a set-2 entry as preceded a set-1 entry - * @return recursive p-value - */ - private static double cpr(int n, int m, long u) { - if ( u < 0 ) { - return 0.0; - } - if ( m == 0 || n == 0 ) { - // there are entries in set 1 or set 2, so no set-2 entry can precede a set-1 entry; thus u must be zero. - // note that this exists only for edification, as when we reach this point, the coefficient on this term is zero anyway - return ( u == 0 ) ? 1.0 : 0.0; - } - - - return (((double)n)/(n+m))*cpr(n-1,m,u-m) + (((double)m)/(n+m))*cpr(n,m-1,u); - } - - private static double cumulativeCPR(int n, int m, long u ) { - // from above: - // the null hypothesis is that {N} is stochastically less than {M}, so U has counted - // occurrences of {M}s before {N}s. We would expect that this should be less than (n*m+1)/2 under - // the null hypothesis, so we want to integrate from K=0 to K=U for cumulative cases. Always. - double p = 0.0; - // optimization using symmetry, use the least amount of sums possible - long uSym = ( u <= n*m/2 ) ? u : ((long)n)*m-u; - for ( long uu = 0; uu < uSym; uu++ ) { - p += cpr(n,m,uu); - } - // correct by 1.0-p if the optimization above was used (e.g. 1-right tail = left tail) - return (u <= n*m/2) ? 
p : 1.0-p; - } - - /** - * hook into the data tree, for testing purposes only - * @return observations - */ - protected TreeSet> getObservations() { - return observations; - } - - /** - * hook into the set sizes, for testing purposes only - * @return size set 1, size set 2 - */ - protected Pair getSetSizes() { - return new Pair(sizeSet1,sizeSet2); - } - - /** - * Validates that observations are in the correct format for a MWU test -- this is only called by the contracts API during testing - * @param tree - the collection of labeled observations - * @return true iff the tree set is valid (no INFs or NaNs, at least one data point in each set) - */ - protected static boolean validateObservations(TreeSet> tree) { - boolean seen1 = false; - boolean seen2 = false; - boolean seenInvalid = false; - for ( Pair p : tree) { - if ( ! seen1 && p.getSecond() == USet.SET1 ) { - seen1 = true; - } - - if ( ! seen2 && p.getSecond() == USet.SET2 ) { - seen2 = true; - } - - if ( Double.isNaN(p.getFirst().doubleValue()) || Double.isInfinite(p.getFirst().doubleValue())) { - seenInvalid = true; - } - - } - - return ! seenInvalid && seen1 && seen2; - } - - /** - * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values - * and to ensure that rank ties are broken at random. - */ - private static class DitheringComparator implements Comparator>, Serializable { - - public DitheringComparator() {} - - @Override - public boolean equals(Object other) { return false; } - - @Override - public int compare(Pair left, Pair right) { - double comp = Double.compare(left.first.doubleValue(),right.first.doubleValue()); - if ( comp > 0 ) { return 1; } - if ( comp < 0 ) { return -1; } - return GenomeAnalysisEngine.getRandomGenerator().nextBoolean() ? -1 : 1; - } - } - - /** - * A comparator that reaches into the pair and compares numbers without tie-braking. 
- */ - private static class NumberedPairComparator implements Comparator>, Serializable { - - public NumberedPairComparator() {} - - @Override - public boolean equals(Object other) { return false; } - - @Override - public int compare(Pair left, Pair right ) { - return Double.compare(left.first.doubleValue(),right.first.doubleValue()); - } - } - - public enum USet { SET1, SET2 } - public enum ExactMode { POINT, CUMULATIVE } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java deleted file mode 100644 index 01aa13354..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java +++ /dev/null @@ -1,1690 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.apache.commons.math.distribution.ExponentialDistribution; -import org.apache.commons.math.distribution.ExponentialDistributionImpl; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.math.BigDecimal; -import java.util.*; - -/** - * MathUtils is a static class (no instantiation allowed!) with some useful math methods. - * - * @author Kiran Garimella - */ -public class MathUtils { - - /** - * Private constructor. No instantiating this class! - */ - private MathUtils() { - } - - /** - * The smallest log10 value we'll emit from normalizeFromLog10 and other functions - * where the real-space value is 0.0. - */ - public static final double LOG10_P_OF_ZERO = -1000000.0; - public static final double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5); - public static final double LOG_ONE_HALF = -Math.log10(2.0); - public static final double LOG_ONE_THIRD = -Math.log10(3.0); - private static final double NATURAL_LOG_OF_TEN = Math.log(10.0); - private static final double SQUARE_ROOT_OF_TWO_TIMES_PI = Math.sqrt(2.0 * Math.PI); - - /** - * A helper class to maintain a cache of log10 values - */ - public static class Log10Cache { - /** - * Get the value of log10(n), expanding the cache as necessary - * @param n operand - * @return log10(n) - */ - public static double get(final int n) { - if (n < 0) - throw new ReviewedGATKException(String.format("Can't take the log of a negative number: %d", n)); - if (n >= cache.length) - ensureCacheContains(Math.max(n+10, 2*cache.length)); - /* - Array lookups are not atomic. It's possible that the reference to cache could be - changed between the time the reference is loaded and the data is fetched from the correct - offset. 
However, the value retrieved can't change, and it's guaranteed to be present in the - old reference by the conditional above. - */ - return cache[n]; - } - - /** - * Ensures that the cache contains a value for n. After completion of ensureCacheContains(n), - * #get(n) is guaranteed to return without causing a cache expansion - * @param n desired value to be precomputed - */ - public static synchronized void ensureCacheContains(final int n) { - if (n < cache.length) - return; - final double[] newCache = new double[n + 1]; - System.arraycopy(cache, 0, newCache, 0, cache.length); - for (int i=cache.length; i < newCache.length; i++) - newCache[i] = Math.log10(i); - cache = newCache; - } - - //initialize with the special case: log10(0) = NEGATIVE_INFINITY - private static double[] cache = new double[] { Double.NEGATIVE_INFINITY }; - } - - /** - * Get a random int between min and max (inclusive) using the global GATK random number generator - * - * @param min lower bound of the range - * @param max upper bound of the range - * @return a random int >= min and <= max - */ - public static int randomIntegerInRange( final int min, final int max ) { - return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min; - } - - /** - * Encapsulates the second term of Jacobian log identity for differences up to MAX_TOLERANCE - */ - private static class JacobianLogTable { - - public static final double MAX_TOLERANCE = 8.0; - - public static double get(final double difference) { - if (cache == null) - initialize(); - final int index = fastRound(difference * INV_STEP); - return cache[index]; - } - - private static synchronized void initialize() { - if (cache == null) { - final int tableSize = (int) (MAX_TOLERANCE / TABLE_STEP) + 1; - cache = new double[tableSize]; - for (int k = 0; k < cache.length; k++) - cache[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * TABLE_STEP)); - } - } - - private static final double TABLE_STEP = 0.0001; - private static final double 
INV_STEP = 1.0 / TABLE_STEP; - private static double[] cache = null; - } - - // A fast implementation of the Math.round() method. This method does not perform - // under/overflow checking, so this shouldn't be used in the general case (but is fine - // if one is already make those checks before calling in to the rounding). - public static int fastRound(final double d) { - return (d > 0.0) ? (int) (d + 0.5d) : (int) (d - 0.5d); - } - - public static double approximateLog10SumLog10(final double[] vals) { - return approximateLog10SumLog10(vals, vals.length); - } - - /** - * Calculate the approximate log10 sum of an array range. - * @param vals the input values. - * @param fromIndex the first inclusive index in the input array. - * @param toIndex index following the last element to sum in the input array (exclusive). - * @return the approximate sum. - * @throws IllegalArgumentException if {@code vals} is {@code null} or {@code fromIndex} is out of bounds - * or if {@code toIndex} is larger than - * the length of the input array or {@code fromIndex} is larger than {@code toIndex}. 
- */ - public static double approximateLog10SumLog10(final double[] vals, final int fromIndex, final int toIndex) { - if (fromIndex == toIndex) return Double.NEGATIVE_INFINITY; - final int maxElementIndex = MathUtils.maxElementIndex(vals,fromIndex,toIndex); - double approxSum = vals[maxElementIndex]; - - for (int i = fromIndex; i < toIndex; i++) { - final double val; - if (i == maxElementIndex || (val = vals[i]) == Double.NEGATIVE_INFINITY) - continue; - final double diff = approxSum - val; - if (diff < JacobianLogTable.MAX_TOLERANCE) - approxSum += JacobianLogTable.get(diff); - } - return approxSum; - } - - public static double approximateLog10SumLog10(final double[] vals, final int endIndex) { - - final int maxElementIndex = MathUtils.maxElementIndex(vals, endIndex); - double approxSum = vals[maxElementIndex]; - - for (int i = 0; i < endIndex; i++) { - if (i == maxElementIndex || vals[i] == Double.NEGATIVE_INFINITY) - continue; - - final double diff = approxSum - vals[i]; - if (diff < JacobianLogTable.MAX_TOLERANCE) { - // See notes from the 2-inout implementation below - approxSum += JacobianLogTable.get(diff); - } - } - - return approxSum; - } - - public static double approximateLog10SumLog10(final double a, final double b, final double c) { - return approximateLog10SumLog10(a, approximateLog10SumLog10(b, c)); - } - - public static double approximateLog10SumLog10(double small, double big) { - // make sure small is really the smaller value - if (small > big) { - final double t = big; - big = small; - small = t; - } - - if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY) - return big; - - final double diff = big - small; - if (diff >= JacobianLogTable.MAX_TOLERANCE) - return big; - - // OK, so |y-x| < tol: we use the following identity then: - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup with integer 
quantization - // we have pre-stored correction for 0,0.1,0.2,... 10.0 - return big + JacobianLogTable.get(diff); - } - - public static double sum(final double[] values) { - double s = 0.0; - for (double v : values) - s += v; - return s; - } - - public static long sum(final int[] x) { - long total = 0; - for (int v : x) - total += v; - return total; - } - - public static int sum(final byte[] x) { - int total = 0; - for (byte v : x) - total += (int)v; - return total; - } - - public static double percentage(int x, int base) { - return (base > 0 ? ((double) x / (double) base) * 100.0 : 0); - } - - public static double ratio(final int num, final int denom) { - if ( denom > 0 ) { - return ((double) num)/denom; - } else { - if ( num == 0 && denom == 0) { - return 0.0; - } else { - throw new ReviewedGATKException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); - } - } - } - - public static double ratio(final long num, final long denom) { - if ( denom > 0L ) { - return ((double) num)/denom; - } else { - if ( num == 0L && denom == 0L ) { - return 0.0; - } else { - throw new ReviewedGATKException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); - } - } - } - - /** - * Converts a real space array of numbers (typically probabilities) into a log10 array - * - * @param prRealSpace - * @return - */ - public static double[] toLog10(final double[] prRealSpace) { - double[] log10s = new double[prRealSpace.length]; - for (int i = 0; i < prRealSpace.length; i++) { - log10s[i] = Math.log10(prRealSpace[i]); - } - return log10s; - } - - public static double log10sumLog10(final double[] log10p, final int start) { - return log10sumLog10(log10p, start, log10p.length); - } - - public static double log10sumLog10(final double[] log10p, final int start, final int finish) { - - if (start >= finish) - return Double.NEGATIVE_INFINITY; - final int maxElementIndex = MathUtils.maxElementIndex(log10p, start, 
finish); - final double maxValue = log10p[maxElementIndex]; - if(maxValue == Double.NEGATIVE_INFINITY) - return maxValue; - double sum = 1.0; - for (int i = start; i < finish; i++) { - double curVal = log10p[i]; - double scaled_val = curVal - maxValue; - if (i == maxElementIndex || curVal == Double.NEGATIVE_INFINITY) { - continue; - } - else { - sum += Math.pow(10.0, scaled_val); - } - } - if ( Double.isNaN(sum) || sum == Double.POSITIVE_INFINITY ) { - throw new IllegalArgumentException("log10p: Values must be non-infinite and non-NAN"); - } - return maxValue + (sum != 1.0 ? Math.log10(sum) : 0.0); - } - - public static double sumLog10(final double[] log10values) { - return Math.pow(10.0, log10sumLog10(log10values)); - } - - public static double log10sumLog10(final double[] log10values) { - return log10sumLog10(log10values, 0); - } - - public static boolean wellFormedDouble(final double val) { - return !Double.isInfinite(val) && !Double.isNaN(val); - } - - public static double bound(final double value, final double minBoundary, final double maxBoundary) { - return Math.max(Math.min(value, maxBoundary), minBoundary); - } - - public static boolean isBounded(final double val, final double lower, final double upper) { - return val >= lower && val <= upper; - } - - public static boolean isPositive(final double val) { - return !isNegativeOrZero(val); - } - - public static boolean isPositiveOrZero(final double val) { - return isBounded(val, 0.0, Double.POSITIVE_INFINITY); - } - - public static boolean isNegativeOrZero(final double val) { - return isBounded(val, Double.NEGATIVE_INFINITY, 0.0); - } - - public static boolean isNegative(final double val) { - return !isPositiveOrZero(val); - } - - /** - * Compares double values for equality (within 1e-6), or inequality. - * - * @param a the first double value - * @param b the second double value - * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. 
- */ - public static byte compareDoubles(final double a, final double b) { - return compareDoubles(a, b, 1e-6); - } - - /** - * Compares double values for equality (within epsilon), or inequality. - * - * @param a the first double value - * @param b the second double value - * @param epsilon the precision within which two double values will be considered equal - * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. - */ - public static byte compareDoubles(final double a, final double b, final double epsilon) { - if (Math.abs(a - b) < epsilon) { - return 0; - } - if (a > b) { - return -1; - } - return 1; - } - - /** - * Calculate f(x) = Normal(x | mu = mean, sigma = sd) - * @param mean the desired mean of the Normal distribution - * @param sd the desired standard deviation of the Normal distribution - * @param x the value to evaluate - * @return a well-formed double - */ - public static double normalDistribution(final double mean, final double sd, final double x) { - if( sd < 0 ) - throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); - if ( ! wellFormedDouble(mean) || ! wellFormedDouble(sd) || ! wellFormedDouble(x) ) - throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); - double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); - double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); - return a * b; - } - - /** - * Calculate f(x) = log10 ( Normal(x | mu = mean, sigma = sd) ) - * @param mean the desired mean of the Normal distribution - * @param sd the desired standard deviation of the Normal distribution - * @param x the value to evaluate - * @return a well-formed double - */ - - public static double normalDistributionLog10(final double mean, final double sd, final double x) { - if( sd < 0 ) - throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); - if ( ! wellFormedDouble(mean) || ! 
wellFormedDouble(sd) || ! wellFormedDouble(x) ) - throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); - final double a = -1.0 * Math.log10(sd * SQUARE_ROOT_OF_TWO_TIMES_PI); - final double b = -1.0 * (square(x - mean) / (2.0 * square(sd))) / NATURAL_LOG_OF_TEN; - return a + b; - } - - /** - * Calculate f(x) = x^2 - * @param x the value to square - * @return x * x - */ - public static double square(final double x) { - return x * x; - } - - /** - * Calculates the log10 of the binomial coefficient. Designed to prevent - * overflows even with very large numbers. - * - * @param n total number of trials - * @param k number of successes - * @return the log10 of the binomial coefficient - */ - public static double binomialCoefficient(final int n, final int k) { - return Math.pow(10, log10BinomialCoefficient(n, k)); - } - - /** - * @see #binomialCoefficient(int, int) with log10 applied to result - */ - public static double log10BinomialCoefficient(final int n, final int k) { - if ( n < 0 ) { - throw new IllegalArgumentException("n: Must have non-negative number of trials"); - } - if ( k > n || k < 0 ) { - throw new IllegalArgumentException("k: Must have non-negative number of successes, and no more successes than number of trials"); - } - - return log10Factorial(n) - log10Factorial(k) - log10Factorial(n - k); - } - - /** - * Computes a binomial probability. This is computed using the formula - *

- * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) - *

- * where n is the number of trials, k is the number of successes, and p is the probability of success - * - * @param n number of Bernoulli trials - * @param k number of successes - * @param p probability of success - * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. - */ - public static double binomialProbability(final int n, final int k, final double p) { - return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); - } - - /** - * @see #binomialProbability(int, int, double) with log10 applied to result - */ - public static double log10BinomialProbability(final int n, final int k, final double log10p) { - if ( log10p > 1e-18 ) - throw new IllegalArgumentException("log10p: Log-probability must be 0 or less"); - double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); - return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); - } - - /** - * @see #binomialProbability(int, int, double) with p=0.5 - */ - public static double binomialProbability(final int n, final int k) { - return Math.pow(10, log10BinomialProbability(n, k)); - } - - /** - * @see #binomialProbability(int, int, double) with p=0.5 and log10 applied to result - */ - public static double log10BinomialProbability(final int n, final int k) { - return log10BinomialCoefficient(n, k) + (n * FAIR_BINOMIAL_PROB_LOG10_0_5); - } - - /** A memoization container for {@link #binomialCumulativeProbability(int, int, int)}. Synchronized to accomodate multithreading. */ - private static final Map BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE = - Collections.synchronizedMap(new LRUCache(10_000)); - - /** - * Primitive integer-triplet bijection into long. Returns null when the bijection function fails (in lieu of an exception), which will - * happen when: any value is negative or larger than a short. This method is optimized for speed; it is not intended to serve as a - * utility function. 
- */ - static Long fastGenerateUniqueHashFromThreeIntegers(final int one, final int two, final int three) { - if (one < 0 || two < 0 || three < 0 || Short.MAX_VALUE < one || Short.MAX_VALUE < two || Short.MAX_VALUE < three) { - return null; - } else { - long result = 0; - result += (short) one; - result <<= 16; - result += (short) two; - result <<= 16; - result += (short) three; - return result; - } - } - - /** - * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. - * Assumes that the probability of a successful hit is fair (i.e. 0.5). - * - * This pure function is memoized because of its expensive BigDecimal calculations. - * - * @param n number of attempts for the number of hits - * @param k_start start (inclusive) of the cumulant sum (over hits) - * @param k_end end (inclusive) of the cumulant sum (over hits) - * @return - returns the cumulative probability - */ - public static double binomialCumulativeProbability(final int n, final int k_start, final int k_end) { - if ( k_end > n ) - throw new IllegalArgumentException(String.format("Value for k_end (%d) is greater than n (%d)", k_end, n)); - - // Fetch cached value, if applicable. 
- final Long memoizationKey = fastGenerateUniqueHashFromThreeIntegers(n, k_start, k_end); - final Double memoizationCacheResult; - if (memoizationKey != null) { - memoizationCacheResult = BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.get(memoizationKey); - } else { - memoizationCacheResult = null; - } - - final double result; - if (memoizationCacheResult != null) { - result = memoizationCacheResult; - } else { - double cumProb = 0.0; - double prevProb; - BigDecimal probCache = BigDecimal.ZERO; - - for (int hits = k_start; hits <= k_end; hits++) { - prevProb = cumProb; - final double probability = binomialProbability(n, hits); - cumProb += probability; - if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision - probCache = probCache.add(new BigDecimal(prevProb)); - cumProb = 0.0; - hits--; // repeat loop - // prevProb changes at start of loop - } - } - - result = probCache.add(new BigDecimal(cumProb)).doubleValue(); - if (memoizationKey != null) { - BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.put(memoizationKey, result); - } - } - return result; - } - - private static final double LOG1MEXP_THRESHOLD = Math.log(0.5); - - private static final double LN_10 = Math.log(10); - - /** - * Calculates {@code log(1-exp(a))} without loosing precision. - * - *

- * This is based on the approach described in: - * - *

- *

- * Maechler M, Accurately Computing log(1-exp(-|a|)) Assessed by the Rmpfr package, 2012
- *
Online document. - * - *

- * - * @param a the input exponent. - * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. - */ - public static double log1mexp(final double a) { - if (a > 0) return Double.NaN; - if (a == 0) return Double.NEGATIVE_INFINITY; - - return (a < LOG1MEXP_THRESHOLD) ? Math.log1p(-Math.exp(a)) : Math.log(-Math.expm1(a)); - } - - /** - * Calculates {@code log10(1-10^a)} without loosing precision. - * - *

- * This is based on the approach described in: - * - *

- *

- * Maechler M, Accurately Computing log(1-exp(-|a|)) Assessed by the Rmpfr package, 2012
- * Online document. - *

- * - * @param a the input exponent. - * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. - */ - public static double log10OneMinusPow10(final double a) { - if (a > 0) return Double.NaN; - if (a == 0) return Double.NEGATIVE_INFINITY; - final double b = a * LN_10; - return log1mexp(b) / LN_10; - } - - /** - * Calculates the log10 of the multinomial coefficient. Designed to prevent - * overflows even with very large numbers. - * - * @param n total number of trials - * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) - * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. - */ - public static double log10MultinomialCoefficient(final int n, final int[] k) { - if ( n < 0 ) - throw new IllegalArgumentException("n: Must have non-negative number of trials"); - double denominator = 0.0; - int sum = 0; - for (int x : k) { - if ( x < 0 ) - throw new IllegalArgumentException("x element of k: Must have non-negative observations of group"); - if ( x > n ) - throw new IllegalArgumentException("x element of k, n: Group observations must be bounded by k"); - denominator += log10Factorial(x); - sum += x; - } - if ( sum != n ) - throw new IllegalArgumentException("k and n: Sum of observations in multinomial must sum to total number of trials"); - return log10Factorial(n) - denominator; - } - - /** - * Computes the log10 of the multinomial distribution probability given a vector - * of log10 probabilities. Designed to prevent overflows even with very large numbers. 
- * - * @param n number of trials - * @param k array of number of successes for each possibility - * @param log10p array of log10 probabilities - * @return - */ - public static double log10MultinomialProbability(final int n, final int[] k, final double[] log10p) { - if (log10p.length != k.length) - throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); - double log10Prod = 0.0; - for (int i = 0; i < log10p.length; i++) { - if ( log10p[i] > 1e-18 ) - throw new IllegalArgumentException("log10p: Log-probability must be <= 0"); - log10Prod += log10p[i] * k[i]; - } - return log10MultinomialCoefficient(n, k) + log10Prod; - } - - /** - * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. - * This is computed using the formula: - *

- * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] - *

- * where xi represents the number of times outcome i was observed, n is the number of total observations. - * In this implementation, the value of n is inferred as the sum over i of xi. - * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @return the multinomial of the specified configuration. - */ - public static double multinomialCoefficient(final int[] k) { - int n = 0; - for (int xi : k) { - n += xi; - } - - return Math.pow(10, log10MultinomialCoefficient(n, k)); - } - - /** - * Computes a multinomial probability efficiently avoiding overflow even for large numbers. - * This is computed using the formula: - *

- * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) - *

- * where xi represents the number of times outcome i was observed, n is the number of total observations, and - * pi represents the probability of the i-th outcome to occur. In this implementation, the value of n is - * inferred as the sum over i of xi. - * - * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed - * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur - * @return the multinomial probability of the specified configuration. - */ - public static double multinomialProbability(final int[] k, final double[] p) { - if (p.length != k.length) - throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); - - int n = 0; - double[] log10P = new double[p.length]; - for (int i = 0; i < p.length; i++) { - log10P[i] = Math.log10(p[i]); - n += k[i]; - } - return Math.pow(10, log10MultinomialProbability(n, k, log10P)); - } - - /** - * calculate the Root Mean Square of an array of integers - * - * @param x an byte[] of numbers - * @return the RMS of the specified numbers. - */ - public static double rms(final byte[] x) { - if (x.length == 0) - return 0.0; - - double rms = 0.0; - for (int i : x) - rms += i * i; - rms /= x.length; - return Math.sqrt(rms); - } - - /** - * calculate the Root Mean Square of an array of integers - * - * @param x an int[] of numbers - * @return the RMS of the specified numbers. - */ - public static double rms(final int[] x) { - if (x.length == 0) - return 0.0; - - double rms = 0.0; - for (int i : x) - rms += i * i; - rms /= x.length; - return Math.sqrt(rms); - } - - /** - * calculate the Root Mean Square of an array of doubles - * - * @param x a double[] of numbers - * @return the RMS of the specified numbers. 
- */ - public static double rms(final Double[] x) { - if (x.length == 0) - return 0.0; - - double rms = 0.0; - for (Double i : x) - rms += i * i; - rms /= x.length; - return Math.sqrt(rms); - } - - public static double rms(final Collection l) { - if (l.size() == 0) - return 0.0; - - double rms = 0.0; - for (int i : l) - rms += i * i; - rms /= l.size(); - return Math.sqrt(rms); - } - - public static double distanceSquared(final double[] x, final double[] y) { - double dist = 0.0; - for (int iii = 0; iii < x.length; iii++) { - dist += (x[iii] - y[iii]) * (x[iii] - y[iii]); - } - return dist; - } - - public static double round(final double num, final int digits) { - double result = num * Math.pow(10.0, (double) digits); - result = Math.round(result); - result = result / Math.pow(10.0, (double) digits); - return result; - } - - /** - * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). - * - * @param array the array to be normalized - * @param takeLog10OfOutput if true, the output will be transformed back into log10 units - * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed - */ - public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput) { - return normalizeFromLog10(array, takeLog10OfOutput, false); - } - - /** - * See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space - * - * @param array - * @param takeLog10OfOutput - * @param keepInLogSpace - * - * @return - */ - public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput, final boolean keepInLogSpace) { - // for precision purposes, we need to add (or really subtract, since they're - // all negative) the largest value; also, we need to convert to normal-space. 
- double maxValue = arrayMax(array); - - // we may decide to just normalize in log space without converting to linear space - if (keepInLogSpace) { - for (int i = 0; i < array.length; i++) { - array[i] -= maxValue; - } - return array; - } - - // default case: go to linear space - double[] normalized = new double[array.length]; - - for (int i = 0; i < array.length; i++) - normalized[i] = Math.pow(10, array[i] - maxValue); - - // normalize - double sum = 0.0; - for (int i = 0; i < array.length; i++) - sum += normalized[i]; - for (int i = 0; i < array.length; i++) { - double x = normalized[i] / sum; - if (takeLog10OfOutput) { - x = Math.log10(x); - if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) ) - x = array[i] - maxValue; - } - - normalized[i] = x; - } - - return normalized; - } - - /** - * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). - * - * @param array the array to be normalized - * @return a newly allocated array corresponding the normalized values in array - */ - public static double[] normalizeFromLog10(final double[] array) { - return normalizeFromLog10(array, false); - } - - /** - * normalizes the real-space probability array. - * - * Does not assume anything about the values in the array, beyond that no elements are below 0. It's ok - * to have values in the array of > 1, or have the sum go above 0. 
- * - * @param array the array to be normalized - * @return a newly allocated array corresponding the normalized values in array - */ - @Requires("array != null") - @Ensures({"result != null"}) - public static double[] normalizeFromRealSpace(final double[] array) { - if ( array.length == 0 ) - return array; - - final double sum = sum(array); - final double[] normalized = new double[array.length]; - if ( sum < 0.0 ) throw new IllegalArgumentException("Values in probability array sum to a negative number " + sum); - for ( int i = 0; i < array.length; i++ ) { - normalized[i] = array[i] / sum; - } - return normalized; - } - - public static int maxElementIndex(final double[] array) { - return maxElementIndex(array, array.length); - } - - public static int maxElementIndex(final double[] array, final int start, final int endIndex) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - if (start > endIndex) { - throw new IllegalArgumentException("Start cannot be after end."); - } - - int maxI = start; - for (int i = (start+1); i < endIndex; i++) { - if (array[i] > array[maxI]) - maxI = i; - } - return maxI; - } - - public static int maxElementIndex(final double[] array, final int endIndex) { - return maxElementIndex(array, 0, endIndex); - } - - public static int maxElementIndex(final int[] array) { - return maxElementIndex(array, array.length); - } - - public static int maxElementIndex(final byte[] array) { - return maxElementIndex(array, array.length); - } - - public static int maxElementIndex(final int[] array, final int endIndex) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int maxI = 0; - for (int i = 1; i < endIndex; i++) { - if (array[i] > array[maxI]) - maxI = i; - } - return maxI; - } - - public static int maxElementIndex(final byte[] array, final int endIndex) { - if (array == null || array.length == 0) - throw new 
IllegalArgumentException("Array cannot be null!"); - - int maxI = 0; - for (int i = 1; i < endIndex; i++) { - if (array[i] > array[maxI]) - maxI = i; - } - - return maxI; - } - - public static int arrayMax(final int[] array) { - return array[maxElementIndex(array)]; - } - - - public static double arrayMax(final double[] array) { - return array[maxElementIndex(array)]; - } - - public static double arrayMax(final double[] array, final int endIndex) { - return array[maxElementIndex(array, endIndex)]; - } - - public static double arrayMin(final double[] array) { - return array[minElementIndex(array)]; - } - - public static int arrayMin(final int[] array) { - return array[minElementIndex(array)]; - } - - public static byte arrayMin(final byte[] array) { - return array[minElementIndex(array)]; - } - - /** - * Compute the min element of a List - * @param array a non-empty list of integer - * @return the min - */ - public static int arrayMin(final List array) { - if ( array == null || array.isEmpty() ) throw new IllegalArgumentException("Array must be non-null and non-empty"); - int min = array.get(0); - for ( final int i : array ) - if ( i < min ) min = i; - return min; - } - - /** - * Compute the median element of the list of integers - * @param array a list of integers - * @return the median element - */ - public static > T median(final List array) { - /* TODO -- from Valentin - the current implementation is not the usual median when the input is of even length. More concretely it returns the ith element of the list where i = floor(input.size() / 2). - - But actually that is not the "usual" definition of a median, as it is supposed to return the average of the two middle values when the sample length is an even number (i.e. median(1,2,3,4,5,6) == 3.5). 
[Sources: R and wikipedia] - - My suggestion for a solution is then: - - unify median and medianDoubles to public static T median(Collection) - check on null elements and throw an exception if there are any or perhaps return a null; documented in the javadoc. - relocate, rename and refactor MathUtils.median(X) to Utils.ithElement(X,X.size()/2) - In addition, the current median implementation sorts the whole input list witch is O(n log n). However find out the ith element (thus calculate the median) can be done in O(n) - */ - if ( array == null ) throw new IllegalArgumentException("Array must be non-null"); - final int size = array.size(); - if ( size == 0 ) throw new IllegalArgumentException("Array cannot have size 0"); - else if ( size == 1 ) return array.get(0); - else { - final ArrayList sorted = new ArrayList<>(array); - Collections.sort(sorted); - return sorted.get(size / 2); - } - } - - public static int minElementIndex(final double[] array) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int minI = 0; - for (int i = 1; i < array.length; i++) { - if (array[i] < array[minI]) - minI = i; - } - - return minI; - } - - public static int minElementIndex(final byte[] array) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int minI = 0; - for (int i = 1; i < array.length; i++) { - if (array[i] < array[minI]) - minI = i; - } - - return minI; - } - - public static int minElementIndex(final int[] array) { - if (array == null || array.length == 0) - throw new IllegalArgumentException("Array cannot be null!"); - - int minI = 0; - for (int i = 1; i < array.length; i++) { - if (array[i] < array[minI]) - minI = i; - } - - return minI; - } - - public static int arrayMaxInt(final List array) { - if (array == null) - throw new IllegalArgumentException("Array cannot be null!"); - if (array.size() == 0) - throw new IllegalArgumentException("Array size 
cannot be 0!"); - - int m = array.get(0); - for (int e : array) - m = Math.max(m, e); - return m; - } - - public static int sum(final List list ) { - int sum = 0; - for ( Integer i : list ) { - sum += i; - } - return sum; - } - - public static double average(final List vals, final int maxI) { - long sum = 0L; - - int i = 0; - for (long x : vals) { - if (i > maxI) - break; - sum += x; - i++; - } - - return (1.0 * sum) / i; - } - - public static double average(final List vals) { - return average(vals, vals.size()); - } - - public static int countOccurrences(final char c, final String s) { - int count = 0; - for (int i = 0; i < s.length(); i++) { - count += s.charAt(i) == c ? 1 : 0; - } - return count; - } - - public static int countOccurrences(T x, List l) { - int count = 0; - for (T y : l) { - if (x.equals(y)) - count++; - } - - return count; - } - - public static int countOccurrences(byte element, byte[] array) { - int count = 0; - for (byte y : array) { - if (element == y) - count++; - } - - return count; - } - - public static int countOccurrences(final boolean element, final boolean[] array) { - int count = 0; - for (final boolean b : array) { - if (element == b) - count++; - } - - return count; - } - - - /** - * Returns n random indices drawn with replacement from the range 0..(k-1) - * - * @param n the total number of indices sampled from - * @param k the number of random indices to draw (with replacement) - * @return a list of k random indices ranging from 0 to (n-1) with possible duplicates - */ - static public ArrayList sampleIndicesWithReplacement(final int n, final int k) { - - ArrayList chosen_balls = new ArrayList(k); - for (int i = 0; i < k; i++) { - //Integer chosen_ball = balls[rand.nextInt(k)]; - chosen_balls.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(n)); - //balls.remove(chosen_ball); - } - - return chosen_balls; - } - - /** - * Returns n random indices drawn without replacement from the range 0..(k-1) - * - * @param n the total number 
of indices sampled from - * @param k the number of random indices to draw (without replacement) - * @return a list of k random indices ranging from 0 to (n-1) without duplicates - */ - static public ArrayList sampleIndicesWithoutReplacement(final int n, final int k) { - ArrayList chosen_balls = new ArrayList(k); - - for (int i = 0; i < n; i++) { - chosen_balls.add(i); - } - - Collections.shuffle(chosen_balls, GenomeAnalysisEngine.getRandomGenerator()); - - //return (ArrayList) chosen_balls.subList(0, k); - return new ArrayList(chosen_balls.subList(0, k)); - } - - /** - * Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times - * - * @param indices the list of indices for elements to extract - * @param list the list from which the elements should be extracted - * @param the template type of the ArrayList - * @return a new ArrayList consisting of the elements at the specified indices - */ - static public ArrayList sliceListByIndices(final List indices, final List list) { - ArrayList subset = new ArrayList(); - - for (int i : indices) { - subset.add(list.get(i)); - } - - return subset; - } - - /** - * Given two log-probability vectors, compute log of vector product of them: - * in Matlab notation, return log10(10.*x'*10.^y) - * @param x vector 1 - * @param y vector 2 - * @return a double representing log (dotProd(10.^x,10.^y) - */ - public static double logDotProduct(final double [] x, final double[] y) { - if (x.length != y.length) - throw new ReviewedGATKException("BUG: Vectors of different lengths"); - - double tmpVec[] = new double[x.length]; - - for (int k=0; k < tmpVec.length; k++ ) { - tmpVec[k] = x[k]+y[k]; - } - - return log10sumLog10(tmpVec); - - - - } - - /** - * Check that the log10 prob vector vector is well formed - * - * @param vector - * @param expectedSize - * @param shouldSumToOne - * - * @return true if vector is well-formed, false otherwise - */ - public static boolean 
goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { - if ( vector.length != expectedSize ) return false; - - for ( final double pr : vector ) { - if ( ! goodLog10Probability(pr) ) - return false; - } - - if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) - return false; - - return true; // everything is good - } - - /** - * Checks that the result is a well-formed log10 probability - * - * @param result a supposedly well-formed log10 probability value. By default allows - * -Infinity values, as log10(0.0) == -Infinity. - * @return true if result is really well formed - */ - public static boolean goodLog10Probability(final double result) { - return goodLog10Probability(result, true); - } - - /** - * Checks that the result is a well-formed log10 probability - * - * @param result a supposedly well-formed log10 probability value - * @param allowNegativeInfinity should we consider a -Infinity value ok? - * @return true if result is really well formed - */ - public static boolean goodLog10Probability(final double result, final boolean allowNegativeInfinity) { - return result <= 0.0 && result != Double.POSITIVE_INFINITY && (allowNegativeInfinity || result != Double.NEGATIVE_INFINITY) && ! Double.isNaN(result); - } - - /** - * Checks that the result is a well-formed probability - * - * @param result a supposedly well-formed probability value - * @return true if result is really well formed - */ - public static boolean goodProbability(final double result) { - return result >= 0.0 && result <= 1.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); - } - - /** - * A utility class that computes on the fly average and standard deviation for a stream of numbers. - * The number of observations does not have to be known in advance, and can be also very big (so that - * it could overflow any naive summation-based scheme or cause loss of precision). 
- * Instead, adding a new number observed - * to a sample with add(observed) immediately updates the instance of this object so that - * it contains correct mean and standard deviation for all the numbers seen so far. Source: Knuth, vol.2 - * (see also e.g. http://www.johndcook.com/standard_deviation.html for online reference). - */ - public static class RunningAverage { - private double mean = 0.0; - private double s = 0.0; - private long obs_count = 0; - - public void add(double obs) { - obs_count++; - double oldMean = mean; - mean += (obs - mean) / obs_count; // update mean - s += (obs - oldMean) * (obs - mean); - } - - public void addAll(Collection col) { - for (Number o : col) { - add(o.doubleValue()); - } - } - - public double mean() { - return mean; - } - - public double stddev() { - return Math.sqrt(s / (obs_count - 1)); - } - - public double var() { - return s / (obs_count - 1); - } - - public long observationCount() { - return obs_count; - } - - public RunningAverage clone() { - RunningAverage ra = new RunningAverage(); - ra.mean = this.mean; - ra.s = this.s; - ra.obs_count = this.obs_count; - return ra; - } - - public void merge(RunningAverage other) { - if (this.obs_count > 0 || other.obs_count > 0) { // if we have any observations at all - this.mean = (this.mean * this.obs_count + other.mean * other.obs_count) / (this.obs_count + other.obs_count); - this.s += other.s; - } - this.obs_count += other.obs_count; - } - } - - // - // useful common utility routines - // - - static public double max(double x0, double x1, double x2) { - double a = Math.max(x0, x1); - return Math.max(a, x2); - } - - /** - * Converts LN to LOG10 - * - * @param ln log(x) - * @return log10(x) - */ - public static double lnToLog10(final double ln) { - return ln * Math.log10(Math.E); - } - - /** - * Constants to simplify the log gamma function calculation. 
- */ - private static final double zero = 0.0, one = 1.0, half = .5, a0 = 7.72156649015328655494e-02, a1 = 3.22467033424113591611e-01, a2 = 6.73523010531292681824e-02, a3 = 2.05808084325167332806e-02, a4 = 7.38555086081402883957e-03, a5 = 2.89051383673415629091e-03, a6 = 1.19270763183362067845e-03, a7 = 5.10069792153511336608e-04, a8 = 2.20862790713908385557e-04, a9 = 1.08011567247583939954e-04, a10 = 2.52144565451257326939e-05, a11 = 4.48640949618915160150e-05, tc = 1.46163214496836224576e+00, tf = -1.21486290535849611461e-01, tt = -3.63867699703950536541e-18, t0 = 4.83836122723810047042e-01, t1 = -1.47587722994593911752e-01, t2 = 6.46249402391333854778e-02, t3 = -3.27885410759859649565e-02, t4 = 1.79706750811820387126e-02, t5 = -1.03142241298341437450e-02, t6 = 6.10053870246291332635e-03, t7 = -3.68452016781138256760e-03, t8 = 2.25964780900612472250e-03, t9 = -1.40346469989232843813e-03, t10 = 8.81081882437654011382e-04, t11 = -5.38595305356740546715e-04, t12 = 3.15632070903625950361e-04, t13 = -3.12754168375120860518e-04, t14 = 3.35529192635519073543e-04, u0 = -7.72156649015328655494e-02, u1 = 6.32827064025093366517e-01, u2 = 1.45492250137234768737e+00, u3 = 9.77717527963372745603e-01, u4 = 2.28963728064692451092e-01, u5 = 1.33810918536787660377e-02, v1 = 2.45597793713041134822e+00, v2 = 2.12848976379893395361e+00, v3 = 7.69285150456672783825e-01, v4 = 1.04222645593369134254e-01, v5 = 3.21709242282423911810e-03, s0 = -7.72156649015328655494e-02, s1 = 2.14982415960608852501e-01, s2 = 3.25778796408930981787e-01, s3 = 1.46350472652464452805e-01, s4 = 2.66422703033638609560e-02, s5 = 1.84028451407337715652e-03, s6 = 3.19475326584100867617e-05, r1 = 1.39200533467621045958e+00, r2 = 7.21935547567138069525e-01, r3 = 1.71933865632803078993e-01, r4 = 1.86459191715652901344e-02, r5 = 7.77942496381893596434e-04, r6 = 7.32668430744625636189e-06, w0 = 4.18938533204672725052e-01, w1 = 8.33333333333329678849e-02, w2 = -2.77777777728775536470e-03, w3 = 
7.93650558643019558500e-04, w4 = -5.95187557450339963135e-04, w5 = 8.36339918996282139126e-04, w6 = -1.63092934096575273989e-03; - - /** - * Efficient rounding functions to simplify the log gamma function calculation - * double to long with 32 bit shift - */ - private static final int HI(final double x) { - return (int) (Double.doubleToLongBits(x) >> 32); - } - - /** - * Efficient rounding functions to simplify the log gamma function calculation - * double to long without shift - */ - private static final int LO(final double x) { - return (int) Double.doubleToLongBits(x); - } - - /** - * Most efficent implementation of the lnGamma (FDLIBM) - * Use via the log10Gamma wrapper method. - */ - private static double lnGamma(final double x) { - double t, y, z, p, p1, p2, p3, q, r, w; - int i; - - int hx = HI(x); - int lx = LO(x); - - /* purge off +-inf, NaN, +-0, and negative arguments */ - int ix = hx & 0x7fffffff; - if (ix >= 0x7ff00000) - return Double.POSITIVE_INFINITY; - if ((ix | lx) == 0 || hx < 0) - return Double.NaN; - if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */ - return -Math.log(x); - } - - /* purge off 1 and 2 */ - if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) - r = 0; - /* for x < 2.0 */ - else if (ix < 0x40000000) { - if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ - r = -Math.log(x); - if (ix >= 0x3FE76944) { - y = one - x; - i = 0; - } - else if (ix >= 0x3FCDA661) { - y = x - (tc - one); - i = 1; - } - else { - y = x; - i = 2; - } - } - else { - r = zero; - if (ix >= 0x3FFBB4C3) { - y = 2.0 - x; - i = 0; - } /* [1.7316,2] */ - else if (ix >= 0x3FF3B4C4) { - y = x - tc; - i = 1; - } /* [1.23,1.73] */ - else { - y = x - one; - i = 2; - } - } - - switch (i) { - case 0: - z = y * y; - p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); - p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); - p = y * p1 + p2; - r += (p - 0.5 * y); - break; - case 1: - z = y * y; - w = z * y; - p1 = 
t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ - p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); - p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); - p = z * p1 - (tt - w * (p2 + y * p3)); - r += (tf + p); - break; - case 2: - p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); - p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); - r += (-0.5 * y + p1 / p2); - } - } - else if (ix < 0x40200000) { /* x < 8.0 */ - i = (int) x; - t = zero; - y = x - (double) i; - p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); - q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); - r = half * y + p / q; - z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ - switch (i) { - case 7: - z *= (y + 6.0); /* FALLTHRU */ - case 6: - z *= (y + 5.0); /* FALLTHRU */ - case 5: - z *= (y + 4.0); /* FALLTHRU */ - case 4: - z *= (y + 3.0); /* FALLTHRU */ - case 3: - z *= (y + 2.0); /* FALLTHRU */ - r += Math.log(z); - break; - } - /* 8.0 <= x < 2**58 */ - } - else if (ix < 0x43900000) { - t = Math.log(x); - z = one / x; - y = z * z; - w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); - r = (x - half) * (t - one) + w; - } - else - /* 2**58 <= x <= inf */ - r = x * (Math.log(x) - one); - return r; - } - - /** - * Calculates the log10 of the gamma function for x using the efficient FDLIBM - * implementation to avoid overflows and guarantees high accuracy even for large - * numbers. - * - * @param x the x parameter - * @return the log10 of the gamma function at x. 
- */ - public static double log10Gamma(final double x) { - return lnToLog10(lnGamma(x)); - } - - public static double factorial(final int x) { - // avoid rounding errors caused by fact that 10^log(x) might be slightly lower than x and flooring may produce 1 less than real value - return (double)Math.round(Math.pow(10, log10Factorial(x))); - } - - public static double log10Factorial(final int x) { - if (x >= Log10FactorialCache.size() || x < 0) - return log10Gamma(x + 1); - else - return Log10FactorialCache.get(x); - } - - /** - * Wrapper class so that the log10Factorial array is only calculated if it's used - */ - private static class Log10FactorialCache { - - /** - * The size of the precomputed cache. Must be a positive number! - */ - private static final int CACHE_SIZE = 10_000; - - public static int size() { return CACHE_SIZE; } - - public static double get(final int n) { - if (cache == null) - initialize(); - return cache[n]; - } - - private static synchronized void initialize() { - if (cache == null) { - Log10Cache.ensureCacheContains(CACHE_SIZE); - cache = new double[CACHE_SIZE]; - cache[0] = 0.0; - for (int k = 1; k < cache.length; k++) - cache[k] = cache[k-1] + Log10Cache.get(k); - } - } - - private static double[] cache = null; - } - - /** - * Adds two arrays together and returns a new array with the sum. 
- * - * @param a one array - * @param b another array - * @return a new array with the sum of a and b - */ - @Requires("a.length == b.length") - @Ensures("result.length == a.length") - public static int[] addArrays(final int[] a, final int[] b) { - int[] c = new int[a.length]; - for (int i = 0; i < a.length; i++) - c[i] = a[i] + b[i]; - return c; - } - - /** Same routine, unboxed types for efficiency - * - * @param x First vector - * @param y Second vector - * @return Vector of same length as x and y so that z[k] = x[k]+y[k] - */ - public static double[] vectorSum(final double[]x, final double[] y) { - if (x.length != y.length) - throw new ReviewedGATKException("BUG: Lengths of x and y must be the same"); - - double[] result = new double[x.length]; - for (int k=0; k log10LinearRange(final int start, final int stop, final double eps) { - final LinkedList values = new LinkedList<>(); - final double log10range = Math.log10(stop - start); - - if ( start == 0 ) - values.add(0); - - double i = 0.0; - while ( i <= log10range ) { - final int index = (int)Math.round(Math.pow(10, i)) + start; - if ( index < stop && (values.peekLast() == null || values.peekLast() != index ) ) - values.add(index); - i += eps; - } - - if ( values.peekLast() == null || values.peekLast() != stop ) - values.add(stop); - - return values; - } - - /** - * Compute in a numerical correct way the quantity log10(1-x) - * - * Uses the approximation log10(1-x) = log10(1/x - 1) + log10(x) to avoid very quick underflow - * in 1-x when x is very small - * - * @param x a positive double value between 0.0 and 1.0 - * @return an estimate of log10(1-x) - */ - @Requires("x >= 0.0 && x <= 1.0") - @Ensures("result <= 0.0") - public static double log10OneMinusX(final double x) { - if ( x == 1.0 ) - return Double.NEGATIVE_INFINITY; - else if ( x == 0.0 ) - return 0.0; - else { - final double d = Math.log10(1 / x - 1) + Math.log10(x); - return Double.isInfinite(d) || d > 0.0 ? 
0.0 : d; - } - } - - /** - * Draw N random elements from list - * @param list - the list from which to draw randomly - * @param N - the number of elements to draw - */ - public static List randomSubset(final List list, final int N) { - if (list.size() <= N) { - return list; - } - - return sliceListByIndices(sampleIndicesWithoutReplacement(list.size(),N),list); - } - - /** - * Draw N random elements from list with replacement - * @param list - the list from which to draw randomly - * @param N - the number of elements to draw - */ - public static List randomSample(final List list, final int N) { - if (list.isEmpty() ) { - return list; - } - return sliceListByIndices(sampleIndicesWithReplacement(list.size(),N),list); - } - - /** - * Return the likelihood of observing the counts of categories having sampled a population - * whose categorial frequencies are distributed according to a Dirichlet distribution - * @param dirichletParams - params of the prior dirichlet distribution - * @param dirichletSum - the sum of those parameters - * @param counts - the counts of observation in each category - * @param countSum - the sum of counts (number of trials) - * @return - associated likelihood - */ - public static double dirichletMultinomial(final double[] dirichletParams, final double dirichletSum, - final int[] counts, final int countSum) { - if ( dirichletParams.length != counts.length ) { - throw new IllegalStateException("The number of dirichlet parameters must match the number of categories"); - } - // todo -- lots of lnGammas here. 
At some point we can safely switch to x * ( ln(x) - 1) - double likelihood = log10MultinomialCoefficient(countSum,counts); - likelihood += log10Gamma(dirichletSum); - likelihood -= log10Gamma(dirichletSum+countSum); - for ( int idx = 0; idx < counts.length; idx++ ) { - likelihood += log10Gamma(counts[idx] + dirichletParams[idx]); - likelihood -= log10Gamma(dirichletParams[idx]); - } - - return likelihood; - } - - public static double dirichletMultinomial(double[] params, int[] counts) { - return dirichletMultinomial(params,sum(params),counts,(int) sum(counts)); - } - - public static ExponentialDistribution exponentialDistribution( final double mean ) { - return new ExponentialDistributionImpl(mean); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MendelianViolation.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MendelianViolation.java deleted file mode 100644 index 75666a7e8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MendelianViolation.java +++ /dev/null @@ -1,460 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils; - -import org.broadinstitute.gatk.engine.samples.Sample; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.GenotypeType; -import htsjdk.variant.variantcontext.VariantContext; - -import java.util.*; - -/** - * User: carneiro / lfran - * Date: 3/9/11 - * Time: 12:38 PM - * - * Class for the identification and tracking of mendelian violation. It can be used in 2 distinct ways: - * - Either using an instance of the MendelianViolation class to track mendelian violations for each of the families while - * walking over the variants - * - Or using the static methods to directly get information about mendelian violation in a family at a given locus - * - */ -public class MendelianViolation { - //List of families with violations - private List violationFamilies; - - //Call information - private int nocall = 0; - private int familyCalled = 0; - private int varFamilyCalled = 0; - private int lowQual = 0; - - private boolean allCalledOnly = true; - - //Stores occurrences of inheritance - private EnumMap>> inheritance; - - private int violations_total=0; - - private double minGenotypeQuality; - - private boolean abortOnSampleNotFound; - - //Number of families with genotype information for all members - public int getFamilyCalledCount(){ - return familyCalled; - } - - //Number of families with genotype information for all members - public int getVarFamilyCalledCount(){ - return varFamilyCalled; - } - - //Number of families missing genotypes for one or more of their members - public int getFamilyNoCallCount(){ - return nocall; - } - - //Number of families with genotypes below the set quality threshold - public int 
getFamilyLowQualsCount(){ - return lowQual; - } - - public int getViolationsCount(){ - return violations_total; - } - - //Count of alt alleles inherited from het parents (no violation) - public int getParentHetInheritedVar(){ - return getParentsHetHetInheritedVar() + getParentsRefHetInheritedVar() + getParentsVarHetInheritedVar(); - } - - //Count of ref alleles inherited from het parents (no violation) - public int getParentHetInheritedRef(){ - return getParentsHetHetInheritedRef() + getParentsRefHetInheritedRef() + getParentsVarHetInheritedRef(); - } - - //Count of HomRef/HomRef/HomRef trios - public int getRefRefRef(){ - return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF); - } - - //Count of HomVar/HomVar/HomVar trios - public int getVarVarVar(){ - return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR); - } - - //Count of HomRef/HomVar/Het trios - public int getRefVarHet(){ - return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HET) + - inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HET); - } - - //Count of Het/Het/Het trios - public int getHetHetHet(){ - return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET); - } - - //Count of Het/Het/HomRef trios - public int getHetHetHomRef(){ - return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF); - } - - //Count of Het/Het/HomVar trios - public int getHetHetHomVar(){ - return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR); - } - - //Count of ref alleles inherited from Het/Het parents (no violation) - public int getParentsHetHetInheritedRef(){ - return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET) - + 2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_REF); - //return parentsHetHet_childRef; - } - - //Count of var 
alleles inherited from Het/Het parents (no violation) - public int getParentsHetHetInheritedVar(){ - return inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HET) - + 2*inheritance.get(GenotypeType.HET).get(GenotypeType.HET).get(GenotypeType.HOM_VAR); - //return parentsHetHet_childVar; - } - - //Count of ref alleles inherited from HomRef/Het parents (no violation) - public int getParentsRefHetInheritedRef(){ - return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_REF) - + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF); - //return parentsHomRefHet_childRef; - } - - //Count of var alleles inherited from HomRef/Het parents (no violation) - public int getParentsRefHetInheritedVar(){ - return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HET) - + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HET); - //return parentsHomRefHet_childVar; - } - - //Count of ref alleles inherited from HomVar/Het parents (no violation) - public int getParentsVarHetInheritedRef(){ - return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HET) - + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HET); - //return parentsHomVarHet_childRef; - } - - //Count of var alleles inherited from HomVar/Het parents (no violation) - public int getParentsVarHetInheritedVar(){ - return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_VAR) - + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR); - //return parentsHomVarHet_childVar; - } - - //Count of violations of the type HOM_REF/HOM_REF -> HOM_VAR - public int getParentsRefRefChildVar(){ - return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR); - } - - //Count of violations of the type HOM_REF/HOM_REF -> HET - public int 
getParentsRefRefChildHet(){ - return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF).get(GenotypeType.HET); - } - - //Count of violations of the type HOM_REF/HET -> HOM_VAR - public int getParentsRefHetChildVar(){ - return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HET).get(GenotypeType.HOM_VAR) - + inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR); - } - - //Count of violations of the type HOM_REF/HOM_VAR -> HOM_VAR - public int getParentsRefVarChildVar(){ - return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR) - + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR); - } - - //Count of violations of the type HOM_REF/HOM_VAR -> HOM_REF - public int getParentsRefVarChildRef(){ - return inheritance.get(GenotypeType.HOM_REF).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF) - + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF).get(GenotypeType.HOM_REF); - } - - //Count of violations of the type HOM_VAR/HET -> HOM_REF - public int getParentsVarHetChildRef(){ - return inheritance.get(GenotypeType.HET).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF) - + inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HET).get(GenotypeType.HOM_REF); - } - - //Count of violations of the type HOM_VAR/HOM_VAR -> HOM_REF - public int getParentsVarVarChildRef(){ - return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_REF); - } - - //Count of violations of the type HOM_VAR/HOM_VAR -> HET - public int getParentsVarVarChildHet(){ - return inheritance.get(GenotypeType.HOM_VAR).get(GenotypeType.HOM_VAR).get(GenotypeType.HET); - } - - - //Count of violations of the type HOM_VAR/? -> HOM_REF - public int getParentVarChildRef(){ - return getParentsRefVarChildRef() + getParentsVarHetChildRef() +getParentsVarVarChildRef(); - } - - //Count of violations of the type HOM_REF/? 
-> HOM_VAR - public int getParentRefChildVar(){ - return getParentsRefVarChildVar() + getParentsRefHetChildVar() +getParentsRefRefChildVar(); - } - - //Returns a String containing all trios where a Mendelian violation was observed. - //The String is formatted "mom1+dad1=child1,mom2+dad2=child2,..." - public String getViolationFamiliesString(){ - if(violationFamilies.isEmpty()) - return ""; - - Iterator it = violationFamilies.iterator(); - String violationFams = it.next(); - while(it.hasNext()){ - violationFams += ","+it.next(); - } - return violationFams; - } - - public List getViolationFamilies(){ - return violationFamilies; - } - - static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 }; - static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 }; - - public double getMinGenotypeQuality() { - return minGenotypeQuality; - } - - /** - * Constructor - * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation - * - */ - public MendelianViolation(double minGenotypeQualityP) { - this(minGenotypeQualityP,true); - } - - /** - * Constructor - * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation - * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. - */ - public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound) { - minGenotypeQuality = minGenotypeQualityP; - this.abortOnSampleNotFound = abortOnSampleNotFound; - violationFamilies = new ArrayList(); - createInheritanceMap(); - } - - /** - * Constructor - * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation - * @param abortOnSampleNotFound - Whether to stop execution if a family is passed but no relevant genotypes are found. If false, then the family is ignored. 
- * @param completeTriosOnly - whether only complete trios are considered or parent/child pairs are too. - */ - public MendelianViolation(double minGenotypeQualityP, boolean abortOnSampleNotFound, boolean completeTriosOnly) { - minGenotypeQuality = minGenotypeQualityP; - this.abortOnSampleNotFound = abortOnSampleNotFound; - violationFamilies = new ArrayList(); - createInheritanceMap(); - allCalledOnly = completeTriosOnly; - } - - /** - * @param families the families to be checked for Mendelian violations - * @param vc the variant context to extract the genotypes and alleles for mom, dad and child. - * @return whether or not there is a mendelian violation at the site. - */ - public int countViolations(Map> families, VariantContext vc){ - - //Reset counts - nocall = 0; - lowQual = 0; - familyCalled = 0; - varFamilyCalled = 0; - violations_total=0; - violationFamilies.clear(); - clearInheritanceMap(); - - for(Set family : families.values()){ - Iterator sampleIterator = family.iterator(); - Sample sample; - while(sampleIterator.hasNext()){ - sample = sampleIterator.next(); - if(sample.getParents().size() > 0) - updateViolations(sample.getFamilyID(),sample.getMaternalID(), sample.getPaternalID(), sample.getID() ,vc); - } - } - return violations_total; - } - - public boolean isViolation(Sample mother, Sample father, Sample child, VariantContext vc){ - - //Reset counts - nocall = 0; - lowQual = 0; - familyCalled = 0; - varFamilyCalled = 0; - violations_total=0; - violationFamilies.clear(); - clearInheritanceMap(); - updateViolations(mother.getFamilyID(),mother.getID(),father.getID(),child.getID(),vc); - return violations_total>0; - } - - - private void updateViolations(String familyId, String motherId, String fatherId, String childId, VariantContext vc){ - - int count; - Genotype gMom = vc.getGenotype(motherId); - Genotype gDad = vc.getGenotype(fatherId); - Genotype gChild = vc.getGenotype(childId); - - if (gMom == null || gDad == null || gChild == null){ - 
if(abortOnSampleNotFound) - throw new IllegalArgumentException(String.format("Variant %s:%d: Missing genotypes for family %s: mom=%s dad=%s family=%s", vc.getChr(), vc.getStart(), familyId, motherId, fatherId, childId)); - else - return; - } - //Count No calls - if(allCalledOnly && (!gMom.isCalled() || !gDad.isCalled() || !gChild.isCalled())){ - nocall++; - } - else if (!gMom.isCalled() && !gDad.isCalled() || !gChild.isCalled()){ - nocall++; - } - //Count lowQual. Note that if min quality is set to 0, even values with no quality associated are returned - else if (minGenotypeQuality>0 && (gMom.getPhredScaledQual() < minGenotypeQuality || - gDad.getPhredScaledQual() < minGenotypeQuality || - gChild.getPhredScaledQual() < minGenotypeQuality )) { - lowQual++; - } - else{ - //Count all families per loci called - familyCalled++; - //If the family is all homref, not too interesting - if(!(gMom.isHomRef() && gDad.isHomRef() && gChild.isHomRef())) - { - varFamilyCalled++; - if(isViolation(gMom, gDad, gChild)){ - violationFamilies.add(familyId); - violations_total++; - } - } - count = inheritance.get(gMom.getType()).get(gDad.getType()).get(gChild.getType()); - inheritance.get(gMom.getType()).get(gDad.getType()).put(gChild.getType(),count+1); - - } - } - - /** - * Evaluate the genotypes of mom, dad, and child to detect Mendelian violations - * - * @param gMom - * @param gDad - * @param gChild - * @return true if the three genotypes represent a Mendelian violation; false otherwise - */ - public static boolean isViolation(final Genotype gMom, final Genotype gDad, final Genotype gChild) { - //1 parent is no "call - if(!gMom.isCalled()){ - return (gDad.isHomRef() && gChild.isHomVar()) || (gDad.isHomVar() && gChild.isHomRef()); - } - else if(!gDad.isCalled()){ - return (gMom.isHomRef() && gChild.isHomVar()) || (gMom.isHomVar() && gChild.isHomRef()); - } - //Both parents have genotype information - return !(gMom.getAlleles().contains(gChild.getAlleles().get(0)) && 
gDad.getAlleles().contains(gChild.getAlleles().get(1)) || - gMom.getAlleles().contains(gChild.getAlleles().get(1)) && gDad.getAlleles().contains(gChild.getAlleles().get(0))); - } - - private void createInheritanceMap(){ - - inheritance = new EnumMap>>(GenotypeType.class); - for(GenotypeType mType : GenotypeType.values()){ - inheritance.put(mType, new EnumMap>(GenotypeType.class)); - for(GenotypeType dType : GenotypeType.values()){ - inheritance.get(mType).put(dType, new EnumMap(GenotypeType.class)); - for(GenotypeType cType : GenotypeType.values()){ - inheritance.get(mType).get(dType).put(cType, 0); - } - } - } - - } - - private void clearInheritanceMap(){ - for(GenotypeType mType : GenotypeType.values()){ - for(GenotypeType dType : GenotypeType.values()){ - for(GenotypeType cType : GenotypeType.values()){ - inheritance.get(mType).get(dType).put(cType, 0); - } - } - } - } - - /** - * @return the likelihood ratio for a mendelian violation - */ - public double violationLikelihoodRatio(VariantContext vc, String motherId, String fatherId, String childId) { - double[] logLikAssignments = new double[27]; - // the matrix to set up is - // MOM DAD CHILD - // |- AA - // AA AA | AB - // |- BB - // |- AA - // AA AB | AB - // |- BB - // etc. 
The leaves are counted as 0-11 for MVs and 0-14 for non-MVs - double[] momGL = vc.getGenotype(motherId).getLikelihoods().getAsVector(); - double[] dadGL = vc.getGenotype(fatherId).getLikelihoods().getAsVector(); - double[] childGL = vc.getGenotype(childId).getLikelihoods().getAsVector(); - int offset = 0; - for ( int oMom = 0; oMom < 3; oMom++ ) { - for ( int oDad = 0; oDad < 3; oDad++ ) { - for ( int oChild = 0; oChild < 3; oChild ++ ) { - logLikAssignments[offset++] = momGL[oMom] + dadGL[oDad] + childGL[oChild]; - } - } - } - double[] mvLiks = new double[12]; - double[] nonMVLiks = new double[15]; - for ( int i = 0; i < 12; i ++ ) { - mvLiks[i] = logLikAssignments[mvOffsets[i]]; - } - - for ( int i = 0; i < 15; i++) { - nonMVLiks[i] = logLikAssignments[nonMVOffsets[i]]; - } - - return MathUtils.log10sumLog10(mvLiks) - MathUtils.log10sumLog10(nonMVLiks); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SampleUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SampleUtils.java deleted file mode 100644 index 77fc17083..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SampleUtils.java +++ /dev/null @@ -1,290 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMReadGroupRecord; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.gatk.utils.collections.Pair; -import htsjdk.variant.vcf.VCFHeader; -import org.broadinstitute.gatk.utils.text.ListFileUtils; -import org.broadinstitute.gatk.utils.text.XReadLines; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - - -/** - * SampleUtils is a static class (no instantiation allowed!) with some utility methods for getting samples - * quality scores. - * - * @author ebanks - */ -public class SampleUtils { - /** - * Private constructor. No instantiating this class! 
- */ - private SampleUtils() {} - - /** - * Pull out the samples from a SAMFileHeader; - * note that we use a TreeSet so that they are sorted - * - * @param header the sam file header - * @return list of strings representing the sample names - */ - public static Set getSAMFileSamples(final SAMFileHeader header) { - // get all of the unique sample names - final Set samples = new TreeSet(); - List readGroups = header.getReadGroups(); - for ( SAMReadGroupRecord readGroup : readGroups ) - samples.add(readGroup.getSample()); - return samples; - } - - - /** - * Same as @link getSAMFileSamples but gets all of the samples - * in the SAM files loaded by the engine - * - * @param engine engine - * @return samples - */ - public static Set getSAMFileSamples(GenomeAnalysisEngine engine) { - return SampleUtils.getSAMFileSamples(engine.getSAMFileHeader()); - } - - /** - * Gets all of the unique sample names from all VCF rods input by the user - * - * @param toolkit GATK engine - * - * @return the set of unique samples - */ - public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit) { - return getUniqueSamplesFromRods(toolkit, null); - } - - /** - * Gets all of the unique sample names from the set of provided VCF rod names input by the user - * - * @param toolkit GATK engine - * @param rodNames list of rods to use; if null, uses all VCF rods - * - * @return the set of unique samples - */ - public static Set getUniqueSamplesFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { - Set samples = new LinkedHashSet<>(); - - for ( VCFHeader header : GATKVCFUtils.getVCFHeadersFromRods(toolkit, rodNames).values() ) - samples.addAll(header.getGenotypeSamples()); - - return samples; - } - - public static Set getRodNamesWithVCFHeader(GenomeAnalysisEngine toolkit, Collection rodNames) { - return GATKVCFUtils.getVCFHeadersFromRods(toolkit, rodNames).keySet(); - } - - public static Set getSampleListWithVCFHeader(GenomeAnalysisEngine toolkit, Collection rodNames) { - return 
getSampleList(GATKVCFUtils.getVCFHeadersFromRods(toolkit, rodNames)); - } - - public static Set getSampleList(Map headers) { - return getSampleList(headers, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE); - } - - public static Set getSampleList(Map headers, GATKVariantContextUtils.GenotypeMergeType mergeOption) { - Set samples = new TreeSet(); - for ( Map.Entry val : headers.entrySet() ) { - VCFHeader header = val.getValue(); - for ( String sample : header.getGenotypeSamples() ) { - samples.add(GATKVariantContextUtils.mergedSampleName(val.getKey(), sample, mergeOption == GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY)); - } - } - - return samples; - } - - - /** - * - * @param VCF_Headers - * @return false if there are names duplication between the samples names in the VCF headers - */ - public static boolean verifyUniqueSamplesNames(Map VCF_Headers) { - Set samples = new HashSet(); - for ( Map.Entry val : VCF_Headers.entrySet() ) { - VCFHeader header = val.getValue(); - for ( String sample : header.getGenotypeSamples() ) { - if (samples.contains(sample)){ - - return false; - } - samples.add(sample); - } - } - - return true; - } - - /** - * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap - * (e.g. sampleX.1, sampleX.2, ...) 
- * When finished, samples contains the uniquified sample names and rodNamesToSampleNames contains a mapping - * from rod/sample pairs to the new uniquified names - * - * @param toolkit GATK engine - * @param samples set to store the sample names - * @param rodNamesToSampleNames mapping of rod/sample pairs to new uniquified sample names - */ - public static void getUniquifiedSamplesFromRods(GenomeAnalysisEngine toolkit, Set samples, Map, String> rodNamesToSampleNames) { - - // keep a map of sample name to occurrences encountered - HashMap sampleOverlapMap = new HashMap(); - - // iterate to get all of the sample names - - for ( Map.Entry pair : GATKVCFUtils.getVCFHeadersFromRods(toolkit).entrySet() ) { - for ( String sample : pair.getValue().getGenotypeSamples() ) - addUniqueSample(samples, sampleOverlapMap, rodNamesToSampleNames, sample, pair.getKey()); - } - } - - private static void addUniqueSample(Set samples, Map sampleOverlapMap, Map, String> rodNamesToSampleNames, String newSample, String rodName) { - - // how many occurrences have we seen so far? - Integer occurrences = sampleOverlapMap.get(newSample); - - // if this is the first one, just add it to the list of samples - if ( occurrences == null ) { - samples.add(newSample); - rodNamesToSampleNames.put(new Pair(rodName, newSample), newSample); - sampleOverlapMap.put(newSample, 1); - } - - // if it's already been seen multiple times, give it a unique suffix and increment the value - else if ( occurrences >= 2 ) { - String uniqueName = newSample + "." 
+ rodName; - samples.add(uniqueName); - rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName); - sampleOverlapMap.put(newSample, occurrences + 1); - } - - // if this is the second occurrence of the sample name, uniquify both of them - else { // occurrences == 2 - - // remove the 1st occurrence, uniquify it, and add it back - samples.remove(newSample); - String uniqueName1 = null; - for ( Map.Entry, String> entry : rodNamesToSampleNames.entrySet() ) { - if ( entry.getValue().equals(newSample) ) { - uniqueName1 = newSample + "." + entry.getKey().first; - entry.setValue(uniqueName1); - break; - } - } - samples.add(uniqueName1); - - // add the second one - String uniqueName2 = newSample + "." + rodName; - samples.add(uniqueName2); - rodNamesToSampleNames.put(new Pair(rodName, newSample), uniqueName2); - - sampleOverlapMap.put(newSample, 2); - } - - } - - /** - * Returns a new set of samples, containing a final list of samples expanded from sampleArgs - * - * Each element E of sampleArgs can either be a literal sample name or a file. For each E, - * we try to read a file named E from disk, and if possible all lines from that file are expanded - * into unique sample names. 
- * - * @param sampleArgs args - * @return samples - */ - public static Set getSamplesFromCommandLineInput(Collection sampleArgs) { - if (sampleArgs != null) { - return ListFileUtils.unpackSet(sampleArgs); - } - - return new HashSet(); - } - - public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) { - Set samples = ListFileUtils.unpackSet(vcfSamples); - if (sampleExpressions == null) { - return samples; - } else { - return ListFileUtils.includeMatching(samples, sampleExpressions, false); - } - } - - /** - * Given a collection of samples and a collection of regular expressions, generates the set of samples that match each expression - * @param originalSamples list of samples to select samples from - * @param sampleExpressions list of expressions to use for matching samples - * @return the set of samples from originalSamples that satisfy at least one of the expressions in sampleExpressions - */ - public static Collection matchSamplesExpressions (Collection originalSamples, Collection sampleExpressions) { - // Now, check the expressions that weren't used in the previous step, and use them as if they're regular expressions - Set samples = new HashSet(); - if (sampleExpressions != null) { - samples.addAll(ListFileUtils.includeMatching(originalSamples, sampleExpressions, false)); - } - return samples; - } - - /** - * Given a list of files with sample names it reads all files and creates a list of unique samples from all these files. 
- * @param files list of files with sample names in - * @return a collection of unique samples from all files - */ - public static Collection getSamplesFromFiles (Collection files) { - Set samplesFromFiles = new HashSet(); - if (files != null) { - for (File file : files) { - try { - XReadLines reader = new XReadLines(file); - List lines = reader.readLines(); - for (String line : lines) { - samplesFromFiles.add(line); - } - } catch (FileNotFoundException e) { - throw new UserException.CouldNotReadInputFile(file, e); - } - } - } - return samplesFromFiles; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java deleted file mode 100644 index d869037f8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java +++ /dev/null @@ -1,527 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: Sep 10, 2010 - * Time: 1:56:24 PM - * - * A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference, - * from BAMs, or from RODs -- for consistency. The system supports two basic modes: get an enum state that - * describes at a high level the consistency between two dictionaries, or a validateDictionaries that will - * blow up with a UserException if the dicts are too incompatible. - * - * Dictionaries are tested for contig name overlaps, consistency in ordering in these overlap set, and length, - * if available. Examines the Engine arguments to decided if the -U option to allow danger seq dict inconsistency - * is enabled before it blows up. 
- */ -public class SequenceDictionaryUtils { - // - // for detecting lexicographically sorted human references - // - private static final boolean ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN = true; - - // hg18 - protected static final SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719); - protected static final SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149); - protected static final SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737); - - // hg19 - protected static final SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621); - protected static final SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373); - protected static final SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747); - - // b36 - protected static final SAMSequenceRecord CHR1_B36 = new SAMSequenceRecord("1", 247249719); - protected static final SAMSequenceRecord CHR2_B36 = new SAMSequenceRecord("2", 242951149); - protected static final SAMSequenceRecord CHR10_B36 = new SAMSequenceRecord("10", 135374737); - - // b37 - protected static final SAMSequenceRecord CHR1_B37 = new SAMSequenceRecord("1", 249250621); - protected static final SAMSequenceRecord CHR2_B37 = new SAMSequenceRecord("2", 243199373); - protected static final SAMSequenceRecord CHR10_B37 = new SAMSequenceRecord("10", 135534747); - - - public enum SequenceDictionaryCompatibility { - IDENTICAL, // the dictionaries are identical - COMMON_SUBSET, // there exists a common subset of equivalent contigs - NO_COMMON_CONTIGS, // no overlap between dictionaries - UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths - NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine) - OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different - // orders with respect to each other - 
DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same - // order with respect to each other, but one or more of them have different - // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 } - } - - /** - * @param validationExclusion exclusions to validation - * @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency - */ - private static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) { - return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY || - validationExclusion == ValidationExclusion.TYPE.ALL ); - } - - /** - * Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then - * UserExceptions are thrown with detailed error messages. If the engine is in permissive mode, then - * logger warnings are generated instead. - * - * @param logger for warnings - * @param validationExclusion exclusions to validation - * @param name1 name associated with dict1 - * @param dict1 the sequence dictionary dict1 - * @param name2 name associated with dict2 - * @param dict2 the sequence dictionary dict2 - * @param isReadsToReferenceComparison true if one of the dictionaries comes from a reads data source (eg., a BAM), - * and the other from a reference data source - * @param intervals the user-specified genomic intervals: only required when isReadsToReferenceComparison is true, - * otherwise can be null - */ - public static void validateDictionaries( final Logger logger, - final ValidationExclusion.TYPE validationExclusion, - final String name1, - final SAMSequenceDictionary dict1, - final String name2, - final SAMSequenceDictionary dict2, - final boolean isReadsToReferenceComparison, - final GenomeLocSortedSet intervals ) { - - final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2); - - switch ( type ) { - case 
IDENTICAL: - return; - case COMMON_SUBSET: - return; - case NO_COMMON_CONTIGS: - throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); - - case UNEQUAL_COMMON_CONTIGS: { - List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); - SAMSequenceRecord elt1 = x.get(0); - SAMSequenceRecord elt2 = x.get(1); - - // todo -- replace with toString when SAMSequenceRecord has a nice toString routine - UserException ex = new UserException.IncompatibleSequenceDictionaries(String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d", - name1, elt1.getSequenceName(), elt1.getSequenceLength(), - name2, elt2.getSequenceName(), elt2.getSequenceLength()), - name1, dict1, name2, dict2); - - if ( allowNonFatalIncompabilities(validationExclusion) ) - logger.warn(ex.getMessage()); - else - throw ex; - break; - } - - case NON_CANONICAL_HUMAN_ORDER: { - UserException ex; - if ( nonCanonicalHumanContigOrder(dict1) ) - ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1); - else - ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2); - - if ( allowNonFatalIncompabilities(validationExclusion) ) - logger.warn(ex.getMessage()); - else - throw ex; - break; - } - - case OUT_OF_ORDER: { - UserException ex = new UserException.IncompatibleSequenceDictionaries("Relative ordering of overlapping contigs differs, which is unsafe", name1, dict1, name2, dict2); - if ( allowNonFatalIncompabilities(validationExclusion) ) - logger.warn(ex.getMessage()); - else - throw ex; - break; - } - - case DIFFERENT_INDICES: { - // This is currently only known to be problematic when the index mismatch is between a bam and the - // reference AND when the user's intervals actually include one or more of the contigs that are - // indexed differently from the reference. 
In this case, the engine will fail to correctly serve - // up the reads from those contigs, so throw an exception unless unsafe operations are enabled. - if ( isReadsToReferenceComparison && intervals != null ) { - - final Set misindexedContigs = findMisindexedContigsInIntervals(intervals, dict1, dict2); - - if ( ! misindexedContigs.isEmpty() ) { - final String msg = String.format("The following contigs included in the intervals to process have " + - "different indices in the sequence dictionaries for the reads vs. " + - "the reference: %s. As a result, the GATK engine will not correctly " + - "process reads from these contigs. You should either fix the sequence " + - "dictionaries for your reads so that these contigs have the same indices " + - "as in the sequence dictionary for your reference, or exclude these contigs " + - "from your intervals. This error can be disabled via -U %s, " + - "however this is not recommended as the GATK engine will not behave correctly.", - misindexedContigs, ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY); - final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2); - - if ( allowNonFatalIncompabilities(validationExclusion) ) - logger.warn(ex.getMessage()); - else - throw ex; - } - } - break; - } - - default: - throw new ReviewedGATKException("Unexpected SequenceDictionaryComparison type: " + type); - } - } - - /** - * Workhorse routine that takes two dictionaries and returns their compatibility. 
- * - * @param dict1 first sequence dictionary - * @param dict2 second sequence dictionary - * @return A SequenceDictionaryCompatibility enum value describing the compatibility of the two dictionaries - */ - public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) { - if ( nonCanonicalHumanContigOrder(dict1) || nonCanonicalHumanContigOrder(dict2) ) - return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER; - - final Set commonContigs = getCommonContigsByName(dict1, dict2); - - if (commonContigs.size() == 0) - return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS; - else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) ) - return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS; - else if ( ! commonContigsAreInSameRelativeOrder(commonContigs, dict1, dict2) ) - return SequenceDictionaryCompatibility.OUT_OF_ORDER; - else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() ) - return SequenceDictionaryCompatibility.IDENTICAL; - else if ( ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) ) - return SequenceDictionaryCompatibility.DIFFERENT_INDICES; - else { - return SequenceDictionaryCompatibility.COMMON_SUBSET; - } - } - - /** - * Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalence means - * that the seq records have the same length, if both are non-zero. - * - * @param commonContigs - * @param dict1 - * @param dict2 - * @return true if all of the common contigs are equivalent - */ - private static boolean commonContigsHaveSameLengths(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { - return findDisequalCommonContigs(commonContigs, dict1, dict2) == null; - } - - /** - * Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. 
Returns - * null if all common contigs are equivalent - * - * @param commonContigs - * @param dict1 - * @param dict2 - * @return - */ - private static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { - for ( String name : commonContigs ) { - SAMSequenceRecord elt1 = dict1.getSequence(name); - SAMSequenceRecord elt2 = dict2.getSequence(name); - if ( ! sequenceRecordsAreEquivalent(elt1, elt2) ) - return Arrays.asList(elt1,elt2); - } - - return null; - } - - /** - * Helper routine that returns two sequence records are equivalent, defined as having the same name and - * lengths, if both are non-zero - * - * @param me - * @param that - * @return - */ - private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) { - if (me == that) return true; - if (that == null) return false; - - if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength()) - return false; - - // todo -- reenable if we want to be really strict here -// if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) { -// final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16); -// final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16); -// if (!thisMd5.equals(thatMd5)) { -// return false; -// } -// } -// else { - if (me.getSequenceName() != that.getSequenceName()) - return false; // Compare using == since we intern() the Strings -// } - - return true; - } - - /** - * A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18/hg19) and if it's - * lexicographically sorted. Works by matching lengths of the static chr1, chr10, and chr2, and then if these - * are all matched, requiring that the order be chr1, chr2, chr10. 
- * - * @param dict - * @return - */ - private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { - if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false - return false; - - SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; - - for ( SAMSequenceRecord elt : dict.getSequences() ) { - if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt; - if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt; - if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt; - } - - if ( chr1 != null && chr2 != null && chr10 != null) { - // we found them all - return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() ); - } else { - return false; - } - } - - /** - * Trivial helper that returns true if elt has the same length as rec1 or rec2 - * @param elt record to test - * @param rec1 first record to test for length equivalence - * @param rec2 first record to test for length equivalence - * @return true if elt has the same length as either rec1 or rec2 - */ - private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord rec1, SAMSequenceRecord rec2 ) { - return elt.getSequenceLength() == rec1.getSequenceLength() || elt.getSequenceLength() == rec2.getSequenceLength(); - } - - /** - * Returns true if the common contigs in dict1 and dict2 are in the same relative order, without regard to - * absolute index position. 
This is accomplished by getting the common contigs in both dictionaries, sorting - * these according to their indices, and then walking through the sorted list to ensure that each ordered contig - * is equivalent - * - * @param commonContigs names of the contigs common to both dictionaries - * @param dict1 first SAMSequenceDictionary - * @param dict2 second SAMSequenceDictionary - * @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false - */ - private static boolean commonContigsAreInSameRelativeOrder(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { - List list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1)); - List list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2)); - - for ( int i = 0; i < list1.size(); i++ ) { - SAMSequenceRecord elt1 = list1.get(i); - SAMSequenceRecord elt2 = list2.get(i); - if ( ! elt1.getSequenceName().equals(elt2.getSequenceName()) ) - return false; - } - - return true; - } - - /** - * Gets the subset of SAMSequenceRecords in commonContigs in dict - * - * @param commonContigs - * @param dict - * @return - */ - private static List getSequencesOfName(Set commonContigs, SAMSequenceDictionary dict) { - List l = new ArrayList(commonContigs.size()); - for ( String name : commonContigs ) { - l.add(dict.getSequence(name) ); - } - - return l; - } - - /** - * Compares sequence records by their order - */ - private static class CompareSequenceRecordsByIndex implements Comparator { - public int compare(SAMSequenceRecord x, SAMSequenceRecord y) { - return Integer.valueOf(x.getSequenceIndex()).compareTo(y.getSequenceIndex()); - } - } - - /** - * Returns a sorted list of SAMSequenceRecords sorted by their indices. Note that the - * list is modified in place, so the returned list is == to the unsorted list. 
- * - * @param unsorted - * @return - */ - private static List sortSequenceListByIndex(List unsorted) { - Collections.sort(unsorted, new CompareSequenceRecordsByIndex()); - return unsorted; - } - - /** - * Checks whether the common contigs in the given sequence dictionaries occur at the same indices - * in both dictionaries - * - * @param commonContigs Set of names of the contigs that occur in both dictionaries - * @param dict1 first sequence dictionary - * @param dict2 second sequence dictionary - * @return true if the contigs common to dict1 and dict2 occur at the same indices in both dictionaries, - * otherwise false - */ - private static boolean commonContigsAreAtSameIndices( final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) { - for ( String commonContig : commonContigs ) { - SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); - SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); - - // Each common contig must have the same index in both dictionaries - if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) { - return false; - } - } - - return true; - } - - /** - * Gets the set of names of the contigs found in both sequence dictionaries that have different indices - * in the two dictionaries. - * - * @param commonContigs Set of names of the contigs common to both dictionaries - * @param dict1 first sequence dictionary - * @param dict2 second sequence dictionary - * @return a Set containing the names of the common contigs indexed differently in dict1 vs. 
dict2, - * or an empty Set if there are no such contigs - */ - private static Set getDifferentlyIndexedCommonContigs( final Set commonContigs, - final SAMSequenceDictionary dict1, - final SAMSequenceDictionary dict2 ) { - - final Set differentlyIndexedCommonContigs = new LinkedHashSet(Utils.optimumHashSize(commonContigs.size())); - - for ( String commonContig : commonContigs ) { - if ( dict1.getSequence(commonContig).getSequenceIndex() != dict2.getSequence(commonContig).getSequenceIndex() ) { - differentlyIndexedCommonContigs.add(commonContig); - } - } - - return differentlyIndexedCommonContigs; - } - - /** - * Finds the names of any contigs indexed differently in the two sequence dictionaries that also - * occur in the provided set of intervals. - * - * @param intervals GenomeLocSortedSet containing the intervals to check - * @param dict1 first sequence dictionary - * @param dict2 second sequence dictionary - * @return a Set of the names of the contigs indexed differently in dict1 vs dict2 that also - * occur in the provided intervals, or an empty Set if there are no such contigs - */ - private static Set findMisindexedContigsInIntervals( final GenomeLocSortedSet intervals, - final SAMSequenceDictionary dict1, - final SAMSequenceDictionary dict2 ) { - - final Set differentlyIndexedCommonContigs = getDifferentlyIndexedCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); - final Set misindexedContigsInIntervals = new LinkedHashSet(Utils.optimumHashSize(differentlyIndexedCommonContigs.size())); - - // We know differentlyIndexedCommonContigs is a HashSet, so this loop is O(intervals) - for ( GenomeLoc interval : intervals ) { - if ( differentlyIndexedCommonContigs.contains(interval.getContig()) ) { - misindexedContigsInIntervals.add(interval.getContig()); - } - } - - return misindexedContigsInIntervals; - } - - /** - * Returns the set of contig names found in both dicts. 
- * @param dict1 - * @param dict2 - * @return - */ - public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { - Set intersectingSequenceNames = getContigNames(dict1); - intersectingSequenceNames.retainAll(getContigNames(dict2)); - return intersectingSequenceNames; - } - - public static Set getContigNames(SAMSequenceDictionary dict) { - Set contigNames = new HashSet(Utils.optimumHashSize(dict.size())); - for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) - contigNames.add(dictionaryEntry.getSequenceName()); - return contigNames; - } - - /** - * Returns a compact String representation of the sequence dictionary it's passed - * - * The format of the returned String is: - * [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... ] - * - * @param dict a non-null SAMSequenceDictionary - * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed - */ - public static String getDictionaryAsString( final SAMSequenceDictionary dict ) { - if ( dict == null ) { - throw new IllegalArgumentException("Sequence dictionary must be non-null"); - } - - StringBuilder s = new StringBuilder("[ "); - - for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) { - s.append(dictionaryEntry.getSequenceName()); - s.append("(length:"); - s.append(dictionaryEntry.getSequenceLength()); - s.append(") "); - } - - s.append("]"); - - return s.toString(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Utils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Utils.java deleted file mode 100644 index d664ef689..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Utils.java +++ /dev/null @@ -1,1186 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated 
documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMProgramRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; -import org.broadinstitute.gatk.utils.text.TextFormattingUtils; - -import java.lang.reflect.Array; -import java.math.BigInteger; -import java.net.InetAddress; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: Feb 24, 2009 - * Time: 10:12:31 AM - * To change this template use File | Settings | File Templates. 
- */ -public class Utils { - /** our log, which we want to capture anything from this class */ - private static Logger logger = Logger.getLogger(Utils.class); - - public static final float JAVA_DEFAULT_HASH_LOAD_FACTOR = 0.75f; - - /** - * Boolean xor operation. Only true if x != y. - * - * @param x a boolean - * @param y a boolean - * @return true if x != y - */ - public static boolean xor(final boolean x, final boolean y) { - return x != y; - } - - /** - * Calculates the optimum initial size for a hash table given the maximum number - * of elements it will need to hold. The optimum size is the smallest size that - * is guaranteed not to result in any rehash/table-resize operations. - * - * @param maxElements The maximum number of elements you expect the hash table - * will need to hold - * @return The optimum initial size for the table, given maxElements - */ - public static int optimumHashSize ( int maxElements ) { - return (int)(maxElements / JAVA_DEFAULT_HASH_LOAD_FACTOR) + 2; - } - - /** - * Compares two objects, either of which might be null. - * - * @param lhs One object to compare. - * @param rhs The other object to compare. - * - * @return True if the two objects are equal, false otherwise. 
- */ - public static boolean equals(Object lhs, Object rhs) { - return lhs == null && rhs == null || lhs != null && lhs.equals(rhs); - } - - public static List cons(final T elt, final List l) { - List l2 = new ArrayList(); - l2.add(elt); - if (l != null) l2.addAll(l); - return l2; - } - - public static void warnUser(final String msg) { - warnUser(logger, msg); - } - - public static void warnUser(final Logger logger, final String msg) { - logger.warn(String.format("********************************************************************************")); - logger.warn(String.format("* WARNING:")); - logger.warn(String.format("*")); - prettyPrintWarningMessage(logger, msg); - logger.warn(String.format("********************************************************************************")); - } - - /** - * pretty print the warning message supplied - * - * @param logger logger for the message - * @param message the message - */ - private static void prettyPrintWarningMessage(Logger logger, String message) { - StringBuilder builder = new StringBuilder(message); - while (builder.length() > 70) { - int space = builder.lastIndexOf(" ", 70); - if (space <= 0) space = 70; - logger.warn(String.format("* %s", builder.substring(0, space))); - builder.delete(0, space + 1); - } - logger.warn(String.format("* %s", builder)); - } - - /** - * join the key value pairs of a map into one string, i.e. myMap = [A->1,B->2,C->3] with a call of: - * joinMap("-","*",myMap) -> returns A-1*B-2*C-3 - * - * Be forewarned, if you're not using a map that is aware of the ordering (i.e. HashMap instead of LinkedHashMap) - * the ordering of the string you get back might not be what you expect! (i.e. 
C-3*A-1*B-2 vrs A-1*B-2*C-3) - * - * @param keyValueSeperator the string to seperate the key-value pairs - * @param recordSeperator the string to use to seperate each key-value pair from other key-value pairs - * @param map the map to draw from - * @param the map's key type - * @param the map's value type - * @return a string representing the joined map - */ - public static String joinMap(String keyValueSeperator, String recordSeperator, Map map) { - if (map.size() < 1) { return null; } - String joinedKeyValues[] = new String[map.size()]; - int index = 0; - for (L key : map.keySet()) { - joinedKeyValues[index++] = String.format("%s%s%s",key.toString(),keyValueSeperator,map.get(key).toString()); - } - return join(recordSeperator,joinedKeyValues); - } - - /** - * Splits a String using indexOf instead of regex to speed things up. - * - * @param str the string to split. - * @param delimiter the delimiter used to split the string. - * @return an array of tokens. - */ - public static ArrayList split(String str, String delimiter) { - return split(str, delimiter, 10); - } - - /** - * Splits a String using indexOf instead of regex to speed things up. - * - * @param str the string to split. - * @param delimiter the delimiter used to split the string. - * @param expectedNumTokens The number of tokens expected. This is used to initialize the ArrayList. - * @return an array of tokens. - */ - public static ArrayList split(String str, String delimiter, int expectedNumTokens) { - final ArrayList result = new ArrayList(expectedNumTokens); - - int delimiterIdx = -1; - do { - final int tokenStartIdx = delimiterIdx + 1; - delimiterIdx = str.indexOf(delimiter, tokenStartIdx); - final String token = (delimiterIdx != -1 ? 
str.substring(tokenStartIdx, delimiterIdx) : str.substring(tokenStartIdx) ); - result.add(token); - } while( delimiterIdx != -1 ); - - return result; - } - - - /** - * join an array of strings given a seperator - * @param separator the string to insert between each array element - * @param strings the array of strings - * @return a string, which is the joining of all array values with the separator - */ - public static String join(String separator, String[] strings) { - return join(separator, strings, 0, strings.length); - } - - public static String join(String separator, String[] strings, int start, int end) { - if ((end - start) == 0) { - return ""; - } - StringBuilder ret = new StringBuilder(strings[start]); - for (int i = start + 1; i < end; ++i) { - ret.append(separator); - ret.append(strings[i]); - } - return ret.toString(); - } - - public static String join(String separator, int[] ints) { - if ( ints == null || ints.length == 0) - return ""; - else { - StringBuilder ret = new StringBuilder(); - ret.append(ints[0]); - for (int i = 1; i < ints.length; ++i) { - ret.append(separator); - ret.append(ints[i]); - } - return ret.toString(); - } - } - - /** - * Create a new list that contains the elements of left along with elements elts - * @param left a non-null list of elements - * @param elts a varargs vector for elts to append in order to left - * @return A newly allocated linked list containing left followed by elts - */ - public static List append(final List left, T ... 
elts) { - final List l = new LinkedList(left); - l.addAll(Arrays.asList(elts)); - return l; - } - - /** - * Returns a string of the values in joined by separator, such as A,B,C - * - * @param separator separator character - * @param doubles the array with values - * @return a string with the values separated by the separator - */ - public static String join(String separator, double[] doubles) { - if ( doubles == null || doubles.length == 0) - return ""; - else { - StringBuilder ret = new StringBuilder(); - ret.append(doubles[0]); - for (int i = 1; i < doubles.length; ++i) { - ret.append(separator); - ret.append(doubles[i]); - } - return ret.toString(); - } - } - - /** - * Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of - * elti objects (note there's no actual space between sep and the elti elements). Returns - * "" if collection is empty. If collection contains just elt, then returns elt.toString() - * - * @param separator the string to use to separate objects - * @param objects a collection of objects. the element order is defined by the iterator over objects - * @param the type of the objects - * @return a non-null string - */ - public static String join(final String separator, final Collection objects) { - if (objects.isEmpty()) { // fast path for empty collection - return ""; - } else { - final Iterator iter = objects.iterator(); - final T first = iter.next(); - - if ( ! iter.hasNext() ) // fast path for singleton collections - return first.toString(); - else { // full path for 2+ collection that actually need a join - final StringBuilder ret = new StringBuilder(first.toString()); - while(iter.hasNext()) { - ret.append(separator); - ret.append(iter.next().toString()); - } - return ret.toString(); - } - } - } - - /** - * Returns a {@link List List<Integer>} representation of an primitive int array. - * @param values the primitive int array to represent. - * @return never code {@code null}. 
The returned list will be unmodifiable yet it will reflect changes in values in the original array yet - * you cannot change the values - */ - public static List asList(final int ... values) { - if (values == null) - throw new IllegalArgumentException("the input array cannot be null"); - return new AbstractList() { - - @Override - public Integer get(final int index) { - return values[index]; - } - - @Override - public int size() { - return values.length; - } - }; - } - - /** - * Returns a {@link List List<Double>} representation of an primitive double array. - * @param values the primitive int array to represent. - * @return never code {@code null}. The returned list will be unmodifiable yet it will reflect changes in values in the original array yet - * you cannot change the values. - */ - public static List asList(final double ... values) { - if (values == null) - throw new IllegalArgumentException("the input array cannot be null"); - return new AbstractList() { - - @Override - public Double get(final int index) { - return values[index]; - } - - @Override - public int size() { - return values.length; - } - }; - } - - public static String join(final String separator, final T ... objects) { - return join(separator, Arrays.asList(objects)); - } - - /** - * Create a new string thats a n duplicate copies of s - * @param s the string to duplicate - * @param nCopies how many copies? 
- * @return a string - */ - public static String dupString(final String s, int nCopies) { - if ( s == null || s.equals("") ) throw new IllegalArgumentException("Bad s " + s); - if ( nCopies < 0 ) throw new IllegalArgumentException("nCopies must be >= 0 but got " + nCopies); - - final StringBuilder b = new StringBuilder(); - for ( int i = 0; i < nCopies; i++ ) - b.append(s); - return b.toString(); - } - - public static String dupString(char c, int nCopies) { - char[] chars = new char[nCopies]; - Arrays.fill(chars, c); - return new String(chars); - } - - public static byte[] dupBytes(byte b, int nCopies) { - byte[] bytes = new byte[nCopies]; - Arrays.fill(bytes, b); - return bytes; - } - - // trim a string for the given character (i.e. not just whitespace) - public static String trim(String str, char ch) { - char[] array = str.toCharArray(); - - - int start = 0; - while ( start < array.length && array[start] == ch ) - start++; - - int end = array.length - 1; - while ( end > start && array[end] == ch ) - end--; - - return str.substring(start, end+1); - } - - /** - * Splits expressions in command args by spaces and returns the array of expressions. - * Expressions may use single or double quotes to group any individual expression, but not both. - * @param args Arguments to parse. - * @return Parsed expressions. - */ - public static String[] escapeExpressions(String args) { - // special case for ' and " so we can allow expressions - if (args.indexOf('\'') != -1) - return escapeExpressions(args, "'"); - else if (args.indexOf('\"') != -1) - return escapeExpressions(args, "\""); - else - return args.trim().split(" +"); - } - - /** - * Splits expressions in command args by spaces and the supplied delimiter and returns the array of expressions. - * @param args Arguments to parse. - * @param delimiter Delimiter for grouping expressions. - * @return Parsed expressions. 
- */ - private static String[] escapeExpressions(String args, String delimiter) { - String[] command = {}; - String[] split = args.split(delimiter); - String arg; - for (int i = 0; i < split.length - 1; i += 2) { - arg = split[i].trim(); - if (arg.length() > 0) // if the unescaped arg has a size - command = Utils.concatArrays(command, arg.split(" +")); - command = Utils.concatArrays(command, new String[]{split[i + 1]}); - } - arg = split[split.length - 1].trim(); - if (split.length % 2 == 1) // if the command ends with a delimiter - if (arg.length() > 0) // if the last unescaped arg has a size - command = Utils.concatArrays(command, arg.split(" +")); - return command; - } - - /** - * Concatenates two String arrays. - * @param A First array. - * @param B Second array. - * @return Concatenation of A then B. - */ - public static String[] concatArrays(String[] A, String[] B) { - String[] C = new String[A.length + B.length]; - System.arraycopy(A, 0, C, 0, A.length); - System.arraycopy(B, 0, C, A.length, B.length); - return C; - } - - /** - * Concatenates byte arrays - * @return a concat of all bytes in allBytes in order - */ - public static byte[] concat(final byte[] ... allBytes) { - int size = 0; - for ( final byte[] bytes : allBytes ) size += bytes.length; - - final byte[] c = new byte[size]; - int offset = 0; - for ( final byte[] bytes : allBytes ) { - System.arraycopy(bytes, 0, c, offset, bytes.length); - offset += bytes.length; - } - - return c; - } - - /** - * Appends String(s) B to array A. - * @param A First array. - * @param B Strings to append. - * @return A with B(s) appended. - */ - public static String[] appendArray(String[] A, String... 
B) { - return concatArrays(A, B); - } - - public static > List sorted(Collection c) { - return sorted(c, false); - } - - public static > List sorted(Collection c, boolean reverse) { - List l = new ArrayList(c); - Collections.sort(l); - if ( reverse ) Collections.reverse(l); - return l; - } - - public static , V> List sorted(Map c) { - return sorted(c, false); - } - - public static , V> List sorted(Map c, boolean reverse) { - List t = new ArrayList(c.keySet()); - Collections.sort(t); - if ( reverse ) Collections.reverse(t); - - List l = new ArrayList(); - for ( T k : t ) { - l.add(c.get(k)); - } - return l; - } - - /** - * Reverse a byte array of bases - * - * @param bases the byte array of bases - * @return the reverse of the base byte array - */ - static public byte[] reverse(byte[] bases) { - byte[] rcbases = new byte[bases.length]; - - for (int i = 0; i < bases.length; i++) { - rcbases[i] = bases[bases.length - i - 1]; - } - - return rcbases; - } - - static public List reverse(final List l) { - final List newL = new ArrayList(l); - Collections.reverse(newL); - return newL; - } - - /** - * Reverse an int array of bases - * - * @param bases the int array of bases - * @return the reverse of the base int array - */ - static public int[] reverse(int[] bases) { - int[] rcbases = new int[bases.length]; - - for (int i = 0; i < bases.length; i++) { - rcbases[i] = bases[bases.length - i - 1]; - } - - return rcbases; - } - - /** - * Reverse (NOT reverse-complement!!) a string - * - * @param bases input string - * @return the reversed string - */ - static public String reverse(String bases) { - return new String( reverse( bases.getBytes() )) ; - } - - public static boolean isFlagSet(int value, int flag) { - return ((value & flag) == flag); - } - - /** - * Helper utility that calls into the InetAddress system to resolve the hostname. If this fails, - * unresolvable gets returned instead. 
- */ - public static String resolveHostname() { - try { - return InetAddress.getLocalHost().getCanonicalHostName(); - } - catch (java.net.UnknownHostException uhe) { // [beware typo in code sample -dmw] - return "unresolvable"; - // handle exception - } - } - - - public static byte [] arrayFromArrayWithLength(byte[] array, int length) { - byte [] output = new byte[length]; - for (int j = 0; j < length; j++) - output[j] = array[(j % array.length)]; - return output; - } - - public static void fillArrayWithByte(byte[] array, byte value) { - for (int i=0; i oldRecords = header.getProgramRecords(); - final List newRecords = new ArrayList(oldRecords.size()+1); - for ( SAMProgramRecord record : oldRecords ) - if ( (programRecord != null && !record.getId().startsWith(programRecord.getId()))) - newRecords.add(record); - - if (programRecord != null) { - newRecords.add(programRecord); - header.setProgramRecords(newRecords); - } - return header; - } - - /** - * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and returns - * the new header to be added to the BAM writer. - * - * @param toolkit the engine - * @param walker the walker object (so we can extract the command line) - * @param PROGRAM_RECORD_NAME the name for the PG tag - * @return a pre-filled header for the bam writer - */ - public static SAMFileHeader setupWriter(final GenomeAnalysisEngine toolkit, final SAMFileHeader originalHeader, final Object walker, final String PROGRAM_RECORD_NAME) { - final SAMProgramRecord programRecord = createProgramRecord(toolkit, walker, PROGRAM_RECORD_NAME); - return setupWriter(originalHeader, programRecord); - } - - /** - * Creates a program record for the program, adds it to the list of program records (@PG tags) in the bam file and sets - * up the writer with the header and presorted status. 
- * - * @param writer BAM file writer - * @param toolkit the engine - * @param preSorted whether or not the writer can assume reads are going to be added are already sorted - * @param walker the walker object (so we can extract the command line) - * @param PROGRAM_RECORD_NAME the name for the PG tag - */ - public static void setupWriter(GATKSAMFileWriter writer, GenomeAnalysisEngine toolkit, SAMFileHeader originalHeader, boolean preSorted, Object walker, String PROGRAM_RECORD_NAME) { - SAMFileHeader header = setupWriter(toolkit, originalHeader, walker, PROGRAM_RECORD_NAME); - writer.writeHeader(header); - writer.setPresorted(preSorted); - } - - - /** - * Creates a program record (@PG) tag - * - * @param toolkit the engine - * @param walker the walker object (so we can extract the command line) - * @param PROGRAM_RECORD_NAME the name for the PG tag - * @return a program record for the tool - */ - public static SAMProgramRecord createProgramRecord(GenomeAnalysisEngine toolkit, Object walker, String PROGRAM_RECORD_NAME) { - final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); - final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("GATKText"); - try { - final String version = headerInfo.getString("org.broadinstitute.gatk.tools.version"); - programRecord.setProgramVersion(version); - } catch (MissingResourceException e) { - // couldn't care less if the resource is missing... - } - programRecord.setCommandLine(toolkit.createApproximateCommandLineArgumentString(toolkit, walker)); - return programRecord; - } - - /** - * Returns the number of combinations represented by this collection - * of collection of options. 
- * - * For example, if this is [[A, B], [C, D], [E, F, G]] returns 2 * 2 * 3 = 12 - */ - @Requires("options != null") - public static int nCombinations(final Collection[] options) { - int nStates = 1; - for ( Collection states : options ) { - nStates *= states.size(); - } - return nStates; - } - - @Requires("options != null") - public static int nCombinations(final List> options) { - if ( options.isEmpty() ) - return 0; - else { - int nStates = 1; - for ( Collection states : options ) { - nStates *= states.size(); - } - return nStates; - } - } - - /** - * Make all combinations of N size of objects - * - * if objects = [A, B, C] - * if N = 1 => [[A], [B], [C]] - * if N = 2 => [[A, A], [B, A], [C, A], [A, B], [B, B], [C, B], [A, C], [B, C], [C, C]] - * - * @param objects list of objects - * @param n size of each combination - * @param withReplacement if false, the resulting permutations will only contain unique objects from objects - * @return a list with all combinations with size n of objects. - */ - public static List> makePermutations(final List objects, final int n, final boolean withReplacement) { - final List> combinations = new ArrayList>(); - - if ( n == 1 ) { - for ( final T o : objects ) - combinations.add(Collections.singletonList(o)); - } else if (n > 1) { - final List> sub = makePermutations(objects, n - 1, withReplacement); - for ( List subI : sub ) { - for ( final T a : objects ) { - if ( withReplacement || ! 
subI.contains(a) ) - combinations.add(Utils.cons(a, subI)); - } - } - } - - return combinations; - } - - /** - * Convenience function that formats the novelty rate as a %.2f string - * - * @param known number of variants from all that are known - * @param all number of all variants - * @return a String novelty rate, or NA if all == 0 - */ - public static String formattedNoveltyRate(final int known, final int all) { - return formattedPercent(all - known, all); - } - - /** - * Convenience function that formats the novelty rate as a %.2f string - * - * @param x number of objects part of total that meet some criteria - * @param total count of all objects, including x - * @return a String percent rate, or NA if total == 0 - */ - public static String formattedPercent(final long x, final long total) { - return total == 0 ? "NA" : String.format("%.2f", (100.0*x) / total); - } - - /** - * Convenience function that formats a ratio as a %.2f string - * - * @param num number of observations in the numerator - * @param denom number of observations in the denumerator - * @return a String formatted ratio, or NA if all == 0 - */ - public static String formattedRatio(final long num, final long denom) { - return denom == 0 ? "NA" : String.format("%.2f", num / (1.0 * denom)); - } - - /** - * Adds element from an array into a collection. - * - * In the event of exception being throw due to some element, dest might have been modified by - * the successful addition of element before that one. - * - * @param dest the destination collection which cannot be null and should be able to accept - * the input elements. - * @param elements the element to add to dest - * @param collection type element. - * @throws UnsupportedOperationException if the add operation - * is not supported by dest. - * @throws ClassCastException if the class of any of the elements - * prevents it from being added to dest. 
- * @throws NullPointerException if any of the elements is null and dest - * does not permit null elements - * @throws IllegalArgumentException if some property of any of the elements - * prevents it from being added to this collection - * @throws IllegalStateException if any of the elements cannot be added at this - * time due to insertion restrictions. - * @return true if the collection was modified as a result. - */ - public static boolean addAll(Collection dest, T ... elements) { - boolean result = false; - for (final T e : elements) { - result = dest.add(e) | result; - } - return result; - } - - /** - * Create a constant map that maps each value in values to itself - */ - public static Map makeIdentityFunctionMap(Collection values) { - Map map = new HashMap(values.size()); - for ( final T value : values ) - map.put(value, value); - return Collections.unmodifiableMap(map); - } - - /** - * Divides the input list into a list of sublists, which contains group size elements (except potentially the last one) - * - * list = [A, B, C, D, E] - * groupSize = 2 - * result = [[A, B], [C, D], [E]] - * - */ - public static List> groupList(final List list, final int groupSize) { - if ( groupSize < 1 ) throw new IllegalArgumentException("groupSize >= 1"); - - final List> subLists = new LinkedList>(); - int n = list.size(); - for ( int i = 0; i < n; i += groupSize ) { - subLists.add(list.subList(i, Math.min(i + groupSize, n))); - } - return subLists; - } - - /** - * @see #calcMD5(byte[]) - */ - public static String calcMD5(final String s) { - return calcMD5(s.getBytes()); - } - - /** - * Calculate the md5 for bytes, and return the result as a 32 character string - * - * @param bytes the bytes to calculate the md5 of - * @return the md5 of bytes, as a 32-character long string - */ - @Ensures({"result != null", "result.length() == 32"}) - public static String calcMD5(final byte[] bytes) { - if ( bytes == null ) throw new IllegalArgumentException("bytes cannot be null"); - try { 
- final byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytes); - final BigInteger bigInt = new BigInteger(1, thedigest); - - String md5String = bigInt.toString(16); - while (md5String.length() < 32) md5String = "0" + md5String; // pad to length 32 - return md5String; - } - catch ( NoSuchAlgorithmException e ) { - throw new IllegalStateException("MD5 digest algorithm not present"); - } - } - - /** - * Does big end with the exact sequence of bytes in suffix? - * - * @param big a non-null byte[] to test if it a prefix + suffix - * @param suffix a non-null byte[] to test if it's a suffix of big - * @return true if big is proper byte[] composed of some prefix + suffix - */ - public static boolean endsWith(final byte[] big, final byte[] suffix) { - if ( big == null ) throw new IllegalArgumentException("big cannot be null"); - if ( suffix == null ) throw new IllegalArgumentException("suffix cannot be null"); - return new String(big).endsWith(new String(suffix)); - } - - /** - * Get the length of the longest common prefix of seq1 and seq2 - * @param seq1 non-null byte array - * @param seq2 non-null byte array - * @param maxLength the maximum allowed length to return - * @return the length of the longest common prefix of seq1 and seq2, >= 0 - */ - public static int longestCommonPrefix(final byte[] seq1, final byte[] seq2, final int maxLength) { - if ( seq1 == null ) throw new IllegalArgumentException("seq1 is null"); - if ( seq2 == null ) throw new IllegalArgumentException("seq2 is null"); - if ( maxLength < 0 ) throw new IllegalArgumentException("maxLength < 0 " + maxLength); - - final int end = Math.min(seq1.length, Math.min(seq2.length, maxLength)); - for ( int i = 0; i < end; i++ ) { - if ( seq1[i] != seq2[i] ) - return i; - } - return end; - } - - /** - * Get the length of the longest common suffix of seq1 and seq2 - * @param seq1 non-null byte array - * @param seq2 non-null byte array - * @param maxLength the maximum allowed length to return - * @return 
the length of the longest common suffix of seq1 and seq2, >= 0 - */ - public static int longestCommonSuffix(final byte[] seq1, final byte[] seq2, final int maxLength) { - if ( seq1 == null ) throw new IllegalArgumentException("seq1 is null"); - if ( seq2 == null ) throw new IllegalArgumentException("seq2 is null"); - if ( maxLength < 0 ) throw new IllegalArgumentException("maxLength < 0 " + maxLength); - - final int end = Math.min(seq1.length, Math.min(seq2.length, maxLength)); - for ( int i = 0; i < end; i++ ) { - if ( seq1[seq1.length - i - 1] != seq2[seq2.length - i - 1] ) - return i; - } - return end; - } - - /** - * Trim any number of bases from the front and/or back of an array - * - * @param seq the sequence to trim - * @param trimFromFront how much to trim from the front - * @param trimFromBack how much to trim from the back - * @return a non-null array; can be the original array (i.e. not a copy) - */ - public static byte[] trimArray(final byte[] seq, final int trimFromFront, final int trimFromBack) { - if ( trimFromFront + trimFromBack > seq.length ) - throw new IllegalArgumentException("trimming total is larger than the original array"); - - // don't perform array copies if we need to copy everything anyways - return ( trimFromFront == 0 && trimFromBack == 0 ) ? seq : Arrays.copyOfRange(seq, trimFromFront, seq.length - trimFromBack); - } - - /** - * Simple wrapper for sticking elements of a int[] array into a List - * @param ar - the array whose elements should be listified - * @return - a List where each element has the same value as the corresponding index in @ar - */ - public static List listFromPrimitives(final int[] ar) { - final ArrayList lst = new ArrayList<>(ar.length); - for ( final int d : ar ) { - lst.add(d); - } - - return lst; - } - - /** - * Compares sections from to byte arrays to verify whether they contain the same values. - * - * @param left first array to compare. - * @param leftOffset first position of the first array to compare. 
- * @param right second array to compare. - * @param rightOffset first position of the second array to compare. - * @param length number of positions to compare. - * - * @throws IllegalArgumentException if

    - *
  • either {@code left} or {@code right} is {@code null} or
  • - *
  • any off the offset or length combine point outside any of the two arrays
  • - *
- * @return {@code true} iff {@code length} is 0 or all the bytes in both ranges are the same two-by-two. - */ - public static boolean equalRange(final byte[] left, final int leftOffset, byte[] right, final int rightOffset, final int length) { - if (left == null) throw new IllegalArgumentException("left cannot be null"); - if (right == null) throw new IllegalArgumentException("right cannot be null"); - if (length < 0) throw new IllegalArgumentException("the length cannot be negative"); - if (leftOffset < 0) throw new IllegalArgumentException("left offset cannot be negative"); - if (leftOffset + length > left.length) throw new IllegalArgumentException("length goes beyond end of left array"); - if (rightOffset < 0) throw new IllegalArgumentException("right offset cannot be negative"); - if (rightOffset + length > right.length) throw new IllegalArgumentException("length goes beyond end of right array"); - - for (int i = 0; i < length; i++) - if (left[leftOffset + i] != right[rightOffset + i]) - return false; - return true; - } - - /** - * Skims out positions of an array returning a shorter one with the remaning positions in the same order. - * @param original the original array to splice. - * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), - * or retained ({@code false}) - * - * @param the array type. - * - * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, - * or {@code remove length is different to {@code original}'s}, or {@code original} is not in - * fact an array. - * - * @return never {@code null}. - */ - public static T skimArray(final T original, final boolean[] remove) { - return skimArray(original,0,null,0,remove,0); - } - - /** - * Skims out positions of an array returning a shorter one with the remaning positions in the same order. - * - *

- * If the {@code dest} array provide is not long enough a new one will be created and returned with the - * same component type. All elements before {@code destOffset} will be copied from the input to the - * result array. If {@code dest} is {@code null}, a brand-new array large enough will be created where - * the position preceding {@code destOffset} will be left with the default value. The component type - * Will match the one of the {@code source} array. - *

- * - * @param source the original array to splice. - * @param sourceOffset the first position to skim. - * @param dest the destination array. - * @param destOffset the first position where to copy the skimed array values. - * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), - * or retained ({@code false}) - * @param removeOffset the first position in the remove index array to consider. - * - * @param the array type. - * - * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, - * or {@code remove length is different to {@code original}'s}, or {@code original} is not in - * fact an array. - * - * @return never {@code null}. - */ - public static T skimArray(final T source, final int sourceOffset, final T dest, final int destOffset, final boolean[] remove, final int removeOffset) { - if (source == null) - throw new IllegalArgumentException("the source array cannot be null"); - @SuppressWarnings("unchecked") - final Class sourceClazz = (Class) source.getClass(); - - if (!sourceClazz.isArray()) - throw new IllegalArgumentException("the source array is not in fact an array instance"); - final int length = Array.getLength(source) - sourceOffset; - if (length < 0) - throw new IllegalArgumentException("the source offset goes beyond the source array length"); - return skimArray(source,sourceOffset,dest,destOffset,remove,removeOffset,length); - } - - /** - * Skims out positions of an array returning a shorter one with the remaning positions in the same order. - * - *

- * If the {@code dest} array provide is not long enough a new one will be created and returned with the - * same component type. All elements before {@code destOffset} will be copied from the input to the - * result array. If {@code dest} is {@code null}, a brand-new array large enough will be created where - * the position preceding {@code destOffset} will be left with the default value. The component type - * Will match the one of the {@code source} array. - *

- * - * @param source the original array to splice. - * @param sourceOffset the first position to skim. - * @param dest the destination array. - * @param destOffset the first position where to copy the skimed array values. - * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), - * or retained ({@code false}) - * @param removeOffset the first position in the remove index array to consider. - * @param length the total number of position in {@code source} to consider. Thus only the {@code sourceOffset} to - * {@code sourceOffset + length - 1} region will be skimmed. - * - * @param the array type. - * - * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, - * or {@code remove length is different to {@code original}'s}, or {@code original} is not in - * fact an array. - * - * @return never {@code null}. - */ - public static T skimArray(final T source, final int sourceOffset, final T dest, final int destOffset, - final boolean[] remove, final int removeOffset, final int length) { - if (source == null) - throw new IllegalArgumentException("the source array cannot be null"); - if (remove == null) - throw new IllegalArgumentException("the remove array cannot be null"); - if (sourceOffset < 0) - throw new IllegalArgumentException("the source array offset cannot be negative"); - if (destOffset < 0) - throw new IllegalArgumentException("the destination array offset cannot be negative"); - if (removeOffset < 0) - throw new IllegalArgumentException("the remove array offset cannot be negative"); - if (length < 0) - throw new IllegalArgumentException("the length provided cannot be negative"); - - final int removeLength = Math.min(remove.length - removeOffset,length); - - if (removeLength < 0) - throw new IllegalArgumentException("the remove offset provided falls beyond the remove array end"); - - - @SuppressWarnings("unchecked") - final Class sourceClazz = (Class) 
source.getClass(); - - if (!sourceClazz.isArray()) - throw new IllegalArgumentException("the source array is not in fact an array instance"); - - final Class destClazz = skimArrayDetermineDestArrayClass(dest, sourceClazz); - - final int sourceLength = Array.getLength(source); - - if (sourceLength < length + sourceOffset) - throw new IllegalArgumentException("the source array is too small considering length and offset"); - - // count how many positions are to be removed. - - int removeCount = 0; - - final int removeEnd = removeLength + removeOffset; - for (int i = removeOffset; i < removeEnd; i++) - if (remove[i]) removeCount++; - - - final int newLength = length - removeCount; - - - @SuppressWarnings("unchecked") - final T result = skimArrayBuildResultArray(dest, destOffset, destClazz, newLength); - // No removals, just copy the whole thing. - - if (removeCount == 0) - System.arraycopy(source,sourceOffset,result,destOffset,length); - else if (length > 0) { // if length == 0 nothing to do. - int nextOriginalIndex = 0; - int nextNewIndex = 0; - int nextRemoveIndex = removeOffset; - while (nextOriginalIndex < length && nextNewIndex < newLength) { - while (nextRemoveIndex < removeEnd && remove[nextRemoveIndex++]) { nextOriginalIndex++; } // skip positions to be spliced. - // Since we make the nextNewIndex < newLength check in the while condition - // there is no need to include the following break, as is guaranteed not to be true: - // if (nextOriginalIndex >= length) break; // we reach the final (last positions are to be spliced. 
- final int copyStart = nextOriginalIndex; - while (++nextOriginalIndex < length && (nextRemoveIndex >= removeEnd || !remove[nextRemoveIndex])) { nextRemoveIndex++; } - final int copyEnd = nextOriginalIndex; - final int copyLength = copyEnd - copyStart; - System.arraycopy(source, sourceOffset + copyStart, result, destOffset + nextNewIndex, copyLength); - nextNewIndex += copyLength; - } - } - return result; - } - - private static T skimArrayBuildResultArray(final T dest, final int destOffset, final Class destClazz, final int newLength) { - @SuppressWarnings("unchecked") - final T result; - - if (dest == null) - result = (T) Array.newInstance(destClazz.getComponentType(), newLength + destOffset); - else if (Array.getLength(dest) < newLength + destOffset) { - result = (T) Array.newInstance(destClazz.getComponentType(),newLength + destOffset); - if (destOffset > 0) System.arraycopy(dest,0,result,0,destOffset); - } else - result = dest; - return result; - } - - private static Class skimArrayDetermineDestArrayClass(final T dest, Class sourceClazz) { - final Class destClazz; - if (dest == null) - destClazz = sourceClazz; - else { - destClazz = (Class) dest.getClass(); - if (destClazz != sourceClazz) { - if (!destClazz.isArray()) - throw new IllegalArgumentException("the destination array class must be an array"); - if (sourceClazz.getComponentType().isAssignableFrom(destClazz.getComponentType())) - throw new IllegalArgumentException("the provided destination array class cannot contain values from the source due to type incompatibility"); - } - } - return destClazz; - } - - /** - * Makes a deep clone of the array provided. - * - *

- * When you can use {@link Arrays#copyOf} or an array {@link Object#clone()} to create a copy of itself, - * if it is multi-dimentional each sub array or matrix would be cloned. - *

- * - *

- * Notice however that if the base type is an Object type, the base elements themselves wont be cloned. - *

- * - * @param array the array to deep-clone. - * @param type of the array. - * - * @throws IllegalArgumentException if {@code array} is {@code null} or is not an array. - */ - public static T deepCloneArray(final T array) { - - if (array == null) - throw new IllegalArgumentException(""); - - @SuppressWarnings("unchecked") - final Class clazz = (Class) array.getClass(); - - - if (!clazz.isArray()) - throw new IllegalArgumentException("the input is not an array"); - - final int dimension = calculateArrayDimensions(clazz); - - return deepCloneArrayUnchecked(array,clazz, dimension); - } - - private static int calculateArrayDimensions(final Class clazz) { - if (clazz.isArray()) - return calculateArrayDimensions(clazz.getComponentType()) + 1; - else - return 0; - } - - private static T deepCloneArrayUnchecked(final T array, final Class clazz, final int dimension) { - - - final int length = Array.getLength(array); - - final Class componentClass = clazz.getComponentType(); - - final T result = (T) Array.newInstance(componentClass,length); - - if (dimension <= 1) { - System.arraycopy(array, 0, result, 0, length); - return result; - } - - - final int dimensionMinus1 = dimension - 1; - - for (int i = 0; i < length; i++) - Array.set(result,i,deepCloneArrayUnchecked(Array.get(array,i),componentClass,dimensionMinus1)); - - return result; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/baq/BAQReadTransformer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/baq/BAQReadTransformer.java deleted file mode 100644 index c9192e12f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/baq/BAQReadTransformer.java +++ /dev/null @@ -1,74 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* 
restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.baq; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.WalkerManager; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.walkers.BAQMode; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -/** - * Applies Heng's BAQ calculation to a stream of incoming reads - */ -public class BAQReadTransformer extends ReadTransformer { - private BAQ baqHMM; - private IndexedFastaSequenceFile refReader; - private BAQ.CalculationMode cmode; - private BAQ.QualityMode qmode; - - @Override - public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { - final BAQMode mode = WalkerManager.getWalkerAnnotation(walker, BAQMode.class); - this.refReader = 
engine.getReferenceDataSource().getReference(); - this.cmode = engine.getArguments().BAQMode; - this.qmode = mode.QualityMode(); - baqHMM = new BAQ(engine.getArguments().BAQGOP); - - if ( qmode == BAQ.QualityMode.DONT_MODIFY ) - throw new ReviewedGATKException("BUG: shouldn't create BAQ transformer with quality mode DONT_MODIFY"); - - if ( mode.ApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN && enabled() ) - throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + cmode + " was requested."); - - return mode.ApplicationTime(); - } - - @Override - public boolean enabled() { - return cmode != BAQ.CalculationMode.OFF; - } - - @Override - public GATKSAMRecord apply(final GATKSAMRecord read) { - baqHMM.baqRead(read, refReader, cmode, qmode); - return read; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/baq/ReadTransformingIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/baq/ReadTransformingIterator.java deleted file mode 100644 index 18ca02f61..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/baq/ReadTransformingIterator.java +++ /dev/null @@ -1,69 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.baq; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.Iterator; - -/** - * Iterator that applies a ReadTransformer to a stream of reads - */ -public class ReadTransformingIterator implements GATKSAMIterator { - private final GATKSAMIterator it; - private final ReadTransformer transformer; - - /** - * Creates a new ReadTransforming iterator - */ - @Requires({"it != null", "transformer != null", "transformer.isInitialized()"}) - public ReadTransformingIterator(final GATKSAMIterator it, final ReadTransformer transformer) { - if ( ! 
transformer.isInitialized() ) - throw new IllegalStateException("Creating a read transformer stream for an uninitialized read transformer: " + transformer); - if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN ) - throw new IllegalStateException("Creating a read transformer stream for a forbidden transformer " + transformer); - - this.it = it; - this.transformer = transformer; - } - - @Requires("hasNext()") - @Ensures("result != null") - public SAMRecord next() { - final GATKSAMRecord read = (GATKSAMRecord)it.next(); - return transformer.apply(read); - } - - public boolean hasNext() { return this.it.hasNext(); } - public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - public void close() { it.close(); } - public Iterator iterator() { return this; } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/classloader/PluginManager.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/classloader/PluginManager.java deleted file mode 100644 index 7313e19e6..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/classloader/PluginManager.java +++ /dev/null @@ -1,355 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.classloader; - -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.WalkerManager; -import org.broadinstitute.gatk.engine.filters.FilterManager; -import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.reflections.Reflections; -import org.reflections.scanners.SubTypesScanner; -import org.reflections.util.ConfigurationBuilder; - -import java.io.File; -import java.lang.reflect.Constructor; -import java.lang.reflect.Method; -import java.net.MalformedURLException; -import java.net.URL; -import java.net.URLClassLoader; -import java.util.*; - -/** - * Manage plugins and plugin configuration. - * @author mhanna - * @version 0.1 - */ -public class PluginManager { - /** - * A reference into our introspection utility. - */ - private static final Reflections defaultReflections; - - static { - // turn off logging in the reflections library - they talk too much - Reflections.log = null; - - Set classPathUrls = new LinkedHashSet(); - - URL cwd; - try { - cwd = new File(".").getAbsoluteFile().toURI().toURL(); - } catch (MalformedURLException e) { - throw new RuntimeException(e); - } - - // NOTE: Reflections also scans directories for classes. - // Meanwhile some of the jar MANIFEST.MF Bundle-ClassPath properties contain "." 
- // Do NOT let reflections scan the CWD where it often picks up test classes when - // they weren't explicitly in the classpath, for example the UninstantiableWalker - for (URL url: JVMUtils.getClasspathURLs()) - if (!url.equals(cwd)) - classPathUrls.add(url); - - defaultReflections = new Reflections( new ConfigurationBuilder() - .setUrls(classPathUrls) - .setScanners(new SubTypesScanner())); - } - - /** - * Defines the category of plugin defined by the subclass. - */ - protected final String pluginCategory; - - /** - * Define common strings to trim off the end of the name. - */ - protected final String pluginSuffix; - - /** - * Plugins stored based on their name. - */ - private final SortedMap> pluginsByName; - - private final List> plugins; - private final List> interfaces; - - /** - * Create a new plugin manager. - * @param pluginType Core type for a plugin. - */ - public PluginManager(Class pluginType) { - this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), null); - } - - /** - * Create a new plugin manager. - * @param pluginType Core type for a plugin. - * @param classpath Custom class path to search for classes. - */ - public PluginManager(Class pluginType, List classpath) { - this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), classpath); - } - - /** - * Create a new plugin manager. - * @param pluginType Core type for a plugin. - * @param pluginCategory Provides a category name to the plugin. Must not be null. - * @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null. - */ - public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix) { - this(pluginType, pluginCategory, pluginSuffix, null); - } - - /** - * Create a new plugin manager. - * @param pluginType Core type for a plugin. - * @param pluginCategory Provides a category name to the plugin. Must not be null. 
- * @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null. - * @param classpath Custom class path to search for classes. - */ - public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix, List classpath) { - this.pluginCategory = pluginCategory; - this.pluginSuffix = pluginSuffix; - - this.plugins = new ArrayList>(); - this.interfaces = new ArrayList>(); - - Reflections reflections; - if (classpath == null) { - reflections = defaultReflections; - } else { - addClasspath(classpath); - reflections = new Reflections( new ConfigurationBuilder() - .setUrls(classpath) - .setScanners(new SubTypesScanner())); - } - - // Load all classes types filtering them by concrete. - @SuppressWarnings("unchecked") - Set> allTypes = reflections.getSubTypesOf(pluginType); - for( Class type: allTypes ) { - // The plugin manager does not support anonymous classes; to be a plugin, a class must have a name. - if(JVMUtils.isAnonymous(type)) - continue; - - if( JVMUtils.isConcrete(type) ) - plugins.add(type); - else - interfaces.add(type); - } - - pluginsByName = new TreeMap>(); - for (Class pluginClass : plugins) { - String pluginName = getName(pluginClass); - pluginsByName.put(pluginName, pluginClass); - } - - // sort the plugins so the order of elements is deterministic - sortPlugins(plugins); - sortPlugins(interfaces); - } - - /** - * Sorts, in place, the list of plugins according to getName() on each element - * - * @param unsortedPlugins unsorted plugins - */ - private void sortPlugins(final List> unsortedPlugins) { - Collections.sort(unsortedPlugins, new ComparePluginsByName()); - } - - private final class ComparePluginsByName implements Comparator> { - @Override - public int compare(final Class aClass, final Class aClass1) { - String pluginName1 = getName(aClass); - String pluginName2 = getName(aClass1); - return pluginName1.compareTo(pluginName2); - } - } - - /** - * Adds the URL to the system class loader 
classpath using reflection. - * HACK: Uses reflection to modify the class path, and assumes loader is a URLClassLoader. - * @param urls URLs to add to the system class loader classpath. - */ - private static void addClasspath(List urls) { - Set existing = JVMUtils.getClasspathURLs(); - for (URL url : urls) { - if (existing.contains(url)) - continue; - try { - Method method = URLClassLoader.class.getDeclaredMethod("addURL", URL.class); - if (!method.isAccessible()) - method.setAccessible(true); - method.invoke(ClassLoader.getSystemClassLoader(), url); - } catch (Exception e) { - throw new ReviewedGATKException("Error adding url to the current classloader.", e); - } - } - } - - public Map> getPluginsByName() { - return Collections.unmodifiableMap(pluginsByName); - } - - /** - * Does a plugin with the given name exist? - * - * @param pluginName Name of the plugin for which to search. - * @return True if the plugin exists, false otherwise. - */ - public boolean exists(String pluginName) { - return pluginsByName.containsKey(pluginName); - } - - /** - * Does a plugin with the given name exist? - * - * @param plugin Name of the plugin for which to search. - * @return True if the plugin exists, false otherwise. 
- */ - public boolean exists(Class plugin) { - return pluginsByName.containsValue(plugin); - } - - /** - * Returns the plugin classes - * @return the plugin classes - */ - public List> getPlugins() { - return plugins; - } - - /** - * Returns the interface classes - * @return the interface classes - */ - public List> getInterfaces() { - return interfaces; - } - - /** - * Returns the plugin classes implementing interface or base clase - * @param type type of interface or base class - * @return the plugin classes implementing interface or base class - */ - public List> getPluginsImplementing(Class type) { - List> implementing = new ArrayList>(); - for (Class plugin: getPlugins()) - if (type.isAssignableFrom(plugin)) - implementing.add(plugin); - return implementing; - } - - - - /** - * Gets a plugin with the given name - * - * @param pluginName Name of the plugin to retrieve. - * @return The plugin object if found; null otherwise. - */ - public PluginType createByName(String pluginName) { - Class plugin = pluginsByName.get(pluginName); - if( plugin == null ) { - String errorMessage = formatErrorMessage(pluginCategory,pluginName); - if ( this.getClass().isAssignableFrom(FilterManager.class) ) { - throw new UserException.MalformedReadFilterException(errorMessage); - } else if ( this.getClass().isAssignableFrom(WalkerManager.class) ) { - throw new UserException.MalformedWalkerArgumentsException(errorMessage); - } else { - throw new UserException.CommandLineException(errorMessage); - } - } - try { - return plugin.newInstance(); - } catch (Exception e) { - throw new DynamicClassResolutionException(plugin, e); - } - } - - /** - * create a plugin with the given type - * - * @param pluginType type of the plugin to create. - * @return The plugin object if created; null otherwise. 
- */ - public PluginType createByType(Class pluginType) { - Logger logger = Logger.getLogger(PluginManager.class); - logger.setLevel(Level.ERROR); - try { - Constructor noArgsConstructor = pluginType.getDeclaredConstructor((Class[])null); - noArgsConstructor.setAccessible(true); - return noArgsConstructor.newInstance(); - } catch (Exception e) { - logger.error("Couldn't initialize the plugin. Typically this is because of wrong global class variable initializations."); - throw new DynamicClassResolutionException(pluginType, e); - } - } - - /** - * Returns concrete instances of the plugins - * @return concrete instances of the plugins - */ - public List createAllTypes() { - List instances = new ArrayList(); - for ( Class c : getPlugins() ) { - instances.add(createByType(c)); - } - return instances; - } - - /** - * Create a name for this type of plugin. - * - * @param pluginType The type of plugin. - * @return A name for this type of plugin. - */ - public String getName(Class pluginType) { - String pluginName = ""; - - if (pluginName.length() == 0) { - pluginName = pluginType.getSimpleName(); - if (pluginSuffix != null && pluginName.endsWith(pluginSuffix)) - pluginName = pluginName.substring(0, pluginName.lastIndexOf(pluginSuffix)); - } - - return pluginName; - } - - /** - * Generate the error message for the plugin manager. The message is allowed to depend on the class. - * @param pluginCategory - string, the category of the plugin (e.g. 
read filter) - * @param pluginName - string, what we were trying to match (but failed to) - * @return error message text describing the error - */ - protected String formatErrorMessage(String pluginCategory, String pluginName ) { - return String.format("Could not find %s with name: %s", pluginCategory,pluginName); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/beagle/BeagleCodec.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/beagle/BeagleCodec.java deleted file mode 100644 index 915900388..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/beagle/BeagleCodec.java +++ /dev/null @@ -1,276 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.codecs.beagle; -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - - -import htsjdk.tribble.AsciiFeatureCodec; -import htsjdk.tribble.exception.CodecLineParsingException; -import htsjdk.tribble.readers.LineIterator; -import org.broadinstitute.gatk.engine.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -import java.io.IOException; -import java.util.*; -import java.util.regex.Pattern; - -/** - * Codec for Beagle imputation engine - * - *

- * Reads in tabular files with site markers and genotype posteriors, genotypes and phasing that Beagle produced - *

- * - *

- * See also: @see BEAGLE home page
- *

- - *

- * - *

File format example for phased genotypes file

- *
- *     dummy header
- *      20:60251 T T T T T T
- *      20:60321 G G G G G G
- *      20:60467 G G G G G G
- * 
- * - *

File format example for genotype posteriors

- *
- *     marker alleleA alleleB NA07056 NA07056 NA07056
- *     20:60251 T C 0.9962 0.0038 0 0.99245 0.00755 0 0.99245 0.00755 0
- *     20:60321 G T 0.98747 0.01253 0 0.99922 0.00078 0 0.99368 0.00632 0
- *     20:60467 G C 0.97475 0.02525 0 0.98718 0.01282 0 0.98718 0.01282 0
- * 
- * - *

File format example for r2 file - *
- *      20:60251        0.747
- *      20:60321        0.763
- *      20:60467        0.524
- * 
- *

- * @author Mark DePristo - * @since 2010 - */ -public class BeagleCodec extends AsciiFeatureCodec implements ReferenceDependentFeatureCodec { - private String[] header; - public enum BeagleReaderType {PROBLIKELIHOOD, GENOTYPES, R2}; - private BeagleReaderType readerType; - private int valuesPerSample; - private int initialSampleIndex; - private int markerPosition; - private ArrayList sampleNames; - private int expectedTokensPerLine; - private final static Set HEADER_IDs = new HashSet(Arrays.asList("marker", "I")); - - private static final String delimiterRegex = "\\s+"; - - /** - * The parser to use when resolving genome-wide locations. - */ - private GenomeLocParser genomeLocParser; - - public BeagleCodec() { - super(BeagleFeature.class); - } - - /** - * Set the parser to use when resolving genetic data. - * @param genomeLocParser The supplied parser. - */ - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - @Override - public Object readActualHeader(LineIterator reader) { - int[] lineCounter = new int[1]; - try { - header = readHeader(reader, lineCounter); - - Boolean getSamples = true; - markerPosition = 0; //default value for all readers - - if (header[0].matches("I")) { - // Phased genotype Beagle files start with "I" - readerType = BeagleReaderType.GENOTYPES; - valuesPerSample = 2; - initialSampleIndex = 2; - markerPosition = 1; - } - else if (header[0].matches("marker")) { - readerType = BeagleReaderType.PROBLIKELIHOOD; - valuesPerSample = 3; - initialSampleIndex = 3; - } - else { - readerType = BeagleReaderType.R2; - getSamples = false; - // signal we don't have a header - lineCounter[0] = 0; - // not needed, but for consistency: - valuesPerSample = 0; - initialSampleIndex = 0; - } - - sampleNames = new ArrayList(); - - if (getSamples) { - for (int k = initialSampleIndex; k < header.length; k += valuesPerSample) - sampleNames.add(header[k]); - - expectedTokensPerLine = 
sampleNames.size()*valuesPerSample+initialSampleIndex; - - } else { - expectedTokensPerLine = 2; - } - - - } catch(IOException e) { - throw new IllegalArgumentException("Unable to read from file.", e); - } - return header; - } - - private static String[] readHeader(final LineIterator source, int[] lineCounter) throws IOException { - - String[] header = null; - int numLines = 0; - - //find the 1st line that's non-empty and not a comment - while(source.hasNext()) { - final String line = source.next(); - numLines++; - if ( line.trim().isEmpty() ) { - continue; - } - - //parse the header - header = line.split(delimiterRegex); - break; - } - - // check that we found the header - if ( header == null ) { - throw new IllegalArgumentException("No header in " + source); - } - - if(lineCounter != null) { - lineCounter[0] = numLines; - } - - return header; - } - - private static Pattern MARKER_PATTERN = Pattern.compile("(.+):([0-9]+)"); - - public BeagleFeature decode(String line) { - String[] tokens; - - // split the line - tokens = line.split(delimiterRegex); - if (tokens.length != expectedTokensPerLine) - throw new CodecLineParsingException("Incorrect number of fields in Beagle input on line "+line); - - if ( HEADER_IDs.contains(tokens[0]) ) - return null; - - BeagleFeature bglFeature = new BeagleFeature(); - - final GenomeLoc loc = genomeLocParser.parseGenomeLoc(tokens[markerPosition]); //GenomeLocParser.parseGenomeLoc(values.get(0)); - TODO switch to this - - //parse the location: common to all readers - bglFeature.setChr(loc.getContig()); - bglFeature.setStart((int) loc.getStart()); - bglFeature.setEnd((int) loc.getStop()); - - // Parse R2 if needed - if (readerType == BeagleReaderType.R2) { - bglFeature.setR2value(Double.valueOf(tokens[1])); - } - else if (readerType == BeagleReaderType.GENOTYPES) { - // read phased Genotype pairs - HashMap> sampleGenotypes = new HashMap>(); - - for ( int i = 2; i < tokens.length; i+=2 ) { - String sampleName = sampleNames.get(i/2-1); - 
if ( ! sampleGenotypes.containsKey(sampleName) ) { - sampleGenotypes.put(sampleName, new ArrayList()); - } - - sampleGenotypes.get(sampleName).add(tokens[i]); - sampleGenotypes.get(sampleName).add(tokens[i+1]); - } - - bglFeature.setGenotypes(sampleGenotypes); - } - else { - // read probabilities/likelihood trios and alleles - bglFeature.setAlleleA(tokens[1], true); - bglFeature.setAlleleB(tokens[2], false); - HashMap> sampleProbLikelihoods = new HashMap>(); - - for ( int i = 3; i < tokens.length; i+=3 ) { - String sampleName = sampleNames.get(i/3-1); - if ( ! sampleProbLikelihoods.containsKey(sampleName) ) { - sampleProbLikelihoods.put(sampleName, new ArrayList()); - } - - sampleProbLikelihoods.get(sampleName).add(tokens[i]); - sampleProbLikelihoods.get(sampleName).add(tokens[i+1]); - sampleProbLikelihoods.get(sampleName).add(tokens[i+2]); - } - bglFeature.setProbLikelihoods(sampleProbLikelihoods); - } - - return bglFeature; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqCodec.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqCodec.java deleted file mode 100644 index 9d60076ca..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqCodec.java +++ /dev/null @@ -1,171 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.codecs.refseq; - -import htsjdk.tribble.AsciiFeatureCodec; -import htsjdk.tribble.Feature; -import htsjdk.tribble.TribbleException; -import htsjdk.tribble.readers.LineIterator; -import org.broadinstitute.gatk.engine.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.ArrayList; - -/** - * Allows for reading in RefSeq information - * - *

- * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop, - * strandedness of transcription. - *

- * - *

- * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the documentation guide here - * http://www.broadinstitute.org/gatk/guide/article?id=1329 - *

- *

Usage

- * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example - *
- * -refSeqBinding:REFSEQ /path/to/refSeq.txt
- * 
- * - * You will need to consult individual walkers for the binding name ("refSeqBinding", above) - * - *

File format example

- * If you want to define your own file for use, the format is (tab delimited): - * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete) - * and exon frames, for example: - *
- * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
- * 
- * for more information see here - *

- * - *

- * - * @author Mark DePristo - * @since 2010 - */ -public class RefSeqCodec extends AsciiFeatureCodec implements ReferenceDependentFeatureCodec { - - /** - * The parser to use when resolving genome-wide locations. - */ - private GenomeLocParser genomeLocParser; - private boolean zero_coding_length_user_warned = false; - - public RefSeqCodec() { - super(RefSeqFeature.class); - } - - /** - * Set the parser to use when resolving genetic data. - * @param genomeLocParser The supplied parser. - */ - @Override - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - @Override - public Feature decodeLoc(final LineIterator lineIterator) { - final String line = lineIterator.next(); - if (line.startsWith("#")) return null; - String fields[] = line.split("\t"); - if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length); - String contig_name = fields[2]; - try { - return new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - } catch ( UserException.MalformedGenomeLoc e ) { - Utils.warnUser("RefSeq file is potentially incorrect, as some transcripts or exons have a negative length ("+fields[2]+")"); - return null; - } catch ( NumberFormatException e ) { - throw new UserException.MalformedFile("Could not parse location from line: " + line); - } - } - - /** Fills this object from a text line in RefSeq (UCSC) text dump file */ - @Override - public RefSeqFeature decode(String line) { - if (line.startsWith("#")) return null; - String fields[] = line.split("\t"); - - // we reference postion 15 in the split array below, make sure we have at least that many columns - if (fields.length < 16) throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length); - String 
contig_name = fields[2]; - RefSeqFeature feature = new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - - feature.setTranscript_id(fields[1]); - if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1); - else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1); - else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line); - - int coding_start = Integer.parseInt(fields[6])+1; - int coding_stop = Integer.parseInt(fields[7]); - - if ( coding_start > coding_stop ) { - if ( ! zero_coding_length_user_warned ) { - Utils.warnUser("RefSeq file contains transcripts with zero coding length. "+ - "Such transcripts will be ignored (this warning is printed only once)"); - zero_coding_length_user_warned = true; - } - return null; - } - - feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); - feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, coding_start, coding_stop)); - feature.setGene_name(fields[12]); - String[] exon_starts = fields[9].split(","); - String[] exon_stops = fields[10].split(","); - String[] eframes = fields[15].split(","); - - if ( exon_starts.length != exon_stops.length ) - throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line); - if ( exon_starts.length != eframes.length ) - throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line); - - ArrayList exons = new ArrayList(exon_starts.length); - ArrayList exon_frames = new ArrayList(eframes.length); - - for ( int i = 0 ; i < exon_starts.length ; i++ ) { - exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); - 
exon_frames.add(Integer.decode(eframes[i])); - } - - feature.setExons(exons); - feature.setExon_frames(exon_frames); - return feature; - } - - @Override - public Object readActualHeader(LineIterator lineIterator) { - // No header for this format - return null; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqFeature.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqFeature.java deleted file mode 100644 index 226a35307..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqFeature.java +++ /dev/null @@ -1,323 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.codecs.refseq; - -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.*; - -/** - * the ref seq feature - */ -public class RefSeqFeature implements Transcript, Feature { - - private String transcript_id; - private int strand; - private GenomeLoc transcript_interval; - private GenomeLoc transcript_coding_interval; - private List exons; - private String gene_name; - private List exon_frames; - private String name; - - public RefSeqFeature(GenomeLoc genomeLoc) { - this.transcript_interval = genomeLoc; - } - - /** Returns id of the transcript (RefSeq NM_* id) */ - public String getTranscriptId() { return transcript_id; } - - /** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */ - public int getStrand() { return strand; } - - /** Returns transcript's full genomic interval (includes all exons with UTRs) */ - public GenomeLoc getLocation() { - return transcript_interval; - } - - /** Returns genomic interval of the coding sequence (does not include UTRs, but still includes introns, since it's a single interval on the DNA) */ - public GenomeLoc getCodingLocation() { return transcript_coding_interval; } - - /** Name of the gene this transcript corresponds to (NOT gene id such as Entrez etc) */ - public String getGeneName() { return gene_name; } - - /** Number of exons in this transcript */ - public int getNumExons() { return exons.size(); } - - /** Genomic location of the n-th exon; throws an exception if n is out of bounds */ - public GenomeLoc getExonLocation(int n) { - if ( n >= exons.size() || n < 0 ) throw new ReviewedGATKException("Index out-of-bounds. 
Transcript has " + exons.size() +" exons; requested: "+n); - return exons.get(n); - } - - /** Returns the list of all exons in this transcript, as genomic intervals */ - public List getExons() { return exons; } - - /** Returns all exons falling ::entirely:: inside an interval **/ - public List getExonsInInterval( GenomeLoc interval ) { - List relevantExons = new ArrayList(exons.size()); - for ( GenomeLoc exon : getExons() ) { - if ( interval.containsP(exon) ) { - relevantExons.add(exon); - } - } - - return relevantExons; - } - - /** convenience method; returns the numbers of the exons in the interval **/ - public List getExonNumbersInInterval( GenomeLoc interval ) { - List numbers = new ArrayList(); - int iNo = 0; - for ( GenomeLoc exon : getExons() ) { - if ( interval.containsP(exon) ) { - numbers.add(iNo); - } - iNo++; - } - - return numbers; - } - - public String getTranscriptUniqueGeneName() { - return String.format("%s(%s)",getGeneName(),getTranscriptId()); - } - - public String getOverlapString(GenomeLoc position) { - boolean is_exon = false; - StringBuilder overlapString = new StringBuilder(); - int exonNo = 1; - - for ( GenomeLoc exon : exons ) { - if ( exon.containsP(position) ) { - overlapString.append(String.format("exon_%d",exonNo)); - is_exon = true; - break; - } - exonNo ++; - } - - if ( ! is_exon ) { - if ( overlapsCodingP(position) ) { - overlapString.append("Intron"); - } else { - overlapString.append("UTR"); - } - } - - return overlapString.toString(); - } - - ArrayList exonInRefOrderCache = null; - - public Integer getSortedOverlapInteger(GenomeLoc position) { - int exonNo = -1; - ArrayList exonsInReferenceOrder = exonInRefOrderCache != null ? 
exonInRefOrderCache : new ArrayList(exons); - if ( exonInRefOrderCache == null ) { - Collections.sort(exonsInReferenceOrder); - } - exonInRefOrderCache = exonsInReferenceOrder; - for ( GenomeLoc exon : exonsInReferenceOrder ) { - if ( exon.overlapsP(position) ) { - return ++exonNo; - } - ++exonNo; - } - - return -1; - } - - public GenomeLoc getSortedExonLoc(int offset) { - ArrayList exonsInReferenceOrder = exonInRefOrderCache != null ? exonInRefOrderCache : new ArrayList(exons); - if ( exonInRefOrderCache == null ) { - Collections.sort(exonsInReferenceOrder); - } - exonInRefOrderCache = exonsInReferenceOrder; - return exonsInReferenceOrder.get(offset); - } - - /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ - public boolean overlapsP (GenomeLoc that) { - return getLocation().overlapsP(that); - } - - /** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript. - * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons, - * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript. 
- * @see #overlapsExonP - */ - public boolean overlapsCodingP (GenomeLoc that) { - return transcript_coding_interval.overlapsP(that); - } - - /** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */ - public boolean overlapsExonP (GenomeLoc that) { - for ( GenomeLoc e : exons ) { - if ( e.overlapsP(that) ) return true; - } - return false; - } - public String toString() { - StringBuilder b = new StringBuilder("000\t"); // first field is unused but required in th ecurrent format; just set to something - b.append(transcript_id); // #1 - b.append('\t'); - b.append(getLocation().getContig()); // #2 - b.append('\t'); - b.append( (strand==1?'+':'-') ); // #3 - b.append('\t'); - b.append( (getLocation().getStart() - 1) ); // #4 - b.append('\t'); - b.append( getLocation().getStop()); // #5 - b.append('\t'); - b.append( (transcript_coding_interval.getStart() - 1) ); // #6 - b.append('\t'); - b.append( transcript_coding_interval.getStop()); // #7 - b.append('\t'); - b.append(exons.size()); // #8 - b.append('\t'); - for ( GenomeLoc loc : exons ) { b.append( (loc.getStart()-1) ); b.append(','); } // #9 - b.append('\t'); - for ( GenomeLoc loc : exons ) { b.append( loc.getStop() ); b.append(','); } // #10 - b.append("\t0\t"); // # 11 - unused? - b.append(gene_name); // # 12 - b.append("\tcmpl\tcmpl\t"); // #13, #14 - unused? - for ( Integer f : exon_frames ) { b.append( f ); b.append(','); } // #15 - - - return b.toString(); - } - - /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to - * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether - * this position is fully whithin an exon of any of those transcripts. Passing null is safe (will return false). 
- * NOTE: position can be still within a UTR, see #isCoding - * @return true if it's an exon - */ - public static boolean isExon(RODRecordList l) { - - if ( l == null ) return false; - - GenomeLoc loc = l.getLocation(); - - for ( GATKFeature t : l ) { - if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsExonP(loc) ) return true; - } - return false; - - } - - /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to - * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether - * this position is fully whithin a coding region of any of those transcripts. - * Passing null is safe (will return false). - * NOTE: "coding" interval is defined as a single genomic interval, so it - * does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a - * transcript, or internal exons that are not spliced into a given transcript. To check that a position is - * indeed within an exon but not in UTR, use #isCodingExon(). - * @return - */ - public static boolean isCoding(RODRecordList l) { - - if ( l == null ) return false; - - GenomeLoc loc = l.getLocation(); - - for ( GATKFeature t : l ) { - if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsCodingP(loc) ) return true; - } - return false; - - } - - /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to - * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether - * this position is fully whithin a coding exon portion (i.e. true coding sequence) of any of those transcripts. - * Passing null is safe (will return false). In other words, this method returns true if the list contains a transcript, - * for which the current position is within an exon and within a coding interval simultaneously. 
- * @return - */ - public static boolean isCodingExon(RODRecordList l) { - - if ( l == null ) return false; - - GenomeLoc loc = l.getLocation(); - - for ( GATKFeature t : l ) { - if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsCodingP(loc) && ((RefSeqFeature)t.getUnderlyingObject()).overlapsExonP(loc) ) return true; - } - return false; - - } - - - public void setTranscript_id(String transcript_id) { - this.transcript_id = transcript_id; - } - - public void setStrand(int strand) { - this.strand = strand; - } - - public void setTranscript_interval(GenomeLoc transcript_interval) { - this.transcript_interval = transcript_interval; - } - - public void setTranscript_coding_interval(GenomeLoc transcript_coding_interval) { - this.transcript_coding_interval = transcript_coding_interval; - } - - public void setExons(List exons) { - this.exons = exons; - } - - public void setGene_name(String gene_name) { - this.gene_name = gene_name; - } - - public void setExon_frames(List exon_frames) { - this.exon_frames = exon_frames; - } - - public void setName(String name) { - this.name = name; - } - - public String getChr() { - return transcript_interval.getContig(); - } - - public int getStart() { - return transcript_interval.getStart(); - } - - public int getEnd() { - return transcript_interval.getStop(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/table/BedTableCodec.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/table/BedTableCodec.java deleted file mode 100644 index 9a0115f66..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/table/BedTableCodec.java +++ /dev/null @@ -1,59 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without 
limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.codecs.table; - -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.engine.refdata.ReferenceDependentFeatureCodec; - -import java.util.Arrays; - -/** - * The standard table codec that expects loci as contig start stop, not contig:start-stop - * - *

- * The standard table codec with a slightly different parsing convention - * (expects loci as contig start stop, not contig:start-stop) - *

- * - *

- * See also: TableCodec - *

- * - * @author Chris Hartl - * @since 2010 - */ -public class BedTableCodec extends TableCodec implements ReferenceDependentFeatureCodec { - - @Override - public TableFeature decode(String line) { - if (line.startsWith(headerDelimiter) || line.startsWith(commentDelimiter) || line.startsWith(igvHeaderDelimiter)) - return null; - String[] split = line.split(delimiterRegex); - if (split.length < 1) - throw new IllegalArgumentException("TableCodec line = " + line + " doesn't appear to be a valid table format"); - return new TableFeature(genomeLocParser.createGenomeLoc(split[0],Integer.parseInt(split[1])-1,Integer.parseInt(split[2])), Arrays.asList(split),header); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/table/TableCodec.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/table/TableCodec.java deleted file mode 100644 index 1058d3e90..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/table/TableCodec.java +++ /dev/null @@ -1,126 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.codecs.table; - -import htsjdk.tribble.AsciiFeatureCodec; -import htsjdk.tribble.readers.LineIterator; -import org.broadinstitute.gatk.engine.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; - -/** - * Reads tab deliminated tabular text files - * - *

- *

    - *
  • Header: must begin with line HEADER or track (for IGV), followed by any number of column names, - * separated by whitespace.
  • - *
  • Comment lines starting with # are ignored
  • - *
  • Each non-header and non-comment line is split into parts by whitespace, - * and these parts are assigned as a map to their corresponding column name in the header. - * Note that the first element (corresponding to the HEADER column) must be a valid genome loc - * such as 1, 1:1 or 1:1-10, which is the position of the Table element on the genome. TableCodec - * requires that there be one value for each column in the header, and no more, on all lines.
  • - *
- *

- * - *

- * - *

File format example

- *
- *     HEADER a b c
- *     1:1  1   2   3
- *     1:2  4   5   6
- *     1:3  7   8   9
- * 
- * - * @author Mark DePristo - * @since 2009 - */ -public class TableCodec extends AsciiFeatureCodec implements ReferenceDependentFeatureCodec { - final static protected String delimiterRegex = "\\s+"; - final static protected String headerDelimiter = "HEADER"; - final static protected String igvHeaderDelimiter = "track"; - final static protected String commentDelimiter = "#"; - - protected ArrayList header = new ArrayList(); - - /** - * The parser to use when resolving genome-wide locations. - */ - protected GenomeLocParser genomeLocParser; - - public TableCodec() { - super(TableFeature.class); - } - - /** - * Set the parser to use when resolving genetic data. - * @param genomeLocParser The supplied parser. - */ - @Override - public void setGenomeLocParser(GenomeLocParser genomeLocParser) { - this.genomeLocParser = genomeLocParser; - } - - @Override - public TableFeature decode(String line) { - if (line.startsWith(headerDelimiter) || line.startsWith(commentDelimiter) || line.startsWith(igvHeaderDelimiter)) - return null; - String[] split = line.split(delimiterRegex); - if (split.length < 1) - throw new IllegalArgumentException("TableCodec line = " + line + " doesn't appear to be a valid table format"); - return new TableFeature(genomeLocParser.parseGenomeLoc(split[0]),Arrays.asList(split), header); - } - - @Override - public Object readActualHeader(final LineIterator reader) { - boolean isFirst = true; - while (reader.hasNext()) { - final String line = reader.peek(); // Peek to avoid reading non-header data - if ( isFirst && ! line.startsWith(headerDelimiter) && ! line.startsWith(commentDelimiter)) { - throw new UserException.MalformedFile("TableCodec file does not have a header"); - } - isFirst &= line.startsWith(commentDelimiter); - if (line.startsWith(headerDelimiter)) { - reader.next(); // "Commit" the peek - if (header.size() > 0) throw new IllegalStateException("Input table file seems to have two header lines. 
The second is = " + line); - final String spl[] = line.split(delimiterRegex); - Collections.addAll(header, spl); - return header; - } else if (line.startsWith(commentDelimiter)) { - reader.next(); // "Commit" the peek - } else { - break; - } - } - return header; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/RODMergingIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/RODMergingIterator.java deleted file mode 100644 index 7af62bd9e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/RODMergingIterator.java +++ /dev/null @@ -1,160 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.collections; - -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.Collection; -import java.util.Iterator; -import java.util.LinkedList; -import java.util.PriorityQueue; - -public class RODMergingIterator implements Iterator, Iterable { - PriorityQueue queue = new PriorityQueue(); - - private class Element implements Comparable { - public LocationAwareSeekableRODIterator it = null; - public GenomeLoc nextLoc = null; - - public Element(Iterator it) { - if ( it instanceof LocationAwareSeekableRODIterator) { - this.it = (LocationAwareSeekableRODIterator)it; - if ( ! it.hasNext() ) throw new ReviewedGATKException("Iterator is empty"); - update(); - } else { - throw new ReviewedGATKException("Iterator passed to RODMergingIterator is not LocationAwareSeekableRODIterator"); - } - } - - public Element update() { - // E prev = value; - nextLoc = it.peekNextLocation(); // will return null if there is no next location - return this; - } - - public int compareTo(Element other) { - if ( nextLoc == null ) { - if ( other.nextLoc != null ) return 1; // null means no more data available, so its after any non-null position - return 0; - } - if ( other.nextLoc == null ) return -1; // we can get to this point only if this.nextLoc != null - - return nextLoc.compareTo(other.nextLoc); - } - - public RODRecordList next() { - RODRecordList value = it.next(); - update(); - return value; - } - } - - public Iterator iterator() { - return this; - } - - public RODMergingIterator() { - ; - } - - public RODMergingIterator(Iterator it) { - add(it); - } - - public RODMergingIterator(Collection> its) { - for ( Iterator it : its ) { - add(it); - } - } - - /** If the iterator is non-empty (hasNext() is true), put it 
into the queue. The next location the iterator - * will be after a call to next() is peeked into and cached as queue's priority value. - * @param it - */ - public void add(Iterator it) { - if ( it.hasNext() ) - queue.add(new Element(it)); - } - - public boolean hasNext() { - return ! queue.isEmpty(); - } - - public RODRecordList next() { - Element e = queue.poll(); - RODRecordList value = e.next(); // next() will also update next location cached by the Element - - if ( e.nextLoc != null ) // we have more data in the track - queue.add(e); // add the element back to queue (note: its next location, on which priority is based, was updated - - //System.out.printf("Element is %s%n", e.value); - return value; - } - - /** Peeks into the genomic location of the record this iterator will return next. - * - * @return - */ - public GenomeLoc peekLocation() { - return queue.peek().nextLoc; - } - - public Collection allElementsLTE(RODRecordList elt) { - return allElementsLTE(elt, true); - } - - public Collection allElementsLTE(RODRecordList elt, boolean includeElt) { - LinkedList all = new LinkedList(); - - if ( includeElt ) all.add(elt); - - while ( hasNext() ) { - Element x = queue.peek(); - //System.out.printf("elt.compareTo(x) == %d%n", elt.compareTo(x)); - //System.out.printf("In allElementLTE%n"); - int cmp = elt.getLocation().compareTo(x.nextLoc); - //System.out.printf("x=%s%n elt=%s%n => elt.compareTo(x) == %d%n", x, elt, cmp); - if ( cmp >= 0 ) { - //System.out.printf(" Adding element x=%s, size = %d%n", x, all.size()); - all.add(next()); - //System.out.printf(" Added size = %d%n", all.size()); - } - else { - //System.out.printf("breaking...%n"); - break; - } - } - - return all; - } - - public void remove() { - throw new UnsupportedOperationException(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatch.java 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatch.java deleted file mode 100644 index 885e02d66..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatch.java +++ /dev/null @@ -1,294 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.commandline; - -import org.broadinstitute.gatk.engine.walkers.Multiplexer; - -import java.util.*; - -/** - * A mapping of all the sites where an argument definition maps to a site on the command line. - */ -public class ArgumentMatch implements Iterable { - /** - * The argument definition that's been matched. - */ - public final ArgumentDefinition definition; - - /** - * The text that's been matched, as it appears in the command line arguments. 
- */ - public final String label; - - /** - * Maps indices of command line arguments to values paired with that argument. - */ - public final SortedMap> sites = new TreeMap>(); - - /** - * An ordered, freeform collection of tags. - */ - public final Tags tags; - - /** - * Create a new argument match, defining its properties later. Used to create invalid arguments. - */ - public ArgumentMatch() { - this(null,null); - } - - /** - * Minimal constructor for transform function. - * @param label Label of the argument match. Must not be null. - * @param definition The associated definition, if one exists. May be null. - */ - private ArgumentMatch(final String label, final ArgumentDefinition definition) { - this.label = label; - this.definition = definition; - this.tags = new Tags(); - } - - /** - * A simple way of indicating that an argument with the given label and definition exists at this site. - * @param label Label of the argument match. Must not be null. - * @param definition The associated definition, if one exists. May be null. - * @param site Position of the argument. Must not be null. - * @param tags ordered freeform text tags associated with this argument. - */ - public ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final Tags tags) { - this( label, definition, site, null, tags ); - } - - /** - * A simple way of indicating that an argument with the given label and definition exists at this site. - * @param label Label of the argument match. Must not be null. - * @param definition The associated definition, if one exists. May be null. - * @param site Position of the argument. Must not be null. - * @param value Value for the argument at this position. - * @param tags ordered freeform text tags associated with this argument. 
- */ - private ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final ArgumentMatchValue value, final Tags tags) { - this.label = label; - this.definition = definition; - - ArrayList values = new ArrayList(); - if( value != null ) - values.add(value); - sites.put(site,values ); - - this.tags = tags; - } - - /** - * Check to see whether two ArgumentMatch objects are equal. - * @param other Object to which this should be compared. - * @return True if objects are equal, false if objects are not equal or incomparable. - */ - @Override - public boolean equals(Object other) { - // this clearly isn't null, since this.equals() when this == null would result in an NPE. - if(other == null) - return false; - if(!(other instanceof ArgumentMatch)) - return false; - ArgumentMatch otherArgumentMatch = (ArgumentMatch)other; - return this.definition.equals(otherArgumentMatch.definition) && - this.label.equals(otherArgumentMatch.label) && - this.sites.equals(otherArgumentMatch.sites) && - this.tags.equals(otherArgumentMatch.tags); - } - - - /** - * Reformat the given entries with the given multiplexer and key. - * TODO: Generify this. - * @param multiplexer Multiplexer that controls the transformation process. - * @param key Key which specifies the transform. - * @return A variant of this ArgumentMatch with all keys transformed. 
- */ - @SuppressWarnings("unchecked") - ArgumentMatch transform(Multiplexer multiplexer, Object key) { - SortedMap> newIndices = new TreeMap>(); - for(Map.Entry> site: sites.entrySet()) { - List newEntries = new ArrayList(); - for(ArgumentMatchValue entry: site.getValue()) - newEntries.add(new ArgumentMatchStringValue(multiplexer.transformArgument(key,entry.asString()))); - newIndices.put(site.getKey(),newEntries); - } - ArgumentMatch newArgumentMatch = new ArgumentMatch(label,definition); - newArgumentMatch.sites.putAll(newIndices); - return newArgumentMatch; - } - - /** - * Return a string representation of the given argument match, for debugging purposes. - * @return String representation of the match. - */ - public String toString() { - return label; - } - - /** - * Creates an iterator that walks over each individual match at each position of a given argument. - * @return An iterator over the individual matches in this argument. Will not be null. - */ - public Iterator iterator() { - return new Iterator() { - /** - * Iterate over each the available site. - */ - private Iterator siteIterator = null; - - /** - * Iterate over each available token. - */ - private Iterator tokenIterator = null; - - /** - * The next site to return. Null if none remain. - */ - ArgumentMatchSite nextSite = null; - - /** - * The next token to return. Null if none remain. - */ - ArgumentMatchValue nextToken = null; - - { - siteIterator = sites.keySet().iterator(); - prepareNext(); - } - - /** - * Is there a nextToken available to return? - * @return True if there's another token waiting in the wings. False otherwise. - */ - public boolean hasNext() { - return nextSite != null; - } - - /** - * Get the next token, if one exists. If not, throw an IllegalStateException. - * @return The next ArgumentMatch in the series. Should never be null. 
- */ - public ArgumentMatch next() { - if( nextSite == null ) - throw new IllegalStateException( "No more ArgumentMatches are available" ); - - ArgumentMatch match = new ArgumentMatch( label, definition, nextSite, nextToken, tags ); - prepareNext(); - return match; - } - - /** - * Initialize the next ArgumentMatch to return. If no ArgumentMatches are available, - * initialize nextSite / nextToken to null. - */ - private void prepareNext() { - if( tokenIterator != null && tokenIterator.hasNext() ) { - nextToken = tokenIterator.next(); - } - else { - nextSite = null; - nextToken = null; - - // Do a nested loop. While more data is present in the inner loop, grab that data. - // Otherwise, troll the outer iterator looking for more data. - while( siteIterator.hasNext() ) { - nextSite = siteIterator.next(); - if( sites.get(nextSite) != null ) { - tokenIterator = sites.get(nextSite).iterator(); - nextToken = tokenIterator.hasNext() ? tokenIterator.next() : null; - break; - } - } - } - - } - - /** - * Remove is unsupported in this context. - */ - public void remove() { - throw new UnsupportedOperationException("Cannot remove an argument match from the collection while iterating."); - } - }; - } - - /** - * Merge two ArgumentMatches, so that the values for all arguments go into the - * same data structure. - * @param other The other match to merge into. - */ - public void mergeInto( ArgumentMatch other ) { - sites.putAll(other.sites); - } - - /** - * Associate a value with this merge maapping. - * @param site site of the command-line argument to which this value is mated. - * @param value Text representation of value to add. - */ - public void addValue( ArgumentMatchSite site, ArgumentMatchValue value ) { - if( !sites.containsKey(site) || sites.get(site) == null ) - sites.put(site, new ArrayList() ); - sites.get(site).add(value); - } - - /** - * Does this argument already have a value at the given site? 
- * Arguments are only allowed to be single-valued per site, and - * flags aren't allowed a value at all. - * @param site Site at which to check for values. - * @return True if the argument has a value at the given site. False otherwise. - */ - public boolean hasValueAtSite( ArgumentMatchSite site ) { - return (sites.get(site) != null && sites.get(site).size() >= 1) || isArgumentFlag(); - } - - /** - * Return the values associated with this argument match. - * @return A collection of the string representation of these value. - */ - public List values() { - final List values = new ArrayList(); - for ( final List siteValue : sites.values() ) { - if ( siteValue != null ) - values.addAll(siteValue); - else - values.add(null); - } - return values; - } - - /** - * Convenience method returning true if the definition is a flag. - * @return True if definition is known to be a flag; false if not known to be a flag. - */ - private boolean isArgumentFlag() { - return definition != null && definition.isFlag; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatches.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatches.java deleted file mode 100644 index 2d81cfcaa..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatches.java +++ /dev/null @@ -1,211 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice 
shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.commandline; - -import org.broadinstitute.gatk.engine.walkers.Multiplexer; - -import java.util.*; -/** - * Represents a list of potential matches between the arguments defined - * by the argument sources and the arguments passed in via the command line. - */ -public class ArgumentMatches implements Iterable { - /** - * Collection matches from argument definition to argument value. - * Package protected access is deliberate. - */ - Map argumentMatches = new TreeMap(); - - /** - * Provide a place to put command-line argument values that don't seem to belong to - * any particular command-line option. - */ - ArgumentMatch MissingArgument = new ArgumentMatch(); - - /** - * Get an iterator cycling through *unique* command-line argument <-> definition matches. - * @return Iterator over all argument matches. - */ - public Iterator iterator() { - return getUniqueMatches().iterator(); - } - - /** - * Create an empty ArgumentMatches object. - */ - public ArgumentMatches() { - } - - /** - * Create a singleton ArgumentMatches object. - * @param match Match to incorporate. - */ - public ArgumentMatches( ArgumentMatch match ) { - mergeInto( match ); - } - - /** - * Returns the number of matches in this structure. - * @return Count of the matches in this structure. 
- */ - public int size() { - return argumentMatches.size(); - } - - /** - * Indicates whether the site contains a matched argument. - * @param site Site at which to check. - * @return True if the site has a match. False otherwise. - */ - boolean hasMatch( ArgumentMatchSite site ) { - return argumentMatches.containsKey( site ); - } - - /** - * Gets the match at a given site. - * @param site Site at which to look for a match. - * @return The match present at the given site. - * @throws IllegalArgumentException if site does not contain a match. - */ - ArgumentMatch getMatch( ArgumentMatchSite site ) { - if( !argumentMatches.containsKey(site) ) - throw new IllegalArgumentException( "Site does not contain an argument: " + site ); - return argumentMatches.get(site); - } - - /** - * Does the match collection have a match for this argument definition. - * @param definition Definition to match. - * @return True if a match exists; false otherwise. - */ - boolean hasMatch( ArgumentDefinition definition ) { - return findMatches( definition ).size() > 0; - } - - /** - * Return all argument matches of this source. - * @param parsingEngine Parsing engine. - * @param argumentSource Argument source to match. - * @return List of all matches. - */ - - ArgumentMatches findMatches(ParsingEngine parsingEngine, ArgumentSource argumentSource) { - List sourceDefinitions = parsingEngine.selectBestTypeDescriptor(argumentSource.field.getType()).createArgumentDefinitions(argumentSource); - - ArgumentMatches matches = new ArgumentMatches(); - for( ArgumentMatch argumentMatch: getUniqueMatches() ) { - if( sourceDefinitions.contains(argumentMatch.definition) ) - matches.mergeInto( argumentMatch ); - } - return matches; - } - - /** - * Return all argument matches of this definition. - * @param definition Argument definition to match. - * @return List of all matches. 
- */ - ArgumentMatches findMatches( ArgumentDefinition definition ) { - ArgumentMatches matches = new ArgumentMatches(); - for( ArgumentMatch argumentMatch: argumentMatches.values() ) { - if( argumentMatch.definition == definition ) - matches.mergeInto( argumentMatch ); - } - return matches; - } - - /** - * Find all successful matches (a 'successful' match is one paired with a definition). - * @return All successful matches. - */ - ArgumentMatches findSuccessfulMatches() { - ArgumentMatches matches = new ArgumentMatches(); - for( ArgumentMatch argumentMatch: argumentMatches.values() ) { - if( argumentMatch.definition != null ) - matches.mergeInto( argumentMatch ); - } - return matches; - } - - /** - * Find arguments that are unmatched to any definition. - * @return Set of matches that have no associated definition. - */ - ArgumentMatches findUnmatched() { - ArgumentMatches matches = new ArgumentMatches(); - for( ArgumentMatch argumentMatch: argumentMatches.values() ) { - if( argumentMatch.definition == null ) - matches.mergeInto( argumentMatch ); - } - return matches; - } - - /** - * Reformat the given entries with the given multiplexer and key. - * TODO: Generify this. - * @param multiplexer Multiplexer that controls the transformation process. - * @param key Key which specifies the transform. - * @return new argument matches. - */ - ArgumentMatches transform(Multiplexer multiplexer, Object key) { - ArgumentMatches newArgumentMatches = new ArgumentMatches(); - for(ArgumentMatch match: argumentMatches.values()) - newArgumentMatches.mergeInto(match.transform(multiplexer,key)); - return newArgumentMatches; - } - - /** - * Merges the given argument match into the set of existing argument matches. - * If multiple arguments are present, those arguments will end up grouped. - * @param match The match to merge into. 
- */ - void mergeInto( ArgumentMatch match ) { - boolean definitionExists = false; - - // Clone the list of argument matches to avoid ConcurrentModificationExceptions. - for( ArgumentMatch argumentMatch: getUniqueMatches() ) { - if( argumentMatch.definition == match.definition && argumentMatch.tags.equals(match.tags) ) { - argumentMatch.mergeInto( match ); - for( ArgumentMatchSite site: match.sites.keySet() ) - argumentMatches.put( site, argumentMatch ); - definitionExists = true; - } - } - - if( !definitionExists ) { - for( ArgumentMatchSite site: match.sites.keySet() ) - argumentMatches.put( site, match ); - } - } - - /** - * Determines, of the argument matches by position, which are unique and returns that list. - * @return A unique set of matches. - */ - private Set getUniqueMatches() { - return new LinkedHashSet( argumentMatches.values() ); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentTypeDescriptor.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentTypeDescriptor.java deleted file mode 100644 index 5bfc5166b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentTypeDescriptor.java +++ /dev/null @@ -1,1030 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.commandline; - -import org.apache.log4j.Logger; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.engine.refdata.tracks.FeatureManager; -import org.broadinstitute.gatk.engine.walkers.Multiplex; -import org.broadinstitute.gatk.engine.walkers.Multiplexer; -import org.broadinstitute.gatk.utils.classloader.JVMUtils; -import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.text.XReadLines; - -import java.io.File; -import java.io.IOException; -import java.lang.annotation.Annotation; -import java.lang.reflect.*; -import java.util.*; - -/** - * An descriptor capable of providing parsers that can parse any type - * of supported command-line argument. - * - * @author mhanna - * @version 0.1 - */ -public abstract class ArgumentTypeDescriptor { - private static Class[] ARGUMENT_ANNOTATIONS = {Input.class, Output.class, Argument.class}; - - /** - * our log, which we want to capture anything from org.broadinstitute.gatk - */ - protected static final Logger logger = Logger.getLogger(ArgumentTypeDescriptor.class); - - /** - * Fetch the given descriptor from the descriptor repository. - * @param descriptors the descriptors from which to select a good match. - * @param type Class for which to specify a descriptor. 
- * @return descriptor for the given type. - */ - public static ArgumentTypeDescriptor selectBest( Collection descriptors, Class type ) { - for( ArgumentTypeDescriptor descriptor: descriptors ) { - if( descriptor.supports(type) ) - return descriptor; - } - throw new ReviewedGATKException("Can't process command-line arguments of type: " + type.getName()); - } - - /** - * Does this descriptor support classes of the given type? - * @param type The type to check. - * @return true if this descriptor supports the given type, false otherwise. - */ - public abstract boolean supports( Class type ); - - /** - * Returns false if a type-specific default can be employed. - * @param source Source of the command-line argument. - * @return True to throw in a type specific default. False otherwise. - */ - public boolean createsTypeDefault(ArgumentSource source) { return false; } - - /** - * Returns a documentation-friendly value for the default of a type descriptor. - * Must be overridden if createsTypeDefault return true. cannot be called otherwise - * @param source Source of the command-line argument. - * @return Friendly string of the default value, for documentation. If doesn't create a default, throws - * and UnsupportedOperationException - */ - public String typeDefaultDocString(ArgumentSource source) { - throw new UnsupportedOperationException(); - } - - /** - * Generates a default for the given type. - * - * @param parsingEngine the parsing engine used to validate this argument type descriptor. - * @param source Source of the command-line argument. - * @param type Type of value to create, in case the command-line argument system wants influence. - * @return A default value for the given type. 
- */ - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { throw new UnsupportedOperationException("Unable to create default for type " + getClass()); } - - /** - * Given the given argument source and attributes, synthesize argument definitions for command-line arguments. - * @param source Source class and field for the given argument. - * @return A list of command-line argument definitions supporting this field. - */ - public List createArgumentDefinitions( ArgumentSource source ) { - return Collections.singletonList(createDefaultArgumentDefinition(source)); - } - - /** - * Parses an argument source to an object. - * WARNING! Mandatory side effect of parsing! Each parse routine should register the tags it finds with the proper CommandLineProgram. - * TODO: Fix this, perhaps with an event model indicating that a new argument has been created. - * - * @param parsingEngine The engine responsible for parsing. - * @param source The source used to find the matches. - * @param matches The matches for the source. - * @return The parsed object. - */ - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, ArgumentMatches matches) { - return parse(parsingEngine, source, source.field.getGenericType(), matches); - } - - /** - * Returns true if the field is a collection or an array. - * @param source The argument source to check. - * @return true if the field is a collection or an array. - */ - public boolean isMultiValued( ArgumentSource source ) { - Class argumentType = source.field.getType(); - return Collection.class.isAssignableFrom(argumentType) || argumentType.isArray(); - } - - /** - * By default, argument sources create argument definitions with a set of default values. - * Use this method to create the one simple argument definition. - * @param source argument source for which to create a default definition. - * @return The default definition for this argument source. 
- */ - protected ArgumentDefinition createDefaultArgumentDefinition( ArgumentSource source ) { - Annotation argumentAnnotation = getArgumentAnnotation(source); - return new ArgumentDefinition( ArgumentIOType.getIOType(argumentAnnotation), - source.field.getType(), - ArgumentDefinition.getFullName(argumentAnnotation, source.field.getName()), - ArgumentDefinition.getShortName(argumentAnnotation), - ArgumentDefinition.getDoc(argumentAnnotation), - source.isRequired() && !createsTypeDefault(source) && !source.isFlag() && !source.isDeprecated(), - source.isFlag(), - source.isMultiValued(), - source.isHidden(), - makeRawTypeIfNecessary(getCollectionComponentType(source.field)), - ArgumentDefinition.getExclusiveOf(argumentAnnotation), - ArgumentDefinition.getValidationRegex(argumentAnnotation), - getValidOptions(source) ); - } - - /** - * Return the component type of a field, or String.class if the type cannot be found. - * @param field The reflected field to inspect. - * @return The parameterized component type, or String.class if the parameterized type could not be found. - * @throws IllegalArgumentException If more than one parameterized type is found on the field. - */ - protected Type getCollectionComponentType( Field field ) { - return null; - } - - /** - * Parses the argument matches for a class type into an object. - * @param source The original argument source used to find the matches. - * @param type The current class type being inspected. May not match the argument source.field.getType() if this as a collection for example. - * @param matches The argument matches for the argument source, or the individual argument match for a scalar if this is being called to help parse a collection. - * @return The individual parsed object matching the argument match with Class type. 
- */ - public abstract Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ); - - /** - * If the argument source only accepts a small set of options, populate the returned list with - * those options. Otherwise, leave the list empty. - * @param source Original field specifying command-line arguments. - * @return A list of valid options. - */ - protected List getValidOptions( ArgumentSource source ) { - if(!source.field.getType().isEnum()) - return null; - List validOptions = new ArrayList(); - for(Object constant: source.field.getType().getEnumConstants()) - validOptions.add(constant.toString()); - return validOptions; - } - - /** - * Returns true if the argument with the given full name exists in the collection of ArgumentMatches. - * @param definition Definition of the argument for which to find matches. - * @param matches The matches for the given argument. - * @return true if the argument is present, or false if not present. - */ - protected boolean argumentIsPresent( ArgumentDefinition definition, ArgumentMatches matches ) { - for( ArgumentMatch match: matches ) { - if( match.definition.equals(definition) ) - return true; - } - return false; - } - - /** - * Gets the value of an argument with the given full name, from the collection of ArgumentMatches. - * If the argument matches multiple values, an exception will be thrown. - * @param definition Definition of the argument for which to find matches. - * @param matches The matches for the given argument. - * @return The value of the argument if available, or null if not present. 
- */ - protected ArgumentMatchValue getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) { - Collection argumentValues = getArgumentValues( definition, matches ); - if( argumentValues.size() > 1 ) - throw new UserException.CommandLineException("Multiple values associated with given definition, but this argument expects only one: " + definition.fullName); - return argumentValues.size() > 0 ? argumentValues.iterator().next() : null; - } - - /** - * Gets the tags associated with a given command-line argument. - * If the argument matches multiple values, an exception will be thrown. - * @param matches The matches for the given argument. - * @return The value of the argument if available, or null if not present. - */ - protected Tags getArgumentTags(ArgumentMatches matches) { - Tags tags = new Tags(); - for(ArgumentMatch match: matches) { - if(!tags.isEmpty() && !match.tags.isEmpty()) - throw new ReviewedGATKException("BUG: multiple conflicting sets of tags are available, and the type descriptor specifies no way of resolving the conflict."); - tags = match.tags; - } - return tags; - } - - /** - * Gets the values of an argument with the given full name, from the collection of ArgumentMatches. - * @param definition Definition of the argument for which to find matches. - * @param matches The matches for the given argument. - * @return The value of the argument if available, or an empty collection if not present. - */ - protected Collection getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) { - Collection values = new ArrayList(); - for( ArgumentMatch match: matches ) { - if( match.definition.equals(definition) ) - values.addAll(match.values()); - } - return values; - } - - /** - * Retrieves the argument description from the given argument source. Will throw an exception if - * the given ArgumentSource - * @param source source of the argument. - * @return Argument description annotation associated with the given field. 
- */ - @SuppressWarnings("unchecked") - protected static Annotation getArgumentAnnotation( ArgumentSource source ) { - for (Class annotation: ARGUMENT_ANNOTATIONS) - if (source.field.isAnnotationPresent(annotation)) - return source.field.getAnnotation(annotation); - throw new ReviewedGATKException("ArgumentAnnotation is not present for the argument field: " + source.field.getName()); - } - - /** - * Returns true if an argument annotation is present - * @param field The field to check for an annotation. - * @return True if an argument annotation is present on the field. - */ - @SuppressWarnings("unchecked") - public static boolean isArgumentAnnotationPresent(Field field) { - for (Class annotation: ARGUMENT_ANNOTATIONS) - if (field.isAnnotationPresent(annotation)) - return true; - return false; - } - - /** - * Returns true if the given annotation is hidden from the help system. - * @param field Field to test. - * @return True if argument should be hidden. False otherwise. - */ - public static boolean isArgumentHidden(Field field) { - return field.isAnnotationPresent(Hidden.class); - } - - public static Class makeRawTypeIfNecessary(Type t) { - if ( t == null ) - return null; - else if ( t instanceof ParameterizedType ) - return (Class)((ParameterizedType) t).getRawType(); - else if ( t instanceof Class ) { - return (Class)t; - } else { - throw new IllegalArgumentException("Unable to determine Class-derived component type of field: " + t); - } - } - - /** - * The actual argument parsing method. - * @param source source - * @param type type to check - * @param matches matches - * @param tags argument tags - * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding. 
- */ - protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) { - ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); - ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); - @SuppressWarnings("unchecked") - Class parameterType = JVMUtils.getParameterizedTypeClass(type); - String name = defaultDefinition.fullName; - - return parseBinding(value, parameterType, type, name, tags, source.field.getName()); - } - - /** - * - * @param value The source of the binding - * @param parameterType The Tribble Feature parameter type - * @param bindingClass The class type for the binding (ex: RodBinding, IntervalBinding, etc.) Must have the correct constructor for creating the binding. - * @param bindingName The name of the binding passed to the constructor. - * @param tags Tags for the binding used for parsing and passed to the constructor. - * @param fieldName The name of the field that was parsed. Used for error reporting. - * @return The newly created binding object of type bindingClass. - */ - public static Object parseBinding(ArgumentMatchValue value, Class parameterType, Type bindingClass, - String bindingName, Tags tags, String fieldName) { - try { - String tribbleType = null; - // must have one or two tag values here - if ( tags.getPositionalTags().size() > 2 ) { - throw new UserException.CommandLineException( - String.format("Unexpected number of positional tags for argument %s : %s. 
" + - "Rod bindings only support -X:type and -X:name,type argument styles", - value.asString(), fieldName)); - } else if ( tags.getPositionalTags().size() == 2 ) { - // -X:name,type style - bindingName = tags.getPositionalTags().get(0); - tribbleType = tags.getPositionalTags().get(1); - - FeatureManager manager = new FeatureManager(); - if ( manager.getByName(tribbleType) == null ) - throw new UserException.UnknownTribbleType( - tribbleType, - String.format("Unable to find tribble type '%s' provided on the command line. " + - "Please select a correct type from among the supported types:%n%s", - tribbleType, manager.userFriendlyListOfAvailableFeatures(parameterType))); - - } else { - // case with 0 or 1 positional tags - FeatureManager manager = new FeatureManager(); - - // -X:type style is a type when we cannot determine the type dynamically - String tag1 = tags.getPositionalTags().size() == 1 ? tags.getPositionalTags().get(0) : null; - if ( tag1 != null ) { - if ( manager.getByName(tag1) != null ) // this a type - tribbleType = tag1; - else - bindingName = tag1; - } - - if ( tribbleType == null ) { - // try to determine the file type dynamically - File file = value.asFile(); - if ( file.canRead() && file.isFile() ) { - FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); - if ( featureDescriptor != null ) { - tribbleType = featureDescriptor.getName(); - logger.debug("Dynamically determined type of " + file + " to be " + tribbleType); - } - } - - if ( tribbleType == null ) { - // IntervalBinding can be created from a normal String - Class rawType = (makeRawTypeIfNecessary(bindingClass)); - try { - return rawType.getConstructor(String.class).newInstance(value.asString()); - } catch (NoSuchMethodException e) { - /* ignore */ - } - - if ( ! file.exists() ) { - throw new UserException.CouldNotReadInputFile(file, "file does not exist"); - } else if ( ! file.canRead() || ! 
file.isFile() ) { - throw new UserException.CouldNotReadInputFile(file, "file could not be read"); - } else { - throw new UserException.CommandLineException( - String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " + - "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", - manager.userFriendlyListOfAvailableFeatures(parameterType))); - } - } - } - } - - Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); - return ctor.newInstance(parameterType, bindingName, value.asString(), tribbleType, tags); - } catch (Exception e) { - if ( e instanceof UserException ) - throw ((UserException)e); - else - throw new UserException.CommandLineException( - String.format("Failed to parse value %s for argument %s. Message: %s", - value, fieldName, e.getMessage())); - } - } - - /** - * Parse the source of a RodBindingCollection, which can be either a file of RodBindings or an actual RodBinding. 
- * - * @param parsingEngine the parsing engine used to validate this argument type descriptor - * @param source source - * @param type type - * @param matches matches - * @param tags argument tags - * @return the newly created binding object - */ - public Object parseRodBindingCollectionSource(final ParsingEngine parsingEngine, - final ArgumentSource source, - final Type type, - final ArgumentMatches matches, - final Tags tags) { - - final ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); - final ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); - @SuppressWarnings("unchecked") - Class parameterType = JVMUtils.getParameterizedTypeClass(type); - String name = defaultDefinition.fullName; - - // if this a list of files, get those bindings - final File file = value.asFile(); - try { - if (file.getAbsolutePath().endsWith(".list")) { - return getRodBindingsCollection(file, parsingEngine, parameterType, name, tags, source.field.getName()); - } - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(file, e); - } - - // otherwise, treat this as an individual binding - final RodBinding binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, name, tags, source.field.getName()); - parsingEngine.addTags(binding, tags); - parsingEngine.addRodBinding(binding); - return RodBindingCollection.createRodBindingCollectionOfType(parameterType, Arrays.asList(binding)); - } - - /** - * Retrieve and parse a collection of RodBindings from the given file. - * - * If the file contains duplicate entries or is empty, an exception will be thrown. - * - * @param file the source file - * @param parsingEngine the engine responsible for parsing - * @param parameterType the Tribble Feature parameter type - * @param bindingName the name of the binding passed to the constructor. - * @param defaultTags general tags for the binding used for parsing and passed to the constructor. 
- * @param fieldName the name of the field that was parsed. Used for error reporting. - * @return the newly created collection of binding objects. - */ - public static Object getRodBindingsCollection(final File file, - final ParsingEngine parsingEngine, - final Class parameterType, - final String bindingName, - final Tags defaultTags, - final String fieldName) throws IOException { - final List bindings = new ArrayList<>(); - - // Keep track of the files in this list so that we can check for duplicates and empty files - final Set fileValues = new HashSet<>(); - - // parse each line separately using the given Tags if none are provided on each line - for ( final String line: new XReadLines(file) ) { - final String[] tokens = line.split("\\s+"); - final RodBinding binding; - - if ( tokens.length == 0 ) { - continue; // empty line, so do nothing - } - // use the default tags if none are provided for this binding - else if ( tokens.length == 1 ) { - final ArgumentMatchValue value = parseAndValidateArgumentMatchValue(tokens[0], fileValues, fieldName, file.getName()); - binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, defaultTags, fieldName); - parsingEngine.addTags(binding, defaultTags); - - } - // use the new tags if provided - else if ( tokens.length == 2 ) { - final Tags tags = ParsingMethod.parseTags(fieldName, tokens[0]); - final ArgumentMatchValue value = parseAndValidateArgumentMatchValue(tokens[1], fileValues, fieldName, file.getName()); - binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, tags, fieldName); - parsingEngine.addTags(binding, tags); - } else { - throw new UserException.BadArgumentValue(fieldName, "data lines should consist of an optional set of tags along with a path to a file; too many tokens are present for line: " + line); - } - - bindings.add(binding); - parsingEngine.addRodBinding(binding); - } - - if (fileValues.isEmpty()) { - throw new 
UserException.BadArgumentValue(fieldName, "The input list " + file.getName() + " is empty."); - } - - return RodBindingCollection.createRodBindingCollectionOfType(parameterType, bindings); - } - - /** - * Validates the resource file name and constructs an ArgumentMatchValue from it. - * - * If the list name has already been processed in the current list, throws a UserException, otherwise - * creates an ArgumentMatchValue to represent the list. - * - * @param token Name of the ROD resource file. - * @param fileValues Set of names of ROD files that have already been processed. - * @param fieldName Name of the argument field being populated. - * @param listFileName Name of the list file being processed. - * @return - */ - private static ArgumentMatchValue parseAndValidateArgumentMatchValue(final String token, final Set fileValues, final String fieldName, - final String listFileName) { - checkForDuplicateFileName(token, fileValues, fieldName, listFileName); - return new ArgumentMatchStringValue(token); - } - - /** - * Checks to make sure that the current file name to be processed has not already been processed. - * - * Checks the name of the current file against the names that have already been processed, throwing - * an informative BadArgumentValue exception if it has already been seen. As a side effect adds the - * current file name to the set of filenames that have already been processed. 
- * - * @param currentFile Name of the current file to process - * @param processedFiles Set of file names that have already been processed - * @param fieldName Name of the argument that is being populated - * @param listName Filename of the list that is being processed - */ - protected static void checkForDuplicateFileName(final String currentFile, final Set processedFiles, - final String fieldName, final String listName) { - if (processedFiles.contains(currentFile)) { - throw new UserException.BadArgumentValue(fieldName, "The input list " + listName + " contains file " + currentFile + - " multiple times, which isn't allowed. If you are intentionally trying to " + - "include the same file more than once, you will need to specify it in separate file lists."); - } - processedFiles.add(currentFile); - } -} - -/** - * Parser for RodBinding objects - */ -class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { - /** - * We only want RodBinding class objects - * @param type The type to check. - * @return true if the provided class is a RodBinding.class - */ - @Override - public boolean supports( Class type ) { - return isRodBinding(type); - } - - public static boolean isRodBinding( Class type ) { - return RodBinding.class.isAssignableFrom(type); - } - - @Override - public boolean createsTypeDefault(ArgumentSource source) { return ! 
source.isRequired(); } - - @Override - @SuppressWarnings("unchecked") - public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { - Class parameterType = JVMUtils.getParameterizedTypeClass(type); - return RodBinding.makeUnbound((Class)parameterType); - } - - @Override - public String typeDefaultDocString(ArgumentSource source) { - return "none"; - } - - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - Tags tags = getArgumentTags(matches); - RodBinding rbind = (RodBinding)parseBinding(source, type, matches, tags); - parsingEngine.addTags(rbind, tags); - parsingEngine.addRodBinding(rbind); - return rbind; - } -} - -/** - * Parser for IntervalBinding objects - */ -class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { - /** - * We only want IntervalBinding class objects - * @param type The type to check. - * @return true if the provided class is an IntervalBinding.class - */ - @Override - public boolean supports( Class type ) { - return isIntervalBinding(type); - } - - public static boolean isIntervalBinding( Class type ) { - return IntervalBinding.class.isAssignableFrom(type); - } - - /** - * See note from RodBindingArgumentTypeDescriptor.parse(). - * - * @param parsingEngine parsing engine - * @param source source - * @param type type to check - * @param matches matches - * @return the IntervalBinding object. - */ - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - return parseBinding(source, type, matches, getArgumentTags(matches)); - } -} - -/** - * Parser for RodBindingCollection objects - */ -class RodBindingCollectionArgumentTypeDescriptor extends ArgumentTypeDescriptor { - /** - * We only want RodBindingCollection class objects - * @param type The type to check. 
- * @return true if the provided class is an RodBindingCollection.class - */ - @Override - public boolean supports( final Class type ) { - return isRodBindingCollection(type); - } - - public static boolean isRodBindingCollection( final Class type ) { - return RodBindingCollection.class.isAssignableFrom(type); - } - - /** - * See note from RodBindingArgumentTypeDescriptor.parse(). - * - * @param parsingEngine parsing engine - * @param source source - * @param type type to check - * @param matches matches - * @return the IntervalBinding object. - */ - @Override - public Object parse(final ParsingEngine parsingEngine, final ArgumentSource source, final Type type, final ArgumentMatches matches) { - final Tags tags = getArgumentTags(matches); - return parseRodBindingCollectionSource(parsingEngine, source, type, matches, tags); - } -} - -/** - * Parse simple argument types: java primitives, wrapper classes, and anything that has - * a simple String constructor. - */ -class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { - - /** - * @param type the class type - * @return true if this class is a binding type, false otherwise - */ - private boolean isBinding(final Class type) { - return RodBindingArgumentTypeDescriptor.isRodBinding(type) || - IntervalBindingArgumentTypeDescriptor.isIntervalBinding(type) || - RodBindingCollectionArgumentTypeDescriptor.isRodBindingCollection(type); - } - - - @Override - public boolean supports( Class type ) { - if ( isBinding(type) ) return false; - if ( type.isPrimitive() ) return true; - if ( type.isEnum() ) return true; - if ( primitiveToWrapperMap.containsValue(type) ) return true; - - try { - type.getConstructor(String.class); - return true; - } - catch( Exception ex ) { - // An exception thrown above means that the String constructor either doesn't - // exist or can't be accessed. In either case, this descriptor doesn't support this type. 
- return false; - } - } - - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type fulltype, ArgumentMatches matches) { - Class type = makeRawTypeIfNecessary(fulltype); - if (source.isFlag()) - return true; - - ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); - ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); - Object result; - Tags tags = getArgumentTags(matches); - - // lets go through the types we support - try { - if (type.isPrimitive()) { - Method valueOf = primitiveToWrapperMap.get(type).getMethod("valueOf",String.class); - if(value == null) - throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); - result = valueOf.invoke(null,value.asString().trim()); - } else if (type.isEnum()) { - Object[] vals = type.getEnumConstants(); - Object defaultEnumeration = null; // as we look at options, record the default option if it exists - for (Object val : vals) { - if (String.valueOf(val).equalsIgnoreCase(value == null ? null : value.asString())) return val; - try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; } - catch (NoSuchFieldException e) { throw new ReviewedGATKException("parsing " + type.toString() + "doesn't contain the field " + val.toString()); } - } - // if their argument has no value (null), and there's a default, return that default for the enum value - if (defaultEnumeration != null && value == null) - result = defaultEnumeration; - // if their argument has no value and there's no default, throw a missing argument value exception. - // TODO: Clean this up so that null values never make it to this point. To fix this, we'll have to clean up the implementation of -U. 
- else if (value == null) - throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); - else - throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString()); - } else if (type.equals(File.class)) { - result = value == null ? null : value.asFile(); - } else { - if (value == null) - throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); - Constructor ctor = type.getConstructor(String.class); - result = ctor.newInstance(value.asString()); - } - } catch (UserException e) { - throw e; - } catch (InvocationTargetException e) { - throw new UserException.CommandLineException(String.format("Failed to parse value %s for argument %s. This is most commonly caused by providing an incorrect data type (e.g. a double when an int is required)", - value, source.field.getName())); - } catch (Exception e) { - throw new DynamicClassResolutionException(String.class, e); - } - - // TODO FIXME! - - // WARNING: Side effect! - parsingEngine.addTags(result,tags); - - return result; - } - - - /** - * A mapping of the primitive types to their associated wrapper classes. Is there really no way to infer - * this association available in the JRE? - */ - private static Map primitiveToWrapperMap = new HashMap() { - { - put( Boolean.TYPE, Boolean.class ); - put( Character.TYPE, Character.class ); - put( Byte.TYPE, Byte.class ); - put( Short.TYPE, Short.class ); - put( Integer.TYPE, Integer.class ); - put( Long.TYPE, Long.class ); - put( Float.TYPE, Float.class ); - put( Double.TYPE, Double.class ); - } - }; -} - -/** - * Process compound argument types: arrays, and typed and untyped collections. 
- */ -class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { - @Override - public boolean supports( Class type ) { - return ( Collection.class.isAssignableFrom(type) || type.isArray() ); - } - - @Override - @SuppressWarnings("unchecked") - public Object parse(ParsingEngine parsingEngine,ArgumentSource source, Type fulltype, ArgumentMatches matches) { - Class type = makeRawTypeIfNecessary(fulltype); - Type componentType; - Object result; - - if( Collection.class.isAssignableFrom(type) ) { - - // If this is a generic interface, pick a concrete implementation to create and pass back. - // Because of type erasure, don't worry about creating one of exactly the correct type. - if( Modifier.isInterface(type.getModifiers()) || Modifier.isAbstract(type.getModifiers()) ) - { - if( java.util.List.class.isAssignableFrom(type) ) type = ArrayList.class; - else if( java.util.Queue.class.isAssignableFrom(type) ) type = java.util.ArrayDeque.class; - else if( java.util.Set.class.isAssignableFrom(type) ) type = java.util.TreeSet.class; - } - - componentType = getCollectionComponentType( source.field ); - ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); - - Collection collection; - try { - collection = (Collection)type.newInstance(); - } - catch (InstantiationException e) { - logger.fatal("ArgumentParser: InstantiationException: cannot convert field " + source.field.getName()); - throw new ReviewedGATKException("constructFromString:InstantiationException: Failed conversion " + e.getMessage()); - } - catch (IllegalAccessException e) { - logger.fatal("ArgumentParser: IllegalAccessException: cannot convert field " + source.field.getName()); - throw new ReviewedGATKException("constructFromString:IllegalAccessException: Failed conversion " + e.getMessage()); - } - - for( ArgumentMatch match: matches ) { - for( ArgumentMatch value: match ) { - Object object = 
componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value)); - collection.add( object ); - // WARNING: Side effect! - parsingEngine.addTags(object,value.tags); - } - } - - result = collection; - - } - else if( type.isArray() ) { - componentType = type.getComponentType(); - ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); - - // Assemble a collection of individual values used in this computation. - Collection values = new ArrayList(); - for( ArgumentMatch match: matches ) - for( ArgumentMatch value: match ) - values.add(value); - - result = Array.newInstance(makeRawTypeIfNecessary(componentType),values.size()); - - int i = 0; - for( ArgumentMatch value: values ) { - Object object = componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value)); - Array.set(result,i++,object); - // WARNING: Side effect! - parsingEngine.addTags(object,value.tags); - } - } - else - throw new ReviewedGATKException("Unsupported compound argument type: " + type); - - return result; - } - - /** - * Return the component type of a field, or String.class if the type cannot be found. - * @param field The reflected field to inspect. - * @return The parameterized component type, or String.class if the parameterized type could not be found. - * @throws IllegalArgumentException If more than one parameterized type is found on the field. - */ - @Override - protected Type getCollectionComponentType( Field field ) { - // If this is a parameterized collection, find the contained type. If blow up if more than one type exists. 
- if( field.getGenericType() instanceof ParameterizedType) { - ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); - if( parameterizedType.getActualTypeArguments().length > 1 ) - throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); - return parameterizedType.getActualTypeArguments()[0]; - } - else - return String.class; - } -} - -class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { - /** - * The multiplexer controlling how data is split. - */ - private final Multiplexer multiplexer; - - /** - * The set of identifiers for the multiplexed entries. - */ - private final Collection multiplexedIds; - - public MultiplexArgumentTypeDescriptor() { - this.multiplexer = null; - this.multiplexedIds = null; - } - - /** - * Private constructor to use in creating a closure of the MultiplexArgumentTypeDescriptor specific to the - * given set of multiplexed ids. - * @param multiplexedIds The collection of multiplexed entries - */ - private MultiplexArgumentTypeDescriptor(final Multiplexer multiplexer, final Collection multiplexedIds) { - this.multiplexer = multiplexer; - this.multiplexedIds = multiplexedIds; - } - - @Override - public boolean supports( Class type ) { - return ( Map.class.isAssignableFrom(type) ); - } - - @Override - public boolean createsTypeDefault(ArgumentSource source) { - // Multiplexing always creates a type default. 
- return true; - } - - @Override - public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { - if(multiplexer == null || multiplexedIds == null) - throw new ReviewedGATKException("No multiplexed ids available"); - - Map multiplexedMapping = new HashMap(); - Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); - ArgumentTypeDescriptor componentTypeDescriptor = parsingEngine.selectBestTypeDescriptor(componentType); - - for(Object id: multiplexedIds) { - Object value = null; - if(componentTypeDescriptor.createsTypeDefault(source)) - value = componentTypeDescriptor.createTypeDefault(parsingEngine,source,componentType); - multiplexedMapping.put(id,value); - } - return multiplexedMapping; - } - - @Override - public String typeDefaultDocString(ArgumentSource source) { - return "None"; - } - - @Override - public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { - if(multiplexedIds == null) - throw new ReviewedGATKException("Cannot directly parse a MultiplexArgumentTypeDescriptor; must create a derivative type descriptor first."); - - Map multiplexedMapping = new HashMap(); - - Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); - - - for(Object id: multiplexedIds) { - Object value = parsingEngine.selectBestTypeDescriptor(componentType).parse(parsingEngine,source,componentType,matches.transform(multiplexer,id)); - multiplexedMapping.put(id,value); - } - - parsingEngine.addTags(multiplexedMapping,getArgumentTags(matches)); - - return multiplexedMapping; - } - - public MultiplexArgumentTypeDescriptor createCustomTypeDescriptor(ParsingEngine parsingEngine,ArgumentSource dependentArgument,Object containingObject) { - String[] sourceFields = dependentArgument.field.getAnnotation(Multiplex.class).arguments(); - - List allSources = parsingEngine.extractArgumentSources(containingObject.getClass()); - Class[] sourceTypes = 
new Class[sourceFields.length]; - Object[] sourceValues = new Object[sourceFields.length]; - int currentField = 0; - - for(String sourceField: sourceFields) { - boolean fieldFound = false; - for(ArgumentSource source: allSources) { - if(!source.field.getName().equals(sourceField)) - continue; - if(source.field.isAnnotationPresent(Multiplex.class)) - throw new ReviewedGATKException("Command-line arguments can only depend on independent fields"); - sourceTypes[currentField] = source.field.getType(); - sourceValues[currentField] = JVMUtils.getFieldValue(source.field,containingObject); - currentField++; - fieldFound = true; - } - if(!fieldFound) - throw new ReviewedGATKException(String.format("Unable to find source field %s, referred to by dependent field %s",sourceField,dependentArgument.field.getName())); - } - - Class multiplexerType = dependentArgument.field.getAnnotation(Multiplex.class).value(); - Constructor multiplexerConstructor; - try { - multiplexerConstructor = multiplexerType.getConstructor(sourceTypes); - multiplexerConstructor.setAccessible(true); - } - catch(NoSuchMethodException ex) { - throw new ReviewedGATKException(String.format("Unable to find constructor for class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - - Multiplexer multiplexer; - try { - multiplexer = multiplexerConstructor.newInstance(sourceValues); - } - catch(IllegalAccessException ex) { - throw new ReviewedGATKException(String.format("Constructor for class %s with parameters %s is inaccessible",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - catch(InstantiationException ex) { - throw new ReviewedGATKException(String.format("Can't create class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - catch(InvocationTargetException ex) { - throw new ReviewedGATKException(String.format("Can't invoke constructor of class %s with parameters 
%s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); - } - - return new MultiplexArgumentTypeDescriptor(multiplexer,multiplexer.multiplex()); - } - - /** - * Return the component type of a field, or String.class if the type cannot be found. - * @param field The reflected field to inspect. - * @return The parameterized component type, or String.class if the parameterized type could not be found. - * @throws IllegalArgumentException If more than one parameterized type is found on the field. - */ - @Override - protected Type getCollectionComponentType( Field field ) { - // Multiplex arguments must resolve to maps from which the clp should extract the second type. - if( field.getGenericType() instanceof ParameterizedType) { - ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); - if( parameterizedType.getActualTypeArguments().length != 2 ) - throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); - return (Class)parameterizedType.getActualTypeArguments()[1]; - } - else - return String.class; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java deleted file mode 100644 index 80ebe2c23..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java +++ /dev/null @@ -1,447 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject 
to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.commandline; - -import org.apache.log4j.FileAppender; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.PatternLayout; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.help.ApplicationDetails; -import org.broadinstitute.gatk.utils.help.HelpConstants; -import org.broadinstitute.gatk.utils.help.HelpFormatter; - -import java.io.IOException; -import java.util.*; - -public abstract class CommandLineProgram { - - /** The command-line program and the arguments it returned. */ - public ParsingEngine parser = null; - - /** - * Setting INFO gets you INFO up to FATAL, setting ERROR gets you ERROR and FATAL level logging, and so on. - */ - @Argument(fullName = "logging_level", shortName = "l", doc = "Set the minimum level of logging", required = false) - protected String logging_level = "INFO"; - - /** - * File to save the logging output. 
- */ - @Output(fullName = "log_to_file", shortName = "log", doc = "Set the logging location", required = false) - protected String toFile = null; - - /** - * This will produce a help message in the terminal with general usage information, listing available arguments - * as well as tool-specific information if applicable. - */ - @Argument(fullName = "help", shortName = "h", doc = "Generate the help message", required = false) - public Boolean help = false; - - /** - * Use this to check the version number of the GATK executable you are invoking. Note that the version number is - * always included in the output at the start of every run as well as any error message. - */ - @Argument(fullName = "version", shortName = "version", doc ="Output version information", required = false) - public Boolean version = false; - - - /** our logging output patterns */ - private static final String patternString = "%-5p %d{HH:mm:ss,SSS} %C{1} - %m %n"; - - static { - /** - * The very first thing that any GATK application does is forces the JVM locale into US English, so that we don't have - * to think about number formatting issues. - */ - forceJVMLocaleToUSEnglish(); - // setup a basic log configuration - CommandLineUtils.configureConsoleLogging(); - } - - - /** - * Allows a given application to return a brief description of itself. - * - * @return An ApplicationDetails object describing the current application. Should not be null. - */ - protected ApplicationDetails getApplicationDetails() { - return new ApplicationDetails(ApplicationDetails.createDefaultHeader(getClass()), - Collections.emptyList(), - ApplicationDetails.createDefaultRunningInstructions(getClass()), - null); - } - - /** - * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. - * @return A collection of type descriptors generating implementation-dependent placeholders. 
- */ - protected Collection getArgumentTypeDescriptors() { - return Collections.emptyList(); - } - - /** - * Will this application want to vary its argument list dynamically? - * If so, parse the command-line options and then prompt the subclass to return - * a list of argument providers. - * - * @return Whether the application should vary command-line arguments dynamically. - */ - protected boolean canAddArgumentsDynamically() { return false; } - - /** - * Provide a list of object to inspect, looking for additional command-line arguments. - * - * @return A list of objects to inspect. - */ - protected Class[] getArgumentSources() { - return new Class[]{}; - } - - /** - * Name this argument source. Provides the (full) class name as a default. - * - * @param source The argument source. - * - * @return a name for the argument source. - */ - protected String getArgumentSourceName( Class source ) { return source.toString(); } - - /** - * Sets the command-line parsing engine. Necessary for unit testing purposes. - * @param parser the new command-line parsing engine - */ - public void setParser( ParsingEngine parser ) { - this.parser = parser; - } - - /** - * this is the function that the inheriting class can expect to have called - * when all the argument processing is done - * - * @return the return code to exit the program with - * @throws Exception when an exception occurs - */ - protected abstract int execute() throws Exception; - - public static int result = -1; - - @SuppressWarnings("unchecked") - public static void start(CommandLineProgram clp, String[] args) throws Exception { - start(clp, args, false); - } - - /** - * This function is called to start processing the command line, and kick - * off the execute message of the program. 
- * - * @param clp the command line program to execute - * @param args the command line arguments passed in - * @param dryRun dry run - * @throws Exception when an exception occurs - */ - @SuppressWarnings("unchecked") - public static void start(CommandLineProgram clp, String[] args, boolean dryRun) throws Exception { - - try { - // setup our log layout - PatternLayout layout = new PatternLayout(); - - Logger logger = CommandLineUtils.getStingLogger(); - - // now set the layout of all the loggers to our layout - CommandLineUtils.setLayout(logger, layout); - - // Initialize the logger using the defaults. - clp.setupLoggerLevel(layout); - - // setup the parser - ParsingEngine parser = clp.parser = new ParsingEngine(clp); - parser.addArgumentSource(clp.getClass()); - - Map parsedArgs; - - // process the args - if (clp.canAddArgumentsDynamically()) { - // if the command-line program can toss in extra args, fetch them and reparse the arguments. - parser.parse(args); - - // Allow invalid and missing required arguments to pass this validation step. - // - InvalidArgument in case these arguments are specified by plugins. - // - MissingRequiredArgument in case the user requested help. Handle that later, once we've - // determined the full complement of arguments. - if ( ! dryRun ) - parser.validate(EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument, - ParsingEngine.ValidationType.InvalidArgument)); - parser.loadArgumentsIntoObject(clp); - - // Initialize the logger using the loaded command line. - clp.setupLoggerLevel(layout); - - Class[] argumentSources = clp.getArgumentSources(); - for (Class argumentSource : argumentSources) - parser.addArgumentSource(clp.getArgumentSourceName(argumentSource), argumentSource); - parsedArgs = parser.parse(args); - - if (isVersionPresent(parser)) - printVersionAndExit(); - - if (isHelpPresent(parser)) - printHelpAndExit(clp, parser); - - if ( ! 
dryRun ) parser.validate(); - } else { - parsedArgs = parser.parse(args); - - if ( ! dryRun ) { - if (isHelpPresent(parser)) - printHelpAndExit(clp, parser); - - parser.validate(); - } - parser.loadArgumentsIntoObject(clp); - - // Initialize the logger using the loaded command line. - clp.setupLoggerLevel(layout); - } - - if ( ! dryRun ) { - // if they specify a log location, output our data there - if (clp.toFile != null) { - FileAppender appender; - try { - appender = new FileAppender(layout, clp.toFile, false); - logger.addAppender(appender); - } catch (IOException e) { - throw new RuntimeException("Unable to re-route log output to " + clp.toFile + " make sure the destination exists"); - } - } - - // regardless of what happens next, generate the header information - HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), parsedArgs); - - // call the execute - CommandLineProgram.result = clp.execute(); - } - } - catch (ArgumentException e) { - //clp.parser.printHelp(clp.getApplicationDetails()); - // Rethrow the exception to exit with an error. - throw e; - } - } - - /** - * Find fields in the object obj that look like command-line arguments, and put command-line - * arguments into them. - * - * @param obj Object to inspect for command line arguments. - */ - public void loadArgumentsIntoObject(Object obj) { - parser.loadArgumentsIntoObject(obj); - } - - /** - * this function checks the logger level passed in on the command line, taking the lowest - * level that was provided. - * @param layout Pattern layout to format based on the logger level. 
- */ - private void setupLoggerLevel(PatternLayout layout) { - layout.setConversionPattern(patternString); - - // set the default logger level - Level par; - if (logging_level.toUpperCase().equals("DEBUG")) { - par = Level.DEBUG; - } else if (logging_level.toUpperCase().equals("INFO")) { - par = Level.INFO; - } else if (logging_level.toUpperCase().equals("WARN")) { - par = Level.WARN; - } else if (logging_level.toUpperCase().equals("ERROR")) { - par = Level.ERROR; - } else if (logging_level.toUpperCase().equals("FATAL")) { - par = Level.FATAL; - } else if (logging_level.toUpperCase().equals("OFF")) { - par = Level.OFF; - } else { - // we don't understand the logging level, let's get out of here - throw new ArgumentException("Unable to match: " + logging_level + " to a logging level, make sure it's a valid level (DEBUG, INFO, WARN, ERROR, FATAL, OFF)"); - } - - Logger.getRootLogger().setLevel(par); - } - - /** - * a function used to indicate an error occurred in the command line tool - */ - private static void printDocumentationReference() { - errorPrintf("Visit our website and forum for extensive documentation and answers to %n"); - errorPrintf("commonly asked questions " + HelpConstants.BASE_GATK_URL + "%n"); - } - - - /** - * Do a cursory search for the given argument. - * - * @param parser Parser - * - * @return True if help is present; false otherwise. - */ - private static boolean isHelpPresent(ParsingEngine parser) { - return parser.isArgumentPresent("help"); - } - - /** - * Print help and exit. - * - * @param clp Instance of the command-line program. - * @param parser True if help is present; false otherwise. - */ - private static void printHelpAndExit(CommandLineProgram clp, ParsingEngine parser) { - parser.printHelp(clp.getApplicationDetails()); - System.exit(0); - } - - /** - * Do a cursory search for the argument "version". - * - * @param parser Parser - * - * @return True if version is present; false otherwise. 
- */ - private static boolean isVersionPresent(ParsingEngine parser) { - return parser.isArgumentPresent("version"); - } - - /** - * Print help and exit. - */ - private static void printVersionAndExit() { - System.out.println(CommandLineGATK.getVersionNumber().toString()); - System.exit(0); - } - - - private static void errorPrintf(String format, Object... s) { - String formatted = String.format(format, s); - - if ( formatted.trim().equals("") ) - System.err.println("##### ERROR"); - else { - for ( String part : formatted.split("\n") ) { - System.err.println("##### ERROR " + part); - } - } - } - - - /** - * used to indicate an error occured - * - * @param msg the message - * @param t the error - */ - public static void exitSystemWithError(String msg, final Throwable t) { - errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("stack trace %n"); - t.printStackTrace(); - - errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", CommandLineGATK.getVersionNumber()); - errorPrintf("%n"); - errorPrintf("This might be a bug. 
Please check the documentation guide to see if this is a known problem.%n"); - errorPrintf("If not, please post the error message, with stack trace, to the GATK forum.%n"); - printDocumentationReference(); - if ( msg == null ) // some exceptions don't have detailed messages - msg = "Code exception (see stack trace for error itself)"; - errorPrintf("%n"); - errorPrintf("MESSAGE: %s%n", msg.trim()); - errorPrintf("------------------------------------------------------------------------------------------%n"); - System.exit(1); - } - - public static void exitSystemWithUserError(final Exception e) { - if ( e.getMessage() == null ) - throw new ReviewedGATKException("UserException found with no message!", e); - - errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("A USER ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); - errorPrintf("%n"); - errorPrintf("This means that one or more arguments or inputs in your command are incorrect.%n"); - errorPrintf("The error message below tells you what is the problem.%n"); - errorPrintf("%n"); - errorPrintf("If the problem is an invalid argument, please check the online documentation guide%n"); - errorPrintf("(or rerun your command with --help) to view allowable command-line arguments for this tool.%n"); - errorPrintf("%n"); - printDocumentationReference(); - errorPrintf("%n"); - errorPrintf("Please do NOT post this error to the GATK forum unless you have really tried to fix it yourself.%n"); - errorPrintf("%n"); - errorPrintf("MESSAGE: %s%n", e.getMessage().trim()); - errorPrintf("------------------------------------------------------------------------------------------%n"); - System.exit(1); - } - - public static void exitSystemWithSamError(final Throwable t) { - if ( t.getMessage() == null ) - throw new ReviewedGATKException("SamException found with no message!", t); - - 
errorPrintf("------------------------------------------------------------------------------------------%n"); - errorPrintf("A BAM ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber()); - errorPrintf("%n"); - errorPrintf("This means that there is something wrong with the BAM file(s) you provided.%n"); - errorPrintf("The error message below tells you what is the problem.%n"); - errorPrintf("%n"); - printDocumentationReference(); - errorPrintf("%n"); - errorPrintf("Please do NOT post this error to the GATK forum until you have followed these instructions:%n"); - errorPrintf("- Make sure that your BAM file is well-formed by running Picard's validator on it%n"); - errorPrintf("(see http://picard.sourceforge.net/command-line-overview.shtml#ValidateSamFile for details)%n"); - errorPrintf("- Ensure that your BAM index is not corrupted: delete the current one and regenerate it with 'samtools index'%n"); - errorPrintf("%n"); - errorPrintf("MESSAGE: %s%n", t.getMessage().trim()); - errorPrintf("------------------------------------------------------------------------------------------%n"); - System.exit(1); - } - - - /** - * used to indicate an error occured - * - * @param t the exception that occurred - */ - public static void exitSystemWithError(Throwable t) { - exitSystemWithError(t.getMessage(), t); - } - - /** - * A hack to ensure that numbers are always formatted in the US style. 
- */ - protected static void forceJVMLocaleToUSEnglish() { - Locale.setDefault(Locale.US); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/IntervalBinding.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/IntervalBinding.java deleted file mode 100644 index 59048a93b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/IntervalBinding.java +++ /dev/null @@ -1,106 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.commandline; - -import com.google.java.contract.Requires; -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import htsjdk.tribble.FeatureReader; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.refdata.ReferenceDependentFeatureCodec; -import org.broadinstitute.gatk.engine.refdata.tracks.FeatureManager; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.interval.IntervalUtils; - -import java.util.*; - -/** - * An IntervalBinding representing a walker argument that gets bound to either a ROD track or interval string. - * - * The IntervalBinding is a formal GATK argument that bridges between a walker and - * the engine to construct intervals for traversal at runtime. The IntervalBinding can - * either be a RodBinding, a string of one interval, or a file with interval strings. - * The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it. - * - * Note that this class is immutable. - */ -public final class IntervalBinding { - - private RodBinding featureIntervals; - private String stringIntervals; - - @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) - public IntervalBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { - featureIntervals = new RodBinding<>(type, rawName, source, tribbleType, tags); - } - - @Requires({"intervalArgument != null"}) - public IntervalBinding(String intervalArgument) { - stringIntervals = intervalArgument; - } - - public String getSource() { - return ( featureIntervals != null ? 
featureIntervals.getSource() : stringIntervals ); - } - - public List getIntervals(final GenomeAnalysisEngine toolkit) { - return getIntervals(toolkit.getGenomeLocParser()); - } - - public List getIntervals(final GenomeLocParser genomeLocParser) { - List intervals; - - if ( featureIntervals != null ) { - intervals = new ArrayList<>(); - - // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files - - final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); - if ( codec instanceof ReferenceDependentFeatureCodec ) - ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(genomeLocParser); - try { - FeatureReader reader = AbstractFeatureReader.getFeatureReader(featureIntervals.getSource(), codec, false); - for ( Feature feature : reader.iterator() ) - intervals.add(genomeLocParser.createGenomeLoc(feature)); - } catch (Exception e) { - throw new UserException.MalformedFile(featureIntervals.getSource(), "Problem reading the interval file", e); - } - - } else { - intervals = IntervalUtils.parseIntervalArguments(genomeLocParser, stringIntervals); - } - - Collections.sort(intervals); - return intervals; - } - - public String toString() { - return getSource(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/crypt/CryptUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/crypt/CryptUtils.java deleted file mode 100644 index d6ccd3231..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/crypt/CryptUtils.java +++ /dev/null @@ -1,391 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, 
distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.crypt; - -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.io.IOUtils; - -import javax.crypto.Cipher; -import java.io.File; -import java.io.InputStream; -import java.security.*; -import java.security.spec.InvalidKeySpecException; -import java.security.spec.KeySpec; -import java.security.spec.PKCS8EncodedKeySpec; -import java.security.spec.X509EncodedKeySpec; -import java.util.Arrays; - -/** - * A set of cryptographic utility methods and constants. - * - * Contains methods to: - * - * -Create a public/private key pair - * -Read and write public/private keys to/from files/streams - * -Load the GATK master private/public keys - * -Encrypt/decrypt data - * - * Also contains constants that control the cryptographic defaults - * throughout the GATK. 
- * - * @author David Roazen - */ -public class CryptUtils { - - // --------------------------------------------------------------------------------- - // Constants (these control the default cryptographic settings throughout the GATK): - // --------------------------------------------------------------------------------- - - /** - * Default key length in bits of newly-created keys. 2048 bits provides a good balance between - * security and speed. - */ - public static final int DEFAULT_KEY_LENGTH = 2048; - - /** - * Default encryption algorithm to use, when none is specified. - */ - public static final String DEFAULT_ENCRYPTION_ALGORITHM = "RSA"; - - /** - * Default random-number generation algorithm to use, when none is specified. - */ - public static final String DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM = "SHA1PRNG"; - - /** - * Name of the public key file distributed with the GATK. This file is packaged - * into the GATK jar, and we use the system ClassLoader to find it. - */ - public static final String GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME = "GATK_public.key"; - - /** - * Location of the master copy of the GATK private key. - */ - public static final String GATK_MASTER_PRIVATE_KEY_FILE = "/humgen/gsa-hpprojects/GATK/data/gatk_master_keys/GATK_private.key"; - - /** - * Location of the master copy of the GATK public key. This file should always be the same as - * the public key file distributed with the GATK (and there are automated tests to ensure that it is). - */ - public static final String GATK_MASTER_PUBLIC_KEY_FILE = "/humgen/gsa-hpprojects/GATK/data/gatk_master_keys/GATK_public.key"; - - /** - * Directory where generated GATK user keys are stored. See the GATKKey class for more information. 
- */ - public static final String GATK_USER_KEY_DIRECTORY = "/humgen/gsa-hpprojects/GATK/data/gatk_user_keys/"; - - - // ----------------------- - // Utility Methods: - // ----------------------- - - /** - * Generate a new public/private key pair using the default encryption settings defined above. - * - * @return A new public/private key pair created using the default settings - */ - public static KeyPair generateKeyPair() { - return generateKeyPair(DEFAULT_KEY_LENGTH, DEFAULT_ENCRYPTION_ALGORITHM, DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); - } - - /** - * Generate a new public/private key pair using custom encryption settings. - * - * @param keyLength Length of the key in bits - * @param encryptionAlgorithm Encryption algorithm to use - * @param randNumberAlgorithm Random-number generation algorithm to use - * @return A new public/private key pair, created according to the specified parameters - */ - public static KeyPair generateKeyPair( int keyLength, String encryptionAlgorithm, String randNumberAlgorithm ) { - try { - KeyPairGenerator keyGen = KeyPairGenerator.getInstance(encryptionAlgorithm); - SecureRandom randomnessSource = createRandomnessSource(randNumberAlgorithm); - - keyGen.initialize(keyLength, randomnessSource); - return keyGen.generateKeyPair(); - } - catch ( NoSuchAlgorithmException e ) { - throw new ReviewedGATKException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); - } - catch ( Exception e ) { - throw new ReviewedGATKException("Error while generating key pair", e); - } - } - - /** - * Create a source of randomness using the default random-number generation algorithm. - * - * @return A randomness source that uses the default algorithm - */ - public static SecureRandom createRandomnessSource() { - return createRandomnessSource(DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); - } - - /** - * Create a source of randomness using a custom random-number generation algorithm. 
- * - * @param randAlgorithm The random-number generation algorithm to use - * @return A randomness sources that uses the specified algorithm - */ - public static SecureRandom createRandomnessSource ( String randAlgorithm ) { - try { - return SecureRandom.getInstance(randAlgorithm); - } - catch ( NoSuchAlgorithmException e ) { - throw new ReviewedGATKException(String.format("Could not find an implementation of the requested random-number generation algorithm %s", randAlgorithm), e); - } - } - - /** - * Writes a public/private key pair to disk - * - * @param keyPair The key pair we're writing to disk - * @param privateKeyFile Location to write the private key - * @param publicKeyFile Location to write the public key - */ - public static void writeKeyPair ( KeyPair keyPair, File privateKeyFile, File publicKeyFile ) { - writeKey(keyPair.getPrivate(), privateKeyFile); - writeKey(keyPair.getPublic(), publicKeyFile); - } - - /** - * Writes an arbitrary key to disk - * - * @param key The key to write - * @param destination Location to write the key to - */ - public static void writeKey ( Key key, File destination ) { - IOUtils.writeByteArrayToFile(key.getEncoded(), destination); - } - - /** - * Reads in a public key created using the default encryption algorithm from a file. - * - * @param source File containing the public key - * @return The public key read - */ - public static PublicKey readPublicKey ( File source ) { - return decodePublicKey(IOUtils.readFileIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); - } - - /** - * Reads in a public key created using the default encryption algorithm from a stream. - * - * @param source Stream attached to the public key - * @return The public key read - */ - public static PublicKey readPublicKey ( InputStream source ) { - return decodePublicKey(IOUtils.readStreamIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); - } - - /** - * Decodes the raw bytes of a public key into a usable object. 
- * - * @param rawKey The encoded bytes of a public key as read from, eg., a file. The - * key must be in the standard X.509 format for a public key. - * @param encryptionAlgorithm The encryption algorithm used to create the public key - * @return The public key as a usable object - */ - public static PublicKey decodePublicKey ( byte[] rawKey, String encryptionAlgorithm ) { - try { - KeySpec keySpec = new X509EncodedKeySpec(rawKey); - KeyFactory keyFactory = KeyFactory.getInstance(encryptionAlgorithm); - return keyFactory.generatePublic(keySpec); - } - catch ( NoSuchAlgorithmException e ) { - throw new ReviewedGATKException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); - } - catch ( InvalidKeySpecException e ) { - throw new ReviewedGATKException("Unable to use X.509 key specification to decode the given key", e); - } - } - - /** - * Reads in a private key created using the default encryption algorithm from a file. - * - * @param source File containing the private key - * @return The private key read - */ - public static PrivateKey readPrivateKey ( File source ) { - return decodePrivateKey(IOUtils.readFileIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); - } - - /** - * Reads in a private key created using the default encryption algorithm from a stream. - * - * @param source Stream attached to the private key - * @return The private key read - */ - public static PrivateKey readPrivateKey ( InputStream source ) { - return decodePrivateKey(IOUtils.readStreamIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); - } - - /** - * Decodes the raw bytes of a private key into a usable object. - * - * @param rawKey The encoded bytes of a private key as read from, eg., a file. The - * key must be in the standard PKCS #8 format for a private key. 
- * @param encryptionAlgorithm The encryption algorithm used to create the private key - * @return The private key as a usable object - */ - public static PrivateKey decodePrivateKey ( byte[] rawKey, String encryptionAlgorithm ) { - try { - KeySpec keySpec = new PKCS8EncodedKeySpec(rawKey); - KeyFactory keyFactory = KeyFactory.getInstance(encryptionAlgorithm); - return keyFactory.generatePrivate(keySpec); - } - catch ( NoSuchAlgorithmException e ) { - throw new ReviewedGATKException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); - } - catch ( InvalidKeySpecException e ) { - throw new ReviewedGATKException("Unable to use the PKCS #8 key specification to decode the given key", e); - } - } - - /** - * Loads the copy of the GATK public key that is distributed with the GATK. Uses the system - * ClassLoader to locate the public key file, which should be stored at the root of the GATK - * jar file. - * - * @return The GATK public key as a usable object - */ - public static PublicKey loadGATKDistributedPublicKey() { - InputStream publicKeyInputStream = ClassLoader.getSystemResourceAsStream(GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME); - - if ( publicKeyInputStream == null ) { - throw new ReviewedGATKException(String.format("Could not locate the GATK public key %s in the classpath", - GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME)); - } - - return readPublicKey(publicKeyInputStream); - } - - /** - * Loads the master copy of the GATK private key. You must have the appropriate UNIX permissions - * to do this! - * - * @return The GATK master private key as a usable object - */ - public static PrivateKey loadGATKMasterPrivateKey() { - return readPrivateKey(new File(GATK_MASTER_PRIVATE_KEY_FILE)); - } - - /** - * Loads the master copy of the GATK public key. This should always be the same as the - * public key distributed with the GATK returned by loadGATKDistributedPublicKey(). 
- * - * @return The GATK master public key as a usable object - */ - public static PublicKey loadGATKMasterPublicKey() { - return readPublicKey(new File(GATK_MASTER_PUBLIC_KEY_FILE)); - } - - /** - * Encrypts the given data using the key provided. - * - * @param data The data to encrypt, as a byte array - * @param encryptKey The key with which to encrypt the data - * @return The encrypted version of the provided data - */ - public static byte[] encryptData ( byte[] data, Key encryptKey ) { - return transformDataUsingCipher(data, encryptKey, Cipher.ENCRYPT_MODE); - } - - /** - * Decrypts the given data using the key provided. - * - * @param encryptedData Data to decrypt, as a byte array - * @param decryptKey The key with which to decrypt the data - * @return The decrypted version of the provided data - */ - public static byte[] decryptData ( byte[] encryptedData, Key decryptKey ) { - return transformDataUsingCipher(encryptedData, decryptKey, Cipher.DECRYPT_MODE); - } - - /** - * Helper method for encryption/decryption that takes data and processes it using - * the given key - * - * @param data Data to encrypt/decrypt - * @param key Key to use to encrypt/decrypt the data - * @param cipherMode Specifies whether we are encrypting or decrypting - * @return The encrypted/decrypted data - */ - private static byte[] transformDataUsingCipher ( byte[] data, Key key, int cipherMode ) { - try { - Cipher cipher = Cipher.getInstance(key.getAlgorithm()); - cipher.init(cipherMode, key); - return cipher.doFinal(data); - } - catch ( NoSuchAlgorithmException e ) { - throw new ReviewedGATKException(String.format("Could not find an implementation of the requested algorithm %s", - key.getAlgorithm()), e); - } - catch ( InvalidKeyException e ) { - throw new ReviewedGATKException("Key is invalid", e); - } - catch ( GeneralSecurityException e ) { - throw new ReviewedGATKException("Error during encryption", e); - } - } - - /** - * Tests whether the public/private keys provided can each 
decrypt data encrypted by - * the other key -- ie., tests whether these two keys are part of the same public/private - * key pair. - * - * @param privateKey The private key to test - * @param publicKey The public key to test - * @return True if the keys are part of the same key pair and can decrypt each other's - * encrypted data, otherwise false. - */ - public static boolean keysDecryptEachOther ( PrivateKey privateKey, PublicKey publicKey ) { - byte[] plainText = "Test PlainText".getBytes(); - - byte[] dataEncryptedUsingPrivateKey = CryptUtils.encryptData(plainText, privateKey); - byte[] dataEncryptedUsingPublicKey = CryptUtils.encryptData(plainText, publicKey); - - byte[] privateKeyDataDecryptedWithPublicKey = CryptUtils.decryptData(dataEncryptedUsingPrivateKey, publicKey); - byte[] publicKeyDataDecryptedWithPrivateKey = CryptUtils.decryptData(dataEncryptedUsingPublicKey, privateKey); - - // Make sure we actually transformed the data during encryption: - if ( Arrays.equals(plainText, dataEncryptedUsingPrivateKey) || - Arrays.equals(plainText, dataEncryptedUsingPublicKey) || - Arrays.equals(dataEncryptedUsingPrivateKey, dataEncryptedUsingPublicKey) ) { - return false; - } - - // Make sure that we were able to recreate the original plaintext using - // both the public key on the private-key-encrypted data and the private - // key on the public-key-encrypted data: - if ( ! Arrays.equals(plainText, privateKeyDataDecryptedWithPublicKey) || - ! 
Arrays.equals(plainText, publicKeyDataDecryptedWithPrivateKey) ) { - return false; - } - - return true; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/crypt/GATKKey.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/crypt/GATKKey.java deleted file mode 100644 index ab21a2a17..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/crypt/GATKKey.java +++ /dev/null @@ -1,350 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.crypt; - -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.io.IOUtils; - -import java.io.*; -import java.security.*; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - -/** - * Class to represent a GATK user key. - * - * A GATK user key contains an email address and a cryptographic signature. - * The signature is the SHA-1 hash of the email address encrypted using - * the GATK master private key. The GATK master public key (distributed - * with the GATK) is used to decrypt the signature and validate the key - * at the start of each GATK run that requires a key. - * - * Keys are cryptographically secure in that valid keys definitely come - * from us and cannot be fabricated, however nothing prevents keys from - * being shared between users. - * - * GATK user keys have the following on-disk format: - * - * GZIP Container: - * Email address - * NUL byte (delimiter) - * Cryptographic Signature (encrypted SHA-1 hash of email address) - * - * The key data is wrapped within a GZIP container to placate over-zealous - * email filters (since keys must often be emailed) and also to provide an - * additional integrity check via the built-in GZIP CRC. - * - * @author David Roazen - */ -public class GATKKey { - - /** - * Private key used to sign the GATK key. Required only when creating a new - * key from scratch, not when loading an existing key from disk. - */ - private PrivateKey privateKey; - - /** - * Public key used to validate the GATK key. - */ - private PublicKey publicKey; - - /** - * The user's email address, stored within the key and signed. - */ - private String emailAddress; - - /** - * The cryptographic signature of the email address. By default, this is - * the SHA-1 hash of the email address encrypted using the RSA algorithm. 
- */ - private byte[] signature; - - /** - * The combination of hash/encryption algorithms to use to generate the signature. - * By default this is "SHA1withRSA" - */ - private String signingAlgorithm; - - /** - * Default hash/encryption algorithms to use to sign the key. - */ - public static final String DEFAULT_SIGNING_ALGORITHM = "SHA1withRSA"; - - /** - * Byte value used to separate the email address from its signature in the key file. - */ - public static final byte GATK_KEY_SECTIONAL_DELIMITER = 0; - - - // ----------------------- - // Constructors: - // ----------------------- - - /** - * Constructor to create a new GATK key from scratch using an email address - * and public/private key pair. The private key is used for signing, and the - * public key is used to validate the newly-created key. - * - * @param privateKey Private key used to sign the new GATK key - * @param publicKey Public key used to validate the new GATK key - * @param emailAddress The user's email address, which we will store in the key and sign - */ - public GATKKey ( PrivateKey privateKey, PublicKey publicKey, String emailAddress ) { - this(privateKey, publicKey, emailAddress, DEFAULT_SIGNING_ALGORITHM); - } - - /** - * Constructor to create a new GATK key from scratch using an email address - * and public/private key pair, and additionally specify the signing algorithm - * to use. The private key is used for signing, and the public key is used to - * validate the newly-created key. 
- * - * @param privateKey Private key used to sign the new GATK key - * @param publicKey Public key used to validate the new GATK key - * @param emailAddress The user's email address, which we will store in the key and sign - * @param signingAlgorithm The combination of hash and encryption algorithms to use to sign the key - */ - public GATKKey ( PrivateKey privateKey, PublicKey publicKey, String emailAddress, String signingAlgorithm ) { - if ( privateKey == null || publicKey == null || emailAddress == null || emailAddress.length() == 0 || signingAlgorithm == null ) { - throw new ReviewedGATKException("Cannot construct GATKKey using null/empty arguments"); - } - - this.privateKey = privateKey; - this.publicKey = publicKey; - this.emailAddress = emailAddress; - this.signingAlgorithm = signingAlgorithm; - - validateEmailAddress(); - generateSignature(); - - if ( ! isValid() ) { - throw new ReviewedGATKException("Newly-generated GATK key fails validation -- this should never happen!"); - } - } - - /** - * Constructor to load an existing GATK key from a file. - * - * During loading, the key file is checked for integrity, but not cryptographic - * validity (which must be done through a subsequent call to isValid()). - * - * @param publicKey Public key that will be used to validate the loaded GATK key - * in subsequent calls to isValid() - * @param keyFile File containing the GATK key to load - */ - public GATKKey ( PublicKey publicKey, File keyFile ) { - this(publicKey, keyFile, DEFAULT_SIGNING_ALGORITHM); - } - - /** - * Constructor to load an existing GATK key from a file, and additionally specify - * the signing algorithm used to sign the key being loaded. - * - * During loading, the key file is checked for integrity, but not cryptographic - * validity (which must be done through a subsequent call to isValid()). 
- * - * @param publicKey Public key that will be used to validate the loaded GATK key - * in subsequent calls to isValid() - * @param keyFile File containing the GATK key to load - * @param signingAlgorithm The combination of hash and encryption algorithms used to sign the key - */ - public GATKKey ( PublicKey publicKey, File keyFile, String signingAlgorithm ) { - if ( publicKey == null || keyFile == null || signingAlgorithm == null ) { - throw new ReviewedGATKException("Cannot construct GATKKey using null arguments"); - } - - this.publicKey = publicKey; - this.signingAlgorithm = signingAlgorithm; - - readKey(keyFile); - } - - // ----------------------- - // Public API Methods: - // ----------------------- - - /** - * Writes out this key to a file in the format described at the top of this class, - * encapsulating the key within a GZIP container. - * - * @param destination File to write the key to - */ - public void writeKey ( File destination ) { - try { - byte[] keyBytes = marshalKeyData(); - IOUtils.writeByteArrayToStream(keyBytes, new GZIPOutputStream(new FileOutputStream(destination))); - } - catch ( IOException e ) { - throw new UserException.CouldNotCreateOutputFile(destination, e); - } - } - - /** - * Checks whether the signature of this key is cryptographically valid (ie., can be - * decrypted by the public key to produce a valid SHA-1 hash of the email address - * in the key). 
- * - * @return True if the key's signature passes validation, otherwise false - */ - public boolean isValid() { - try { - Signature sig = Signature.getInstance(signingAlgorithm); - sig.initVerify(publicKey); - sig.update(emailAddress.getBytes()); - return sig.verify(signature); - } - catch ( NoSuchAlgorithmException e ) { - throw new ReviewedGATKException(String.format("Signing algorithm %s not found", signingAlgorithm), e); - } - catch ( InvalidKeyException e ) { - // If the GATK public key is invalid, it's likely our problem, not the user's: - throw new ReviewedGATKException(String.format("Public key %s is invalid", publicKey), e); - } - catch ( SignatureException e ) { - throw new UserException.UnreadableKeyException("Signature is invalid or signing algorithm was unable to process the input data", e); - } - } - - // ----------------------- - // Private Helper Methods: - // ----------------------- - - /** - * Helper method that creates a signature for this key using the combination of - * hash/encryption algorithms specified at construction time. - */ - private void generateSignature() { - try { - Signature sig = Signature.getInstance(signingAlgorithm); - sig.initSign(privateKey, CryptUtils.createRandomnessSource()); - sig.update(emailAddress.getBytes()); - signature = sig.sign(); - } - catch ( NoSuchAlgorithmException e ) { - throw new ReviewedGATKException(String.format("Signing algorithm %s not found", signingAlgorithm), e); - } - catch ( InvalidKeyException e ) { - throw new ReviewedGATKException(String.format("Private key %s is invalid", privateKey), e); - } - catch ( SignatureException e ) { - throw new ReviewedGATKException(String.format("Error creating signature for email address %s", emailAddress), e); - } - } - - /** - * Helper method that reads in a GATK key from a file. Should not be called directly -- - * use the appropriate constructor above. 
- * - * @param source File to read the key from - */ - private void readKey ( File source ) { - try { - byte[] keyBytes = IOUtils.readStreamIntoByteArray(new GZIPInputStream(new FileInputStream(source))); - - // As a sanity check, compare the number of bytes read to the uncompressed file size - // stored in the GZIP ISIZE field. If they don't match, the key must be corrupt: - if ( keyBytes.length != IOUtils.getGZIPFileUncompressedSize(source) ) { - throw new UserException.UnreadableKeyException("Number of bytes read does not match the uncompressed size specified in the GZIP ISIZE field"); - } - - unmarshalKeyData(keyBytes); - } - catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(source, e); - } - catch ( IOException e ) { - throw new UserException.UnreadableKeyException(source, e); - } - catch ( UserException.CouldNotReadInputFile e ) { - throw new UserException.UnreadableKeyException(source, e); - } - } - - /** - * Helper method that assembles the email address and signature into a format - * suitable for writing to disk. - * - * @return The aggregated key data, ready to be written to disk - */ - private byte[] marshalKeyData() { - byte[] emailAddressBytes = emailAddress.getBytes(); - byte[] assembledKey = new byte[emailAddressBytes.length + 1 + signature.length]; - - System.arraycopy(emailAddressBytes, 0, assembledKey, 0, emailAddressBytes.length); - assembledKey[emailAddressBytes.length] = GATK_KEY_SECTIONAL_DELIMITER; - System.arraycopy(signature, 0, assembledKey, emailAddressBytes.length + 1, signature.length); - - return assembledKey; - } - - /** - * Helper method that parses the raw key data from disk into its component - * email address and signature. Performs some basic validation in the process. 
- * - * @param keyBytes The raw, uncompressed key data read from disk - */ - private void unmarshalKeyData ( byte[] keyBytes ) { - int delimiterPosition = -1; - - for ( int i = 0; i < keyBytes.length; i++ ) { - if ( keyBytes[i] == GATK_KEY_SECTIONAL_DELIMITER ) { - delimiterPosition = i; - break; - } - } - - if ( delimiterPosition == -1 ) { - throw new UserException.UnreadableKeyException("Malformed GATK key contains no sectional delimiter"); - } - else if ( delimiterPosition == 0 ) { - throw new UserException.UnreadableKeyException("Malformed GATK key contains no email address"); - } - else if ( delimiterPosition == keyBytes.length - 1 ) { - throw new UserException.UnreadableKeyException("Malformed GATK key contains no signature"); - } - - byte[] emailAddressBytes = new byte[delimiterPosition]; - System.arraycopy(keyBytes, 0, emailAddressBytes, 0, delimiterPosition); - emailAddress = new String(emailAddressBytes); - - signature = new byte[keyBytes.length - delimiterPosition - 1]; - System.arraycopy(keyBytes, delimiterPosition + 1, signature, 0, keyBytes.length - delimiterPosition - 1); - } - - /** - * Helper method that ensures that the user's email address does not contain the NUL byte, which we - * reserve as a delimiter within each key file. 
- */ - private void validateEmailAddress() { - for ( byte b : emailAddress.getBytes() ) { - if ( b == GATK_KEY_SECTIONAL_DELIMITER ) { - throw new UserException(String.format("Email address must not contain a byte with value %d", GATK_KEY_SECTIONAL_DELIMITER)); - } - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java deleted file mode 100644 index 05f2ccfd7..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java +++ /dev/null @@ -1,311 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.fasta; - -import picard.PicardException; -import htsjdk.samtools.reference.FastaSequenceIndex; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.reference.ReferenceSequence; -import htsjdk.samtools.SAMSequenceRecord; -import htsjdk.samtools.util.StringUtil; -import org.apache.log4j.Priority; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.BaseUtils; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.Arrays; - -/** - * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. - * - * Thread-safe! Uses a thread-local cache. - * - * Automatically upper-cases the bases coming in, unless the flag preserveCase is explicitly set. - * Automatically converts IUPAC bases to Ns, unless the flag preserveIUPAC is explicitly set. - */ -public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { - protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); - - /** do we want to print debugging information about cache efficiency? */ - private static final boolean PRINT_EFFICIENCY = false; - - /** If we are printing efficiency info, what frequency should we do it at? 
*/ - private static final int PRINT_FREQUENCY = 10000; - - /** The default cache size in bp */ - public static final long DEFAULT_CACHE_SIZE = 1000000; - - /** The cache size of this CachingIndexedFastaSequenceFile */ - private final long cacheSize; - - /** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */ - private final long cacheMissBackup; - - /** - * If true, we will preserve the case of the original base in the genome - */ - private final boolean preserveCase; - - /** - * If true, we will preserve the IUPAC bases in the genome - */ - private final boolean preserveIUPAC; - - // information about checking efficiency - long cacheHits = 0; - long cacheMisses = 0; - - /** Represents a specific cached sequence, with a specific start and stop, as well as the bases */ - private static class Cache { - long start = -1, stop = -1; - ReferenceSequence seq = null; - } - - /** - * Thread local cache to allow multi-threaded use of this class - */ - private ThreadLocal cache; - { - cache = new ThreadLocal () { - @Override protected Cache initialValue() { - return new Cache(); - } - }; - } - - /** - * Same as general constructor but allows one to override the default cacheSize - * - * @param fasta the file we will read our FASTA sequence from. 
- * @param index the index of the fasta file, used for efficient random access - * @param cacheSize the size in bp of the cache we will use for this reader - * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case - * @param preserveIUPAC If true, we will keep the IUPAC bases in the FASTA, otherwise they are converted to Ns - */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) { - super(fasta, index); - if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); - this.cacheSize = cacheSize; - this.cacheMissBackup = Math.max(cacheSize / 1000, 1); - this.preserveCase = preserveCase; - this.preserveIUPAC = preserveIUPAC; - } - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * - * Looks for a index file for fasta on disk - * Uses provided cacheSize instead of the default - * - * @param fasta The file to open. 
- * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 - * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case - */ - public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) throws FileNotFoundException { - super(fasta); - if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); - this.cacheSize = cacheSize; - this.cacheMissBackup = Math.max(cacheSize / 1000, 1); - this.preserveCase = preserveCase; - this.preserveIUPAC = preserveIUPAC; - } - - /** - * Same as general constructor but allows one to override the default cacheSize - * - * By default, this CachingIndexedFastaReader converts all incoming bases to upper case - * - * @param fasta the file we will read our FASTA sequence from. - * @param index the index of the fasta file, used for efficient random access - * @param cacheSize the size in bp of the cache we will use for this reader - */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { - this(fasta, index, cacheSize, false, false); - } - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * - * Looks for a index file for fasta on disk. - * This CachingIndexedFastaReader will convert all FASTA bases to upper cases under the hood - * - * @param fasta The file to open. - */ - public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException { - this(fasta, false); - } - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * - * Looks for a index file for fasta on disk - * - * @param fasta The file to open. 
- * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case - */ - public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException { - this(fasta, DEFAULT_CACHE_SIZE, preserveCase, false); - } - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * - * Looks for a index file for fasta on disk - * Uses provided cacheSize instead of the default - * - * @param fasta The file to open. - * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 - */ - public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { - this(fasta, cacheSize, false, false); - } - - /** - * Print the efficiency (hits / queries) to logger with priority - */ - public void printEfficiency(final Priority priority) { - logger.log(priority, String.format("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%", cacheHits, cacheMisses, calcEfficiency())); - } - - /** - * Returns the efficiency (% of hits of all queries) of this object - * @return - */ - public double calcEfficiency() { - return 100.0 * cacheHits / (cacheMisses + cacheHits * 1.0); - } - - /** - * @return the number of cache hits that have occurred - */ - public long getCacheHits() { - return cacheHits; - } - - /** - * @return the number of cache misses that have occurred - */ - public long getCacheMisses() { - return cacheMisses; - } - - /** - * @return the size of the cache we are using - */ - public long getCacheSize() { - return cacheSize; - } - - /** - * Is this CachingIndexedFastaReader keeping the original case of bases in the fasta, or is - * everything being made upper case? 
- * - * @return true if the bases coming from this reader are in the original case in the fasta, false if they are all upper cased - */ - public boolean isPreservingCase() { - return preserveCase; - } - - /** - * Is uppercasing bases? - * - * @return true if bases coming from this CachingIndexedFastaSequenceFile are all upper cased, false if this reader are in the original case in the fasta - */ - public boolean isUppercasingBases() { - return ! isPreservingCase(); - } - - /** - * Is this CachingIndexedFastaReader keeping the IUPAC bases in the fasta, or is it turning them into Ns? - * - * @return true if the IUPAC bases coming from this reader are not modified - */ - public boolean isPreservingIUPAC() { - return preserveIUPAC; - } - - /** - * Gets the subsequence of the contig in the range [start,stop] - * - * Uses the sequence cache if possible, or updates the cache to handle the request. If the range - * is larger than the cache itself, just loads the sequence directly, not changing the cache at all - * - * @param contig Contig whose subsequence to retrieve. - * @param start inclusive, 1-based start of region. - * @param stop inclusive, 1-based stop of region. - * @return The partial reference sequence associated with this range. If preserveCase is false, then - * all of the bases in the ReferenceSequence returned by this method will be upper cased. - */ - @Override - public ReferenceSequence getSubsequenceAt( final String contig, long start, final long stop ) { - final ReferenceSequence result; - final Cache myCache = cache.get(); - - if ( (stop - start) >= cacheSize ) { - cacheMisses++; - result = super.getSubsequenceAt(contig, start, stop); - if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); - if ( ! 
preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true, start < 1); - } else { - // todo -- potential optimization is to check if contig.name == contig, as this in general will be true - SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); - - if (stop > contigInfo.getSequenceLength()) - throw new PicardException("Query asks for data past end of contig"); - - if ( start < myCache.start || stop > myCache.stop || myCache.seq == null || myCache.seq.getContigIndex() != contigInfo.getSequenceIndex() ) { - cacheMisses++; - myCache.start = Math.max(start - cacheMissBackup, 0); - myCache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength()); - myCache.seq = super.getSubsequenceAt(contig, myCache.start, myCache.stop); - - // convert all of the bases in the sequence to upper case if we aren't preserving cases - if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); - if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true, myCache.start == 0); - } else { - cacheHits++; - } - - // at this point we determine where in the cache we want to extract the requested subsequence - final int cacheOffsetStart = (int)(start - myCache.start); - final int cacheOffsetStop = (int)(stop - start + cacheOffsetStart + 1); - - try { - result = new ReferenceSequence(myCache.seq.getName(), myCache.seq.getContigIndex(), Arrays.copyOfRange(myCache.seq.getBases(), cacheOffsetStart, cacheOffsetStop)); - } catch ( ArrayIndexOutOfBoundsException e ) { - throw new ReviewedGATKException(String.format("BUG: bad array indexing. 
Cache start %d and end %d, request start %d end %d, offset start %d and end %d, base size %d", - myCache.start, myCache.stop, start, stop, cacheOffsetStart, cacheOffsetStop, myCache.seq.getBases().length), e); - } - } - - // for debugging -- print out our efficiency if requested - if ( PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0 ) - printEfficiency(Priority.INFO); - - return result; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMap.java deleted file mode 100644 index 1dd8a8a1f..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ /dev/null @@ -1,413 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.genotyper; - - -import com.google.java.contract.Ensures; -import org.broadinstitute.gatk.engine.downsampling.AlleleBiasedDownsamplingUtils; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.pileup.PileupElement; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; -import org.broadinstitute.gatk.utils.sam.AlignmentUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import htsjdk.variant.variantcontext.Allele; - -import java.util.*; - -/** - * Wrapper class that holds a set of maps of the form (Read -> Map(Allele->Double)) - * For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood. - */ -public class PerReadAlleleLikelihoodMap { - /** A set of all of the allele, so we can efficiently determine if an allele is already present */ - private final Map allelesSet = new HashMap<>(); - /** A list of the unique allele, as an ArrayList so we can call get(i) efficiently */ - protected final List alleles = new ArrayList<>(); - - - - protected final Map> likelihoodReadMap = new LinkedHashMap<>(); - - public PerReadAlleleLikelihoodMap() { } - - /** - * Add a new entry into the Read -> ( Allele -> Likelihood ) map of maps. 
- * @param read - the GATKSAMRecord that was evaluated - * @param a - the Allele against which the GATKSAMRecord was evaluated - * @param likelihood - the likelihood score resulting from the evaluation of "read" against "a" - */ - public void add(final GATKSAMRecord read, final Allele a, final Double likelihood) { - if ( read == null ) throw new IllegalArgumentException("Cannot add a null read to the allele likelihood map"); - if ( a == null ) throw new IllegalArgumentException("Cannot add a null allele to the allele likelihood map"); - if ( likelihood == null ) throw new IllegalArgumentException("Likelihood cannot be null"); - if ( likelihood > 0.0 ) throw new IllegalArgumentException("Likelihood must be negative (L = log(p))"); - - if (!allelesSet.containsKey(a)) { - allelesSet.put(a,alleles.size()); - alleles.add(a); - } - Map likelihoodMap = likelihoodReadMap.get(read); - if (likelihoodMap == null){ - // LinkedHashMap will ensure iterating through alleles will be in consistent order - likelihoodMap = new LinkedHashMap<>(); - likelihoodReadMap.put(read,likelihoodMap); - } - - likelihoodMap.put(a,likelihood); - - - } - - public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { - return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction); - } - - /** - * For each allele "a" , identify those reads whose most likely allele is "a", and remove a "downsamplingFraction" proportion - * of those reads from the "likelihoodReadMap". This is used for e.g. sample contamination - * @param downsamplingFraction - the fraction of supporting reads to remove from each allele. If <=0 all reads kept, if >=1 all reads tossed. 
- */ - public void performPerAlleleDownsampling(final double downsamplingFraction) { - // special case removal of all or no reads - if ( downsamplingFraction <= 0.0 ) - return; - if ( downsamplingFraction >= 1.0 ) { - likelihoodReadMap.clear(); - return; - } - - // start by stratifying the reads by the alleles they represent at this position - final Map> alleleReadMap = getAlleleStratifiedReadMap(); - - // compute the reads to remove and actually remove them - final List readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction); - for ( final GATKSAMRecord read : readsToRemove ) - likelihoodReadMap.remove(read); - } - - /** - * Convert the @likelihoodReadMap to a map of alleles to reads, where each read is mapped uniquely to the allele - * for which it has the greatest associated likelihood - * @return a map from each allele to a list of reads that 'support' the allele - */ - protected Map> getAlleleStratifiedReadMap() { - final Map> alleleReadMap = new HashMap<>(alleles.size()); - for ( final Allele allele : alleles ) - alleleReadMap.put(allele, new ArrayList()); - - for ( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { - final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue()); - if ( bestAllele.isInformative() ) - alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey()); - } - - return alleleReadMap; - } - - @Ensures("result >=0") - public int size() { - return likelihoodReadMap.size(); - } - - /** - * Helper function to add the read underneath a pileup element to the map - * @param p Pileup element - * @param a Corresponding allele - * @param likelihood Allele likelihood - */ - public void add(PileupElement p, Allele a, Double likelihood) { - if (p==null) - throw new IllegalArgumentException("Pileup element cannot be null"); - if ( p.getRead()==null ) - throw new IllegalArgumentException("Read underlying pileup element cannot be null"); - if ( a == null ) - throw new 
IllegalArgumentException("Allele for add() cannot be null"); - - add(p.getRead(), a, likelihood); - } - - /** - * Does the current map contain the key associated with a particular SAM record in pileup? - * @param p Pileup element - * @return true if the map contains pileup element, else false - */ - public boolean containsPileupElement(final PileupElement p) { - return likelihoodReadMap.containsKey(p.getRead()); - } - - public boolean isEmpty() { - return likelihoodReadMap.isEmpty(); - } - - public Map> getLikelihoodReadMap() { - return likelihoodReadMap; - } - - public void clear() { - allelesSet.clear(); - alleles.clear(); - likelihoodReadMap.clear(); - } - - public Set getStoredElements() { - return likelihoodReadMap.keySet(); - } - -// public Collection> getLikelihoodMapValues() { -// return likelihoodReadMap.values(); -// } - - public int getNumberOfStoredElements() { - return likelihoodReadMap.size(); - } - - public Map getLikelihoodsAssociatedWithPileupElement(final PileupElement p) { - if (!likelihoodReadMap.containsKey(p.getRead())) - return null; - - return likelihoodReadMap.get(p.getRead()); - } - - - /** - * Get the log10 likelihood associated with an individual read/allele - * - * @param read the read whose likelihood we want - * @param allele the allele whose likelihood we want - * @return the log10 likelihood that this read matches this allele - */ - public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){ - if (!allelesSet.containsKey(allele) || !likelihoodReadMap.containsKey(read)) - return 0.0; - - return likelihoodReadMap.get(read).get(allele); - } - - /** - * Get the most likely alleles estimated across all reads in this object - * - * Takes the most likely two alleles according to their diploid genotype likelihoods. That is, for - * each allele i and j we compute p(D | i,j) where D is the read likelihoods. 
We track the maximum - * i,j likelihood and return an object that contains the alleles i and j as well as the max likelihood. - * - * Note that the second most likely diploid genotype is not tracked so the resulting MostLikelyAllele - * doesn't have a meaningful get best likelihood. - * - * @return a MostLikelyAllele object, or null if this map is empty - */ - public MostLikelyAllele getMostLikelyDiploidAlleles() { - if ( isEmpty() ) return null; - - int hap1 = 0; - int hap2 = 0; - double maxElement = Double.NEGATIVE_INFINITY; - for( int iii = 0; iii < alleles.size(); iii++ ) { - final Allele iii_allele = alleles.get(iii); - for( int jjj = 0; jjj <= iii; jjj++ ) { - final Allele jjj_allele = alleles.get(jjj); - - double haplotypeLikelihood = 0.0; - for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { - // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) - final double likelihood_iii = entry.getValue().get(iii_allele); - final double likelihood_jjj = entry.getValue().get(jjj_allele); - haplotypeLikelihood += MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + MathUtils.LOG_ONE_HALF; - - // fast exit. If this diploid pair is already worse than the max, just stop and look at the next pair - if ( haplotypeLikelihood < maxElement ) break; - } - - // keep track of the max element and associated indices - if ( haplotypeLikelihood > maxElement ) { - hap1 = iii; - hap2 = jjj; - maxElement = haplotypeLikelihood; - } - } - } - - if ( maxElement == Double.NEGATIVE_INFINITY ) - throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong"); - - return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement); - } - - /** - * Given a map from alleles to likelihoods, find the allele with the largest likelihood. 
- * - * @param alleleMap - a map from alleles to likelihoods - * @return - a MostLikelyAllele object - */ - @Ensures("result != null") - public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap ) { - return getMostLikelyAllele(alleleMap, null); - } - - /** - * Given a map from alleles to likelihoods, find the allele with the largest likelihood. - * - * @param alleleMap - a map from alleles to likelihoods - * @param onlyConsiderTheseAlleles if not null, we will only consider alleles in this set for being one of the best. - * this is useful for the case where you've selected a subset of the alleles that - * the reads have been computed for further analysis. If null totally ignored - * @return - a MostLikelyAllele object - */ - public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap, final Set onlyConsiderTheseAlleles ) { - if ( alleleMap == null ) throw new IllegalArgumentException("The allele to likelihood map cannot be null"); - double maxLike = Double.NEGATIVE_INFINITY; - double prevMaxLike = Double.NEGATIVE_INFINITY; - Allele mostLikelyAllele = Allele.NO_CALL; - Allele secondMostLikely = null; - - for (final Map.Entry el : alleleMap.entrySet()) { - if ( onlyConsiderTheseAlleles != null && ! 
onlyConsiderTheseAlleles.contains(el.getKey()) ) - continue; - - if (el.getValue() > maxLike) { - prevMaxLike = maxLike; - maxLike = el.getValue(); - secondMostLikely = mostLikelyAllele; - mostLikelyAllele = el.getKey(); - } else if( el.getValue() > prevMaxLike ) { - secondMostLikely = el.getKey(); - prevMaxLike = el.getValue(); - } - } - - return new MostLikelyAllele(mostLikelyAllele, secondMostLikely, maxLike, prevMaxLike); - } - - /** - * Debug method to dump contents of object into string for display - */ - public String toString() { - final StringBuilder sb = new StringBuilder(); - - sb.append("Alelles in map:"); - for (final Allele a:alleles) { - sb.append(a.getDisplayString()+","); - } - sb.append("\n"); - for (final Map.Entry > el : getLikelihoodReadMap().entrySet() ) { - for (final Map.Entry eli : el.getValue().entrySet()) { - sb.append("Read "+el.getKey().getReadName()+". Allele:"+eli.getKey().getDisplayString()+" has likelihood="+Double.toString(eli.getValue())+"\n"); - } - - } - return sb.toString(); - } - - /** - * Remove reads from this map that are poorly modelled w.r.t. their per allele likelihoods - * - * Goes through each read in this map, and if it is poorly modelled removes it from the map. - * - * @see #readIsPoorlyModelled(org.broadinstitute.gatk.utils.sam.GATKSAMRecord, java.util.Collection, double) - * for more information about the poorly modelled test. 
- * - * @param maxErrorRatePerBase see equivalent parameter in #readIsPoorlyModelled - * @return the list of reads removed from this map because they are poorly modelled - */ - public List filterPoorlyModelledReads(final double maxErrorRatePerBase) { - final List removedReads = new LinkedList<>(); - final Iterator>> it = likelihoodReadMap.entrySet().iterator(); - while ( it.hasNext() ) { - final Map.Entry> record = it.next(); - if ( readIsPoorlyModelled(record.getKey(), record.getValue().values(), maxErrorRatePerBase) ) { - it.remove(); - removedReads.add(record.getKey()); - } - } - - return removedReads; - } - - /** - * Is this read poorly modelled by all of the alleles in this map? - * - * A read is poorly modeled when it's likelihood is below what would be expected for a read - * originating from one of the alleles given the maxErrorRatePerBase of the reads in general. - * - * This function makes a number of key assumptions. First, that the likelihoods reflect the total likelihood - * of the read. In other words, that the read would be fully explained by one of the alleles. This means - * that the allele should be something like the full haplotype from which the read might originate. - * - * It further assumes that each error in the read occurs with likelihood of -3 (Q30 confidence per base). So - * a read with a 10% error rate with Q30 bases that's 100 bp long we'd expect to see 10 real Q30 errors - * even against the true haplotype. So for this read to be well modelled by at least one allele we'd expect - * a likelihood to be >= 10 * -3. - * - * @param read the read we want to evaluate - * @param log10Likelihoods a list of the log10 likelihoods of the read against a set of haplotypes. - * @param maxErrorRatePerBase the maximum error rate we'd expect for this read per base, in real space. 
So - * 0.01 means a 1% error rate - * @return true if none of the log10 likelihoods imply that the read truly originated from one of the haplotypes - */ - protected boolean readIsPoorlyModelled(final GATKSAMRecord read, final Collection log10Likelihoods, final double maxErrorRatePerBase) { - final double maxErrorsForRead = Math.min(2.0, Math.ceil(read.getReadLength() * maxErrorRatePerBase)); - final double log10QualPerBase = -4.0; - final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; - - for ( final double log10Likelihood : log10Likelihoods ) - if ( log10Likelihood >= log10MaxLikelihoodForTrueAllele ) - return false; - - return true; - } - - /** - * Get an unmodifiable set of the unique alleles in this PerReadAlleleLikelihoodMap - * @return a non-null unmodifiable map - */ - public Set getAllelesSet() { - return Collections.unmodifiableSet(allelesSet.keySet()); - } - - /** - * Loop over all of the reads in this likelihood map and realign them to its most likely haplotype - * @param haplotypes the collection of haplotypes - * @param paddedReferenceLoc the active region - */ - public void realignReadsToMostLikelyHaplotype(final Collection haplotypes, final GenomeLoc paddedReferenceLoc) { - - // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently - final Map alleleToHaplotypeMap = new HashMap<>(haplotypes.size()); - for ( final Haplotype haplotype : haplotypes ) - alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); - - final Map> newLikelihoodReadMap = new LinkedHashMap<>(likelihoodReadMap.size()); - for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { - final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); - final GATKSAMRecord alignedToRef = AlignmentUtils.createReadAlignedToRef(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), 
bestAllele.isInformative()); - newLikelihoodReadMap.put(alignedToRef, entry.getValue()); - } - - likelihoodReadMap.clear(); - likelihoodReadMap.putAll(newLikelihoodReadMap); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java deleted file mode 100644 index fa9fc300d..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java +++ /dev/null @@ -1,1587 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.genotyper; - -import htsjdk.variant.variantcontext.Allele; -import it.unimi.dsi.fastutil.ints.IntArrayList; -import it.unimi.dsi.fastutil.objects.Object2IntMap; -import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; -import org.broadinstitute.gatk.engine.downsampling.AlleleBiasedDownsamplingUtils; -import org.broadinstitute.gatk.tools.walkers.genotyper.*; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; - -import java.util.*; - -/** - * Read-likelihoods container implementation based on integer indexed arrays. - * - * @param the type of the allele the likelihood makes reference to. - * - * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> - */ -public class ReadLikelihoods implements SampleList, AlleleList, Cloneable { - - /** - * Reads by sample index. Each sub array contains reference to the reads of the ith sample. - */ - private GATKSAMRecord[][] readsBySampleIndex; - - /** - * Indexed per sample, allele and finally read (within sample). - *

- * valuesBySampleIndex[s][a][r] == lnLk(R_r | A_a) where R_r comes from Sample s. - *

- */ - private double[][][] valuesBySampleIndex; - - /** - * Sample list - */ - private final SampleList samples; - - /** - * Allele list - */ - private AlleleList
alleles; - - /** - * Cached allele list. - */ - private List alleleList; - - /** - * Cached sample list. - */ - private List sampleList; - - /** - * Maps from each read to its index within the sample. - * - *

In order to save CPU time the indices contained in this array (not the array itself) is - * lazily initialized by invoking {@link #readIndexBySampleIndex(int)}.

- */ - private final Object2IntMap[] readIndexBySampleIndex; - - /** - * Index of the reference allele if any, otherwise -1 - */ - private int referenceAlleleIndex = -1; - - /** - * Caches the read-list per sample list returned by {@link #sampleReads} - */ - private final List[] readListBySampleIndex; - - /** - * Sample matrices lazily initialized (the elements not the array) by invoking {@link #sampleMatrix(int)}. - */ - private final Matrix
[] sampleMatrices; - - /** - * Constructs a new read-likelihood collection. - * - *

- * The initial likelihoods for all allele-read combinations are - * 0. - *

- * - * @param samples all supported samples in the collection. - * @param alleles all supported alleles in the collection. - * @param reads reads stratified per sample. - * - * @throws IllegalArgumentException if any of {@code allele}, {@code samples} - * or {@code reads} is {@code null}, - * or if they contain null values. - */ - @SuppressWarnings("unchecked") - public ReadLikelihoods(final SampleList samples, final AlleleList
alleles, - final Map> reads) { - if (alleles == null) - throw new IllegalArgumentException("allele list cannot be null"); - if (samples == null) - throw new IllegalArgumentException("sample list cannot be null"); - if (reads == null) - throw new IllegalArgumentException("read map cannot be null"); - - this.samples = samples; - this.alleles = alleles; - - final int sampleCount = samples.sampleCount(); - final int alleleCount = alleles.alleleCount(); - - readsBySampleIndex = new GATKSAMRecord[sampleCount][]; - readListBySampleIndex = new List[sampleCount]; - valuesBySampleIndex = new double[sampleCount][][]; - referenceAlleleIndex = findReferenceAllele(alleles); - - readIndexBySampleIndex = new Object2IntMap[sampleCount]; - - setupIndexes(reads, sampleCount, alleleCount); - - sampleMatrices = (Matrix[]) new Matrix[sampleCount]; - } - - // Add all the indices to alleles, sample and reads in the look-up maps. - private void setupIndexes(final Map> reads, final int sampleCount, final int alleleCount) { - for (int i = 0; i < sampleCount; i++) - setupSampleData(i, reads, alleleCount); - } - - // Assumes that {@link #samples} has been initialized with the sample names. - private void setupSampleData(final int sampleIndex, final Map> readsBySample, - final int alleleCount) { - final String sample = samples.sampleAt(sampleIndex); - - final List reads = readsBySample.get(sample); - readsBySampleIndex[sampleIndex] = reads == null - ? 
new GATKSAMRecord[0] - : reads.toArray(new GATKSAMRecord[reads.size()]); - final int sampleReadCount = readsBySampleIndex[sampleIndex].length; - - final double[][] sampleValues = new double[alleleCount][sampleReadCount]; - valuesBySampleIndex[sampleIndex] = sampleValues; - } - - /** - * Create an independent copy of this read-likelihoods collection - */ - public ReadLikelihoods clone() { - - final int sampleCount = samples.sampleCount(); - final int alleleCount = alleles.alleleCount(); - - final double[][][] newLikelihoodValues = new double[sampleCount][alleleCount][]; - - @SuppressWarnings("unchecked") - final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; - final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; - - for (int s = 0; s < sampleCount; s++) { - newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); - for (int a = 0; a < alleleCount; a++) - newLikelihoodValues[s][a] = valuesBySampleIndex[s][a].clone(); - } - - // Finally we create the new read-likelihood - return new ReadLikelihoods<>(alleles, samples, - newReadsBySampleIndex, - newReadIndexBySampleIndex, newLikelihoodValues); - } - - // Internally used constructor. - @SuppressWarnings("unchecked") - private ReadLikelihoods(final AlleleList alleles, final SampleList samples, - final GATKSAMRecord[][] readsBySampleIndex, final Object2IntMap[] readIndex, - final double[][][] values) { - this.samples = samples; - this.alleles = alleles; - this.readsBySampleIndex = readsBySampleIndex; - this.valuesBySampleIndex = values; - this.readIndexBySampleIndex = readIndex; - final int sampleCount = samples.sampleCount(); - this.readListBySampleIndex = new List[sampleCount]; - - referenceAlleleIndex = findReferenceAllele(alleles); - sampleMatrices = (Matrix[]) new Matrix[sampleCount]; - } - - // Search for the reference allele, if not found the index is -1. 
- private int findReferenceAllele(final AlleleList alleles) { - final int alleleCount = alleles.alleleCount(); - for (int i = 0; i < alleleCount; i++) - if (alleles.alleleAt(i).isReference()) - return i; - return -1; - } - - /** - * Returns the index of a sample within the likelihood collection. - * - * @param sample the query sample. - * - * @throws IllegalArgumentException if {@code sample} is {@code null}. - * @return -1 if the allele is not included, 0 or greater otherwise. - */ - public int sampleIndex(final String sample) { - return samples.sampleIndex(sample); - } - - /** - * Number of samples included in the likelihood collection. - * @return 0 or greater. - */ - public int sampleCount() { - return samples.sampleCount(); - } - - /** - * Returns sample name given its index. - * - * @param sampleIndex query index. - * - * @throws IllegalArgumentException if {@code sampleIndex} is negative. - * - * @return never {@code null}. - */ - public String sampleAt(final int sampleIndex) { - return samples.sampleAt(sampleIndex); - } - - /** - * Returns the index of an allele within the likelihood collection. - * - * @param allele the query allele. - * - * @throws IllegalArgumentException if {@code allele} is {@code null}. - * - * @return -1 if the allele is not included, 0 or greater otherwise. - */ - public int alleleIndex(final A allele) { - return alleles.alleleIndex(allele); - } - - /** - * Returns number of alleles in the collection. - * @return 0 or greater. - */ - @SuppressWarnings("unused") - public int alleleCount() { - return alleles.alleleCount(); - } - - /** - * Returns the allele given its index. - * - * @param alleleIndex the allele index. - * - * @throws IllegalArgumentException the allele index is {@code null}. - * - * @return never {@code null}. - */ - public A alleleAt(final int alleleIndex) { - return alleles.alleleAt(alleleIndex); - } - - /** - * Returns the reads that belong to a sample sorted by their index (within that sample). 
- * - * @param sampleIndex the requested sample. - * @return never {@code null} but perhaps a zero-length array if there is no reads in sample. No element in - * the array will be null. - */ - public List sampleReads(final int sampleIndex) { - checkSampleIndex(sampleIndex); - final List extantList = readListBySampleIndex[sampleIndex]; - if (extantList == null) - return readListBySampleIndex[sampleIndex] = Collections.unmodifiableList(Arrays.asList(readsBySampleIndex[sampleIndex])); - else - return extantList; - } - - /** - * Returns a read vs allele likelihood matrix corresponding to a sample. - * - * @param sampleIndex target sample. - * - * @throws IllegalArgumentException if {@code sampleIndex} is not null. - * - * @return never {@code null} - */ - public Matrix sampleMatrix(final int sampleIndex) { - checkSampleIndex(sampleIndex); - final Matrix extantResult = sampleMatrices[sampleIndex]; - if (extantResult != null) - return extantResult; - else - return sampleMatrices[sampleIndex] = new SampleMatrix(sampleIndex); - } - - /** - * Adjusts likelihoods so that for each read, the best allele likelihood is 0 and caps the minimum likelihood - * of any allele for each read based on the maximum alternative allele likelihood. - * - * @param bestToZero set the best likelihood to 0, others will be subtracted the same amount. - * @param maximumLikelihoodDifferenceCap maximum difference between the best alternative allele likelihood - * and any other likelihood. - * - * @throws IllegalArgumentException if {@code maximumDifferenceWithBestAlternative} is not 0 or less. 
- */ - public void normalizeLikelihoods(final boolean bestToZero, final double maximumLikelihoodDifferenceCap) { - if (maximumLikelihoodDifferenceCap >= 0.0 || Double.isNaN(maximumLikelihoodDifferenceCap)) - throw new IllegalArgumentException("the minimum reference likelihood fall cannot be positive"); - - if (maximumLikelihoodDifferenceCap == Double.NEGATIVE_INFINITY && !bestToZero) - return; - - final int alleleCount = alleles.alleleCount(); - if (alleleCount == 0) // trivial case there is no alleles. - return; - else if (alleleCount == 1 && !bestToZero) - return; - - for (int s = 0; s < valuesBySampleIndex.length; s++) { - final double[][] sampleValues = valuesBySampleIndex[s]; - final int readCount = readsBySampleIndex[s].length; - for (int r = 0; r < readCount; r++) - normalizeLikelihoodsPerRead(bestToZero, maximumLikelihoodDifferenceCap, sampleValues, s, r); - } - } - - // Does the normalizeLikelihoods job for each read. - private void normalizeLikelihoodsPerRead(final boolean bestToZero, final double maximumBestAltLikelihoodDifference, - final double[][] sampleValues, final int sampleIndex, final int readIndex) { - - final BestAllele bestAlternativeAllele = searchBestAllele(sampleIndex,readIndex,false); - - final double worstLikelihoodCap = bestAlternativeAllele.likelihood + maximumBestAltLikelihoodDifference; - - final double referenceLikelihood = referenceAlleleIndex == -1 ? Double.NEGATIVE_INFINITY : - sampleValues[referenceAlleleIndex][readIndex]; - - - final double bestAbsoluteLikelihood = Math.max(bestAlternativeAllele.likelihood,referenceLikelihood); - - final int alleleCount = alleles.alleleCount(); - if (bestToZero) { - if (bestAbsoluteLikelihood == Double.NEGATIVE_INFINITY) - for (int a = 0; a < alleleCount; a++) - sampleValues[a][readIndex] = 0; - else if (worstLikelihoodCap != Double.NEGATIVE_INFINITY) - for (int a = 0; a < alleleCount; a++) - sampleValues[a][readIndex] = (sampleValues[a][readIndex] < worstLikelihoodCap ? 
worstLikelihoodCap : sampleValues[a][readIndex]) - bestAbsoluteLikelihood; - else - for (int a = 0; a < alleleCount; a++) - sampleValues[a][readIndex] -= bestAbsoluteLikelihood; - } else // else if (maximumReferenceLikelihoodFall != Double.NEGATIVE_INFINITY ) { // - // Guarantee to be the case by enclosing code. - for (int a = 0; a < alleleCount; a++) - if (sampleValues[a][readIndex] < worstLikelihoodCap) - sampleValues[a][readIndex] = worstLikelihoodCap; - } - - /** - * Returns the samples in this read-likelihood collection. - *

- * Samples are sorted by their index in the collection. - *

- * - *

- * The returned list is an unmodifiable view on the read-likelihoods sample list. - *

- * - * @return never {@code null}. - */ - public List samples() { - return sampleList == null ? sampleList = SampleListUtils.asList(samples) : sampleList; - - } - - /** - * Returns the samples in this read-likelihood collection. - *

- * Samples are sorted by their index in the collection. - *

- * - *

- * The returned list is an unmodifiable. It will not be updated if the collection - * allele list changes. - *

- * - * @return never {@code null}. - */ - public List
alleles() { - return alleleList == null ? alleleList = AlleleListUtils.asList(alleles) : alleleList; - } - - - /** - * Search the best allele for a read. - * - * @param sampleIndex including sample index. - * @param readIndex target read index. - * - * @return never {@code null}, but with {@link BestAllele#allele allele} == {@code null} - * if non-could be found. - */ - private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference) { - final int alleleCount = alleles.alleleCount(); - if (alleleCount == 0 || (alleleCount == 1 && referenceAlleleIndex == 0 && !canBeReference)) - return new BestAllele(sampleIndex,readIndex,-1,Double.NEGATIVE_INFINITY,Double.NEGATIVE_INFINITY); - - final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; - int bestAlleleIndex = canBeReference || referenceAlleleIndex != 0 ? 0 : 1; - - double bestLikelihood = sampleValues[bestAlleleIndex][readIndex]; - double secondBestLikelihood = Double.NEGATIVE_INFINITY; - for (int a = bestAlleleIndex + 1; a < alleleCount; a++) { - if (!canBeReference && referenceAlleleIndex == a) - continue; - final double candidateLikelihood = sampleValues[a][readIndex]; - if (candidateLikelihood > bestLikelihood) { - bestAlleleIndex = a; - secondBestLikelihood = bestLikelihood; - bestLikelihood = candidateLikelihood; - } else if (candidateLikelihood > secondBestLikelihood) { - secondBestLikelihood = candidateLikelihood; - } - } - return new BestAllele(sampleIndex,readIndex,bestAlleleIndex,bestLikelihood,secondBestLikelihood); - } - - public void changeReads(final Map readRealignments) { - final int sampleCount = samples.sampleCount(); - for (int s = 0; s < sampleCount; s++) { - final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; - final Object2IntMap readIndex = readIndexBySampleIndex[s]; - final int sampleReadCount = sampleReads.length; - for (int r = 0; r < sampleReadCount; r++) { - final GATKSAMRecord read = sampleReads[r]; - final GATKSAMRecord 
replacement = readRealignments.get(read); - if (replacement == null) - continue; - sampleReads[r] = replacement; - if (readIndex != null) { - readIndex.remove(read); - readIndex.put(replacement, r); - } - } - } - } - - /** - * Add alleles that are missing in the read-likelihoods collection giving all reads a default - * likelihood value. - * @param candidateAlleles the potentially missing alleles. - * @param defaultLikelihood the default read likelihood value for that allele. - * - * @throws IllegalArgumentException if {@code candidateAlleles} is {@code null} or there is more than - * one missing allele that is a reference or there is one but the collection already has - * a reference allele. - */ - public void addMissingAlleles(final Collection candidateAlleles, final double defaultLikelihood) { - if (candidateAlleles == null) - throw new IllegalArgumentException("the candidateAlleles list cannot be null"); - if (candidateAlleles.isEmpty()) - return; - final List allelesToAdd = new ArrayList<>(candidateAlleles.size()); - for (final A allele : candidateAlleles) - if (alleles.alleleIndex(allele) == -1) - allelesToAdd.add(allele); - - if (allelesToAdd.isEmpty()) - return; - - final int oldAlleleCount = alleles.alleleCount(); - final int newAlleleCount = alleles.alleleCount() + allelesToAdd.size(); - - alleleList = null; - int referenceIndex = this.referenceAlleleIndex; - @SuppressWarnings("unchecked") - final A[] newAlleles = (A[]) new Allele[newAlleleCount]; - for (int a = 0; a < oldAlleleCount; a++) - newAlleles[a] = this.alleleAt(a); - int newIndex = oldAlleleCount; - for (final A allele : allelesToAdd) { - if (allele.isReference()) { - if (referenceIndex != -1) - throw new IllegalArgumentException("there cannot be more than one reference allele"); - referenceIndex = newIndex; - } - newAlleles[newIndex++] = allele; - } - - alleles = new IndexedAlleleList<>(newAlleles); - - if (referenceIndex != -1) - referenceAlleleIndex = referenceIndex; - - final int sampleCount 
= samples.sampleCount(); - for (int s = 0; s < sampleCount; s++) { - final int sampleReadCount = readsBySampleIndex[s].length; - final double[][] newValuesBySampleIndex = Arrays.copyOf(valuesBySampleIndex[s],newAlleleCount); - for (int a = oldAlleleCount; a < newAlleleCount; a++) { - newValuesBySampleIndex[a] = new double[sampleReadCount]; - if (defaultLikelihood != 0.0) - Arrays.fill(newValuesBySampleIndex[a],defaultLikelihood); - } - valuesBySampleIndex[s] = newValuesBySampleIndex; - } - } - - /** - * Likelihood matrix between a set of alleles and reads. - * @param the allele-type. - */ - public interface Matrix extends AlleleList { - - /** - * List of reads in the matrix sorted by their index therein. - * @return never {@code null}. - */ - public List reads(); - - /** - * List of alleles in the matrix sorted by their index in the collection. - * @return never {@code null}. - */ - public List alleles(); - - /** - * Set the likelihood of a read given an allele through their indices. - * - * @param alleleIndex the target allele index. - * @param readIndex the target read index. - * @param value new likelihood value for the target read give the target allele. - * - * @throws IllegalArgumentException if {@code alleleIndex} or {@code readIndex} - * are not valid allele and read indices respectively. - */ - public void set(final int alleleIndex, final int readIndex, final double value); - - /** - * Returns the likelihood of a read given a haplotype. - * - * @param alleleIndex the index of the given haplotype. - * @param readIndex the index of the target read. - * - * @throws IllegalArgumentException if {@code alleleIndex} or {@code readIndex} is not a - * valid allele or read index respectively. - * - * @return the requested likelihood, whatever value was provided using {@link #set(int,int,double) set} - * or 0.0 if none was set. - */ - public double get(final int alleleIndex, final int readIndex); - - /** - * Queries the index of an allele in the matrix. 
- * - * @param allele the target allele. - * - * @throws IllegalArgumentException if {@code allele} is {@code null}. - * @return -1 if such allele does not exist, otherwise its index which 0 or greater. - */ - @SuppressWarnings("unused") - public int alleleIndex(final A allele); - - /** - * Queries the index of a read in the matrix. - * - * @param read the target read. - * - * @throws IllegalArgumentException if {@code read} is {@code null}. - * - * @return -1 if there is not such a read in the matrix, otherwise its index - * which is 0 or greater. - */ - @SuppressWarnings("unused") - public int readIndex(final GATKSAMRecord read); - - /** - * Number of allele in the matrix. - * @return never negative. - */ - public int alleleCount(); - - /** - * Number of reads in the matrix. - * @return never negative. - */ - public int readCount(); - - /** - * Returns the allele given its index. - * - * @param alleleIndex the target allele index. - * - * @throws IllegalArgumentException if {@code alleleIndex} is not a valid allele index. - * @return never {@code null}. - */ - public A alleleAt(final int alleleIndex); - - /** - * Returns the allele given its index. - * - * @param readIndex the target allele index. - * - * @throws IllegalArgumentException if {@code readIndex} is not a valid read index. - * @return never {@code null}. - */ - public GATKSAMRecord readAt(final int readIndex); - - - /** - * Copies the likelihood of all the reads for a given allele into an array from a particular offset. - * @param alleleIndex the targeted allele - * @param dest the destination array. - * @param offset the copy offset within the destination allele - */ - public void copyAlleleLikelihoods(final int alleleIndex, final double[] dest, final int offset); - } - - /** - * Perform marginalization from an allele set to another (smaller one) taking the maximum value - * for each read in the original allele subset. 
- * - * @param newToOldAlleleMap map where the keys are the new alleles and the value list the original - * alleles that correspond to the new one. - * @return never {@code null}. The result will have the requested set of new alleles (keys in {@code newToOldAlleleMap}, and - * the same set of samples and reads as the original. - * - * @throws IllegalArgumentException is {@code newToOldAlleleMap} is {@code null} or contains {@code null} values, - * or its values contain reference to non-existing alleles in this read-likelihood collection. Also no new allele - * can have zero old alleles mapping nor two new alleles can make reference to the same old allele. - */ - public ReadLikelihoods marginalize(final Map> newToOldAlleleMap) { - - if (newToOldAlleleMap == null) - throw new IllegalArgumentException("the input allele mapping cannot be null"); - - @SuppressWarnings("unchecked") - final B[] newAlleles = newToOldAlleleMap.keySet().toArray((B[]) new Allele[newToOldAlleleMap.size()]); - final int oldAlleleCount = alleles.alleleCount(); - final int newAlleleCount = newAlleles.length; - - // we get the index correspondence between new old -> new allele, -1 entries mean that the old - // allele does not map to any new; supported but typically not the case. - final int[] oldToNewAlleleIndexMap = oldToNewAlleleIndexMap(newToOldAlleleMap, newAlleles, oldAlleleCount, newAlleleCount); - - // We calculate the marginal likelihoods. 
- - final double[][][] newLikelihoodValues = marginalLikelihoods(oldAlleleCount, newAlleleCount, oldToNewAlleleIndexMap, null); - - final int sampleCount = samples.sampleCount(); - - @SuppressWarnings("unchecked") - final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; - final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; - - for (int s = 0; s < sampleCount; s++) { - newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); - } - - // Finally we create the new read-likelihood - return new ReadLikelihoods<>(new IndexedAlleleList(newAlleles), samples, - newReadsBySampleIndex, - newReadIndexBySampleIndex, newLikelihoodValues); - } - - - /** - * Perform marginalization from an allele set to another (smaller one) taking the maximum value - * for each read in the original allele subset. - * - * @param newToOldAlleleMap map where the keys are the new alleles and the value list the original - * alleles that correspond to the new one. - * @return never {@code null}. The result will have the requested set of new alleles (keys in {@code newToOldAlleleMap}, and - * the same set of samples and reads as the original. - * - * @param overlap if not {@code null}, only reads that overlap the location (with unclipping) will be present in - * the output read-collection. - * - * @throws IllegalArgumentException is {@code newToOldAlleleMap} is {@code null} or contains {@code null} values, - * or its values contain reference to non-existing alleles in this read-likelihood collection. Also no new allele - * can have zero old alleles mapping nor two new alleles can make reference to the same old allele. 
- */ - public ReadLikelihoods marginalize(final Map> newToOldAlleleMap, final GenomeLoc overlap) { - - if (overlap == null) - return marginalize(newToOldAlleleMap); - - if (newToOldAlleleMap == null) - throw new IllegalArgumentException("the input allele mapping cannot be null"); - - @SuppressWarnings("unchecked") - final B[] newAlleles = newToOldAlleleMap.keySet().toArray((B[]) new Allele[newToOldAlleleMap.size()]); - final int oldAlleleCount = alleles.alleleCount(); - final int newAlleleCount = newAlleles.length; - - // we get the index correspondence between new old -> new allele, -1 entries mean that the old - // allele does not map to any new; supported but typically not the case. - final int[] oldToNewAlleleIndexMap = oldToNewAlleleIndexMap(newToOldAlleleMap, newAlleles, oldAlleleCount, newAlleleCount); - - final int[][] readsToKeep = overlappingReadIndicesBySampleIndex(overlap); - // We calculate the marginal likelihoods. - - final double[][][] newLikelihoodValues = marginalLikelihoods(oldAlleleCount, newAlleleCount, oldToNewAlleleIndexMap, readsToKeep); - - final int sampleCount = samples.sampleCount(); - - @SuppressWarnings("unchecked") - final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; - final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; - - for (int s = 0; s < sampleCount; s++) { - final int[] sampleReadsToKeep = readsToKeep[s]; - final GATKSAMRecord[] oldSampleReads = readsBySampleIndex[s]; - final int oldSampleReadCount = oldSampleReads.length; - final int newSampleReadCount = sampleReadsToKeep.length; - if (newSampleReadCount == oldSampleReadCount) { - newReadsBySampleIndex[s] = oldSampleReads.clone(); - } else { - newReadsBySampleIndex[s] = new GATKSAMRecord[newSampleReadCount]; - for (int i = 0; i < newSampleReadCount; i++) - newReadsBySampleIndex[s][i] = oldSampleReads[sampleReadsToKeep[i]]; - } - } - - // Finally we create the new read-likelihood - return new ReadLikelihoods<>(new 
IndexedAlleleList(newAlleles), samples, - newReadsBySampleIndex, - newReadIndexBySampleIndex, newLikelihoodValues); - } - - private int[][] overlappingReadIndicesBySampleIndex(final GenomeLoc overlap) { - if (overlap == null) - return null; - final int sampleCount = samples.sampleCount(); - final int[][] result = new int[sampleCount][]; - final IntArrayList buffer = new IntArrayList(200); - final int referenceIndex = overlap.getContigIndex(); - final int overlapStart = overlap.getStart(); - final int overlapEnd = overlap.getStop(); - for (int s = 0; s < sampleCount; s++) { - buffer.clear(); - final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; - final int sampleReadCount = sampleReads.length; - buffer.ensureCapacity(sampleReadCount); - for (int r = 0; r < sampleReadCount; r++) - if (unclippedReadOverlapsRegion(sampleReads[r], referenceIndex, overlapStart, overlapEnd)) - buffer.add(r); - result[s] = buffer.toIntArray(); - } - return result; - } - - public static boolean unclippedReadOverlapsRegion(final GATKSAMRecord read, final GenomeLoc region) { - return unclippedReadOverlapsRegion(read, region.getContigIndex(), region.getStart(), region.getStop()); - } - - private static boolean unclippedReadOverlapsRegion(final GATKSAMRecord sampleRead, final int referenceIndex, final int start, final int end) { - final int readReference = sampleRead.getReferenceIndex(); - if (readReference != referenceIndex) - return false; - - final int readStart = sampleRead.getUnclippedStart(); - if (readStart > end) - return false; - - final int readEnd = sampleRead.getReadUnmappedFlag() ? sampleRead.getUnclippedEnd() - : Math.max(sampleRead.getUnclippedEnd(), sampleRead.getUnclippedStart()); - return readEnd >= start; - } - - // Calculate the marginal likelihoods considering the old -> new allele index mapping. 
- private double[][][] marginalLikelihoods(final int oldAlleleCount, final int newAlleleCount, final int[] oldToNewAlleleIndexMap, final int[][] readsToKeep) { - - final int sampleCount = samples.sampleCount(); - final double[][][] result = new double[sampleCount][][]; - - for (int s = 0; s < sampleCount; s++) { - final int sampleReadCount = readsBySampleIndex[s].length; - final double[][] oldSampleValues = valuesBySampleIndex[s]; - final int[] sampleReadToKeep = readsToKeep == null || readsToKeep[s].length == sampleReadCount ? null : readsToKeep[s]; - final int newSampleReadCount = sampleReadToKeep == null ? sampleReadCount : sampleReadToKeep.length; - final double[][] newSampleValues = result[s] = new double[newAlleleCount][newSampleReadCount]; - // We initiate all likelihoods to -Inf. - for (int a = 0; a < newAlleleCount; a++) - Arrays.fill(newSampleValues[a], Double.NEGATIVE_INFINITY); - // For each old allele and read we update the new table keeping the maximum likelihood. - for (int r = 0; r < newSampleReadCount; r++) { - for (int a = 0; a < oldAlleleCount; a++) { - final int oldReadIndex = newSampleReadCount == sampleReadCount ? r : sampleReadToKeep[r]; - final int newAlleleIndex = oldToNewAlleleIndexMap[a]; - if (newAlleleIndex == -1) - continue; - final double likelihood = oldSampleValues[a][oldReadIndex]; - if (likelihood > newSampleValues[newAlleleIndex][r]) - newSampleValues[newAlleleIndex][r] = likelihood; - } - } - } - return result; - } - - /** - * Given a collection of likelihood in the old map format, it creates the corresponding read-likelihoods collection. - * - * @param map the likelihoods to transform. - * - * @throws IllegalArgumentException if {@code map} is {@code null}. - * - * @return never {@code null}. - */ - public static ReadLikelihoods fromPerAlleleReadLikelihoodsMap(final Map map) { - - // First we need to create the read-likelihood collection with all required alleles, samples and reads. 
- final SampleList sampleList = new IndexedSampleList(map.keySet()); - final Set alleles = new LinkedHashSet<>(10); - final Map> sampleToReads = new HashMap<>(sampleList.sampleCount()); - for (final Map.Entry entry : map.entrySet()) { - final String sample = entry.getKey(); - final PerReadAlleleLikelihoodMap sampleLikelihoods = entry.getValue(); - alleles.addAll(sampleLikelihoods.getAllelesSet()); - sampleToReads.put(sample,new ArrayList<>(sampleLikelihoods.getLikelihoodReadMap().keySet())); - } - - final AlleleList alleleList = new IndexedAlleleList<>(alleles); - final ReadLikelihoods result = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); - - // Now set the likelihoods. - for (final Map.Entry sampleEntry : map.entrySet()) { - final ReadLikelihoods.Matrix sampleMatrix = result.sampleMatrix(result.sampleIndex(sampleEntry.getKey())); - for (final Map.Entry> readEntry : sampleEntry.getValue().getLikelihoodReadMap().entrySet()) { - final GATKSAMRecord read = readEntry.getKey(); - final int readIndex = sampleMatrix.readIndex(read); - for (final Map.Entry alleleEntry : readEntry.getValue().entrySet()) { - final int alleleIndex = result.alleleIndex(alleleEntry.getKey()); - sampleMatrix.set(alleleIndex,readIndex,alleleEntry.getValue()); - } - } - } - return result; - } - - // calculates an old to new allele index map array. - private int[] oldToNewAlleleIndexMap(final Map> newToOldAlleleMap, final B[] newAlleles, - final int oldAlleleCount, final int newAlleleCount) { - - final int[] oldToNewAlleleIndexMap = new int[oldAlleleCount]; - Arrays.fill(oldToNewAlleleIndexMap, -1); // -1 indicate that there is no new allele that make reference to that old one. 
- - for (int i = 0; i < newAlleleCount; i++) { - final B newAllele = newAlleles[i]; - if (newAllele == null) - throw new IllegalArgumentException("input alleles cannot be null"); - final List oldAlleles = newToOldAlleleMap.get(newAllele); - if (oldAlleles == null) - throw new IllegalArgumentException("no new allele list can be null"); - for (final A oldAllele : oldAlleles) { - if (oldAllele == null) - throw new IllegalArgumentException("old alleles cannot be null"); - final int oldAlleleIndex = alleleIndex(oldAllele); - if (oldAlleleIndex == -1) - throw new IllegalArgumentException("missing old allele " + oldAllele + " in likelihood collection "); - if (oldToNewAlleleIndexMap[oldAlleleIndex] != -1) - throw new IllegalArgumentException("collision: two new alleles make reference to the same old allele"); - oldToNewAlleleIndexMap[oldAlleleIndex] = i; - } - } - return oldToNewAlleleIndexMap; - } - - /** - * Remove those reads that do not overlap certain genomic location. - * - *

- * This method modifies the current read-likelihoods collection. - *

- * - * @param location the target location. - * - * @throws IllegalArgumentException the location cannot be {@code null} nor unmapped. - */ - @SuppressWarnings("unused") - public void filterToOnlyOverlappingUnclippedReads(final GenomeLoc location) { - if (location == null) - throw new IllegalArgumentException("the location cannot be null"); - if (location.isUnmapped()) - throw new IllegalArgumentException("the location cannot be unmapped"); - - final int sampleCount = samples.sampleCount(); - - final int locContig = location.getContigIndex(); - final int locStart = location.getStart(); - final int locEnd = location.getStop(); - - final int alleleCount = alleles.alleleCount(); - final IntArrayList removeIndices = new IntArrayList(10); - for (int s = 0; s < sampleCount; s++) { - int readRemoveCount = 0; - final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; - final int sampleReadCount = sampleReads.length; - for (int r = 0; r < sampleReadCount; r++) - if (!unclippedReadOverlapsRegion(sampleReads[r], locContig, locStart, locEnd)) - removeIndices.add(r); - removeSampleReads(s,removeIndices,alleleCount); - removeIndices.clear(); - } - } - - // Compare the read coordinates to the location of interest. - private boolean readOverlapsLocation(final String contig, final int locStart, - final int locEnd, final GATKSAMRecord read) { - final boolean overlaps; - - if (read.getReadUnmappedFlag()) - overlaps = false; - else if (!read.getReferenceName().equals(contig)) - overlaps = false; - else { - int alnStart = read.getAlignmentStart(); - int alnStop = read.getAlignmentEnd(); - if (alnStart > alnStop) { // Paranoia? based on GLP.createGenomeLoc(Read) this can happen?. - final int end = alnStart; - alnStart = alnStop; - alnStop = end; - } - overlaps = !(alnStop < locStart || alnStart > locEnd); - } - return overlaps; - } - - /** - * Removes those read that the best possible likelihood given any allele is just too low. - * - *

- * This is determined by a maximum error per read-base against the best likelihood possible. - *

- * - * @param maximumErrorPerBase the minimum acceptable error rate per read base, must be - * a positive number. - * - * @throws IllegalStateException is not supported for read-likelihood that do not contain alleles. - * - * @throws IllegalArgumentException if {@code maximumErrorPerBase} is negative. - */ - public void filterPoorlyModeledReads(final double maximumErrorPerBase) { - if (alleles.alleleCount() == 0) - throw new IllegalStateException("unsupported for read-likelihood collections with no alleles"); - if (Double.isNaN(maximumErrorPerBase) || maximumErrorPerBase <= 0.0) - throw new IllegalArgumentException("the maximum error per base must be a positive number"); - final int sampleCount = samples.sampleCount(); - - final int alleleCount = alleles.alleleCount(); - final IntArrayList removeIndices = new IntArrayList(10); - for (int s = 0; s < sampleCount; s++) { - final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; - final int sampleReadCount = sampleReads.length; - for (int r = 0; r < sampleReadCount; r++) { - final GATKSAMRecord read = sampleReads[r]; - if (readIsPoorlyModelled(s,r,read, maximumErrorPerBase)) - removeIndices.add(r); - } - removeSampleReads(s, removeIndices, alleleCount); - removeIndices.clear(); - } - } - - // Check whether the read is poorly modelled. - protected boolean readIsPoorlyModelled(final int sampleIndex, final int readIndex, final GATKSAMRecord read, final double maxErrorRatePerBase) { - final double maxErrorsForRead = Math.min(2.0, Math.ceil(read.getReadLength() * maxErrorRatePerBase)); - final double log10QualPerBase = -4.0; - final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; - - final int alleleCount = alleles.alleleCount(); - final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; - for (int a = 0; a < alleleCount; a++) - if (sampleValues[a][readIndex] >= log10MaxLikelihoodForTrueAllele) - return false; - return true; - } - - - /** - * Add more reads to the collection. 
- * - * @param readsBySample reads to add. - * @param initialLikelihood the likelihood for the new entries. - * - * @throws IllegalArgumentException if {@code readsBySample} is {@code null} or {@code readsBySample} contains - * {@code null} reads, or {@code readsBySample} contains read that are already present in the read-likelihood - * collection. - */ - public void addReads(final Map> readsBySample, final double initialLikelihood) { - - for (final Map.Entry> entry : readsBySample.entrySet()) { - - final String sample = entry.getKey(); - final List newSampleReads = entry.getValue(); - final int sampleIndex = samples.sampleIndex(sample); - - if (sampleIndex == -1) - throw new IllegalArgumentException("input sample " + sample + - " is not part of the read-likelihoods collection"); - - if (newSampleReads == null || newSampleReads.size() == 0) - continue; - - final int sampleReadCount = readsBySampleIndex[sampleIndex].length; - final int newSampleReadCount = sampleReadCount + newSampleReads.size(); - - appendReads(newSampleReads, sampleIndex, sampleReadCount, newSampleReadCount); - extendsLikelihoodArrays(initialLikelihood, sampleIndex, sampleReadCount, newSampleReadCount); - } - } - - // Extends the likelihood arrays-matrices. - private void extendsLikelihoodArrays(double initialLikelihood, int sampleIndex, int sampleReadCount, int newSampleReadCount) { - final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; - final int alleleCount = alleles.alleleCount(); - for (int a = 0; a < alleleCount; a++) - sampleValues[a] = Arrays.copyOf(sampleValues[a], newSampleReadCount); - if (initialLikelihood != 0.0) // the default array new value. - for (int a = 0; a < alleleCount; a++) - Arrays.fill(sampleValues[a],sampleReadCount,newSampleReadCount,initialLikelihood); - } - - // Append the new read reference into the structure per-sample. 
- private void appendReads(final List newSampleReads, final int sampleIndex, - final int sampleReadCount, final int newSampleReadCount) { - final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex] = - Arrays.copyOf(readsBySampleIndex[sampleIndex], newSampleReadCount); - - int nextReadIndex = sampleReadCount; - final Object2IntMap sampleReadIndex = readIndexBySampleIndex[sampleIndex]; - for (final GATKSAMRecord newRead : newSampleReads) { - // if (sampleReadIndex.containsKey(newRead)) // might be worth handle this without exception (ignore the read?) but in practice should never be the case. - // throw new IllegalArgumentException("you cannot add reads that are already in read-likelihood collection"); - if (sampleReadIndex != null ) sampleReadIndex.put(newRead,nextReadIndex); - sampleReads[nextReadIndex++] = newRead; - } - } - - /** - * Adds the non-reference allele to the read-likelihood collection setting each read likelihood to the second - * best found (or best one if only one allele has likelihood). - * - *

Nothing will happen if the read-likelihoods collection already includes the non-ref allele

- * - *

- * Implementation note: even when strictly speaking we do not need to demand the calling code to pass - * the reference the non-ref allele, we still demand it in order to lead the - * the calling code to use the right generic type for this likelihoods - * collection {@link Allele}. - *

- * - * @param nonRefAllele the non-ref allele. - * - * @throws IllegalArgumentException if {@code nonRefAllele} is anything but the designated <NON_REF> - * symbolic allele {@link GATKVariantContextUtils#NON_REF_SYMBOLIC_ALLELE}. - */ - public void addNonReferenceAllele(final A nonRefAllele) { - - if (nonRefAllele == null) - throw new IllegalArgumentException("non-ref allele cannot be null"); - if (!nonRefAllele.equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)) - throw new IllegalArgumentException("the non-ref allele is not valid"); - // Already present? - if (alleles.alleleIndex(nonRefAllele) != -1) - return; - - final int oldAlleleCount = alleles.alleleCount(); - final int newAlleleCount = oldAlleleCount + 1; - @SuppressWarnings("unchecked") - final A[] newAlleles = (A[]) new Allele[newAlleleCount]; - for (int a = 0; a < oldAlleleCount; a++) - newAlleles[a] = alleles.alleleAt(a); - newAlleles[oldAlleleCount] = nonRefAllele; - alleles = new IndexedAlleleList<>(newAlleles); - alleleList = null; // remove the cached alleleList. - - final int sampleCount = samples.sampleCount(); - for (int s = 0; s < sampleCount; s++) - addNonReferenceAlleleLikelihoodsPerSample(oldAlleleCount, newAlleleCount, s); - } - - // Updates per-sample structures according to the addition of the NON_REF allele. 
- private void addNonReferenceAlleleLikelihoodsPerSample(final int alleleCount, final int newAlleleCount, final int sampleIndex) { - final double[][] sampleValues = valuesBySampleIndex[sampleIndex] = Arrays.copyOf(valuesBySampleIndex[sampleIndex], newAlleleCount); - final int sampleReadCount = readsBySampleIndex[sampleIndex].length; - - final double[] nonRefAlleleLikelihoods = sampleValues[alleleCount] = new double [sampleReadCount]; - Arrays.fill(nonRefAlleleLikelihoods,Double.NEGATIVE_INFINITY); - for (int r = 0; r < sampleReadCount; r++) { - final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true); - final double secondBestLikelihood = Double.isInfinite(bestAllele.confidence) ? bestAllele.likelihood - : bestAllele.likelihood - bestAllele.confidence; - nonRefAlleleLikelihoods[r] = secondBestLikelihood; - } - } - - /** - * Downsamples reads based on contamination fractions making sure that all alleles are affected proportionally. - * - * @param perSampleDownsamplingFraction contamination sample map where the sample name are the keys and the - * fractions are the values. - * - * @throws IllegalArgumentException if {@code perSampleDownsamplingFraction} is {@code null}. - */ - public void contaminationDownsampling(final Map perSampleDownsamplingFraction) { - - final int sampleCount = samples.sampleCount(); - final IntArrayList readsToRemove = new IntArrayList(10); // blind estimate, can be improved? 
- final int alleleCount = alleles.alleleCount(); - for (int s = 0; s < sampleCount; s++) { - final String sample = samples.sampleAt(s); - final Double fractionDouble = perSampleDownsamplingFraction.get(sample); - if (fractionDouble == null) - continue; - final double fraction = fractionDouble; - if (Double.isNaN(fraction) || fraction <= 0.0) - continue; - if (fraction >= 1.0) { - final int sampleReadCount = readsBySampleIndex[s].length; - readsToRemove.ensureCapacity(sampleReadCount); - for (int r = 0; r < sampleReadCount; r++) - readsToRemove.add(r); - removeSampleReads(s,readsToRemove,alleleCount); - readsToRemove.clear(); - } - else { - final Map> readsByBestAllelesMap = readsByBestAlleleMap(s); - removeSampleReads(s,AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(readsByBestAllelesMap, fraction),alleleCount); - } - } - } - - /** - * Given a collection of likelihood in the old map format, it creates the corresponding read-likelihoods collection. - * - * @param alleleList the target list of alleles. - * @param map the likelihoods to transform. - * - * - * @throws IllegalArgumentException if {@code map} is {@code null}, or {@code map} does not contain likelihoods for all read vs allele combinations. - * - * @return never {@code null}. - */ - public static ReadLikelihoods fromPerAlleleReadLikelihoodsMap(final AlleleList alleleList, final Map map) { - - //TODO add test code for this method. - // First we need to create the read-likelihood collection with all required alleles, samples and reads. 
- final SampleList sampleList = new IndexedSampleList(map.keySet()); - final int alleleCount = alleleList.alleleCount(); - final Map> sampleToReads = new HashMap<>(sampleList.sampleCount()); - for (final Map.Entry entry : map.entrySet()) { - final String sample = entry.getKey(); - final PerReadAlleleLikelihoodMap sampleLikelihoods = entry.getValue(); - sampleToReads.put(sample,new ArrayList<>(sampleLikelihoods.getLikelihoodReadMap().keySet())); - } - - final ReadLikelihoods result = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); - - // Now set the likelihoods. - for (final Map.Entry sampleEntry : map.entrySet()) { - final ReadLikelihoods.Matrix sampleMatrix = result.sampleMatrix(result.sampleIndex(sampleEntry.getKey())); - for (final Map.Entry> readEntry : sampleEntry.getValue().getLikelihoodReadMap().entrySet()) { - final GATKSAMRecord read = readEntry.getKey(); - final int readIndex = sampleMatrix.readIndex(read); - final Map alleleToLikelihoodMap = readEntry.getValue(); - for (int a = 0; a < alleleCount; a++) { - final Allele allele = alleleList.alleleAt(a); - final Double likelihood = alleleToLikelihoodMap.get(allele); - if (likelihood == null) - throw new IllegalArgumentException("there is no likelihood for allele " + allele + " and read " + read); - sampleMatrix.set(a,readIndex,likelihood); - } - } - } - return result; - } - - /** - * Returns the collection of best allele estimates for the reads based on the read-likelihoods. - * - * @throws IllegalStateException if there is no alleles. - * - * @return never {@code null}, one element per read in the read-likelihoods collection. - */ - public Collection bestAlleles() { - final List result = new ArrayList<>(100); // blind estimate. 
- final int sampleCount = samples.sampleCount(); - for (int s = 0; s < sampleCount; s++) { - final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; - final int readCount = sampleReads.length; - for (int r = 0; r < readCount; r++) - result.add(searchBestAllele(s,r,true)); - } - return result; - } - - /** - * Returns reads stratified by their best allele. - * @param sampleIndex the target sample. - * @return never {@code null}, perhaps empty. - */ - public Map> readsByBestAlleleMap(final int sampleIndex) { - checkSampleIndex(sampleIndex); - final int alleleCount = alleles.alleleCount(); - final int sampleReadCount = readsBySampleIndex[sampleIndex].length; - final Map> result = new HashMap<>(alleleCount); - for (int a = 0; a < alleleCount; a++) - result.put(alleles.alleleAt(a),new ArrayList(sampleReadCount)); - readsByBestAlleleMap(sampleIndex,result); - return result; - } - - /** - * Returns reads stratified by their best allele. - * @return never {@code null}, perhaps empty. - */ - @SuppressWarnings("unused") - public Map> readsByBestAlleleMap() { - final int alleleCount = alleles.alleleCount(); - final Map> result = new HashMap<>(alleleCount); - final int totalReadCount = readCount(); - for (int a = 0; a < alleleCount; a++) - result.put(alleles.alleleAt(a),new ArrayList(totalReadCount)); - final int sampleCount = samples.sampleCount(); - for (int s = 0; s < sampleCount; s++) - readsByBestAlleleMap(s,result); - return result; - } - - private void readsByBestAlleleMap(final int sampleIndex, final Map> result) { - final GATKSAMRecord[] reads = readsBySampleIndex[sampleIndex]; - final int readCount = reads.length; - - for (int r = 0; r < readCount; r++) { - final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true); - if (!bestAllele.isInformative()) - continue; - result.get(bestAllele.allele).add(bestAllele.read); - } - } - - /** - * Returns the index of a read within a sample read-likelihood sub collection. - * @param sampleIndex the sample index. 
- * @param read the query read. - * @return -1 if there is no such read in that sample, 0 or greater otherwise. - */ - @SuppressWarnings("unused") - public int readIndex(final int sampleIndex, final GATKSAMRecord read) { - final Object2IntMap readIndex = readIndexBySampleIndex(sampleIndex); - if (readIndex.containsKey(read)) - return readIndexBySampleIndex(sampleIndex).getInt(read); - else - return -1; - } - - /** - * Returns the total number of reads in the read-likelihood collection. - * - * @return never {@code null} - */ - public int readCount() { - int sum = 0; - final int sampleCount = samples.sampleCount(); - for (int i = 0; i < sampleCount; i++) - sum += readsBySampleIndex[i].length; - return sum; - } - - /** - * Returns the number of reads that belong to a sample in the read-likelihood collection. - * @param sampleIndex the query sample index. - * - * @throws IllegalArgumentException if {@code sampleIndex} is not a valid sample index. - * @return 0 or greater. - */ - public int sampleReadCount(int sampleIndex) { - checkSampleIndex(sampleIndex); - return readsBySampleIndex[sampleIndex].length; - } - - /** - * Contains information about the best allele for a read search result. - */ - public class BestAllele { - public static final double INFORMATIVE_THRESHOLD = 0.2; - - /** - * Null if there is no possible match (no allele?). - */ - public final A allele; - - /** - * The containing sample. - */ - public final String sample; - - /** - * The query read. - */ - public final GATKSAMRecord read; - - /** - * If allele != null, the indicates the likelihood of the read. - */ - public final double likelihood; - - /** - * Confidence that the read actually was generated under that likelihood. - * This is equal to the difference between this and the second best allele match. 
- */ - public final double confidence; - - private BestAllele(final int sampleIndex, final int readIndex, final int bestAlleleIndex, - final double likelihood, final double secondBestLikelihood) { - allele = bestAlleleIndex == -1 ? null : alleles.alleleAt(bestAlleleIndex); - this.likelihood = likelihood; - sample = samples.sampleAt(sampleIndex); - read = readsBySampleIndex[sampleIndex][readIndex]; - confidence = likelihood == secondBestLikelihood ? 0 : likelihood - secondBestLikelihood; - } - - public boolean isInformative() { - return confidence > INFORMATIVE_THRESHOLD; - } - } - - private void removeSampleReads(final int sampleIndex, final IntArrayList indexToRemove, final int alleleCount) { - final int removeCount = indexToRemove.size(); - if (removeCount == 0) - return; - - final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; - final int sampleReadCount = sampleReads.length; - - final Object2IntMap indexByRead = readIndexBySampleIndex[sampleIndex]; - if (indexByRead != null) - for (int i = 0; i < removeCount; i++) - indexByRead.remove(sampleReads[indexToRemove.getInt(i)]); - final boolean[] removeIndex = new boolean[sampleReadCount]; - int firstDeleted = indexToRemove.get(0); - for (int i = 0; i < removeCount; i++) - removeIndex[indexToRemove.get(i)] = true; - - final int newSampleReadCount = sampleReadCount - removeCount; - - // Now we skim out the removed reads from the read array. - final GATKSAMRecord[] oldSampleReads = readsBySampleIndex[sampleIndex]; - final GATKSAMRecord[] newSampleReads = new GATKSAMRecord[newSampleReadCount]; - - System.arraycopy(oldSampleReads,0,newSampleReads,0,firstDeleted); - Utils.skimArray(oldSampleReads,firstDeleted, newSampleReads, firstDeleted, removeIndex, firstDeleted); - - // Then we skim out the likelihoods of the removed reads. 
- final double[][] oldSampleValues = valuesBySampleIndex[sampleIndex]; - final double[][] newSampleValues = new double[alleleCount][newSampleReadCount]; - for (int a = 0; a < alleleCount; a++) { - System.arraycopy(oldSampleValues[a],0,newSampleValues[a],0,firstDeleted); - Utils.skimArray(oldSampleValues[a], firstDeleted, newSampleValues[a], firstDeleted, removeIndex, firstDeleted); - } - valuesBySampleIndex[sampleIndex] = newSampleValues; - readsBySampleIndex[sampleIndex] = newSampleReads; - readListBySampleIndex[sampleIndex] = null; // reset the unmodifiable list. - } - - - // Requires that the collection passed iterator can remove elements, and it can be modified. - private void removeSampleReads(final int sampleIndex, final Collection readsToRemove, final int alleleCount) { - final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; - final int sampleReadCount = sampleReads.length; - - final Object2IntMap indexByRead = readIndexBySampleIndex(sampleIndex); - // Count how many we are going to remove, which ones (indexes) and remove entry from the read-index map. - final boolean[] removeIndex = new boolean[sampleReadCount]; - int removeCount = 0; // captures the number of deletions. - int firstDeleted = sampleReadCount; // captures the first position that was deleted. - - final Iterator readsToRemoveIterator = readsToRemove.iterator(); - while (readsToRemoveIterator.hasNext()) { - final GATKSAMRecord read = readsToRemoveIterator.next(); - if (indexByRead.containsKey(read)) { - final int index = indexByRead.getInt(read); - if (firstDeleted > index) - firstDeleted = index; - removeCount++; - removeIndex[index] = true; - readsToRemoveIterator.remove(); - indexByRead.remove(read); - } - } - - // Nothing to remove we just finish here. - if (removeCount == 0) - return; - - final int newSampleReadCount = sampleReadCount - removeCount; - - // Now we skim out the removed reads from the read array. 
- final GATKSAMRecord[] oldSampleReads = readsBySampleIndex[sampleIndex]; - final GATKSAMRecord[] newSampleReads = new GATKSAMRecord[newSampleReadCount]; - - System.arraycopy(oldSampleReads,0,newSampleReads,0,firstDeleted); - Utils.skimArray(oldSampleReads,firstDeleted, newSampleReads, firstDeleted, removeIndex, firstDeleted); - - // Update the indices for the extant reads from the first deletion onwards. - for (int r = firstDeleted; r < newSampleReadCount; r++) { - indexByRead.put(newSampleReads[r], r); - } - - // Then we skim out the likelihoods of the removed reads. - final double[][] oldSampleValues = valuesBySampleIndex[sampleIndex]; - final double[][] newSampleValues = new double[alleleCount][newSampleReadCount]; - for (int a = 0; a < alleleCount; a++) { - System.arraycopy(oldSampleValues[a],0,newSampleValues[a],0,firstDeleted); - Utils.skimArray(oldSampleValues[a], firstDeleted, newSampleValues[a], firstDeleted, removeIndex, firstDeleted); - } - valuesBySampleIndex[sampleIndex] = newSampleValues; - readsBySampleIndex[sampleIndex] = newSampleReads; - readListBySampleIndex[sampleIndex] = null; // reset the unmodifiable list. - } - - private Object2IntMap readIndexBySampleIndex(final int sampleIndex) { - if (readIndexBySampleIndex[sampleIndex] == null) { - final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; - final int sampleReadCount = sampleReads.length; - readIndexBySampleIndex[sampleIndex] = new Object2IntOpenHashMap<>(sampleReadCount); - for (int r = 0; r < sampleReadCount; r++) - readIndexBySampleIndex[sampleIndex].put(sampleReads[r],r); - } - return readIndexBySampleIndex[sampleIndex]; - } - - /** - * Transform into a multi-sample HashMap backed {@link PerReadAlleleLikelihoodMap} type. - * @return never {@code null}. - * - * @deprecated - * - * This method should eventually disappear once we have removed PerReadAlleleLikelihoodMap class completelly. 
- */ - @Deprecated - @SuppressWarnings("all") - public Map toPerReadAlleleLikelihoodMap() { - final int sampleCount = samples.sampleCount(); - final Map result = new HashMap<>(sampleCount); - for (int s = 0; s < sampleCount; s++) - result.put(samples.sampleAt(s),toPerReadAlleleLikelihoodMap(s)); - return result; - } - - /** - * Transform into a single-sample HashMap backed {@link PerReadAlleleLikelihoodMap} type. - * - * @return never {@code null}. - */ - @Deprecated - public PerReadAlleleLikelihoodMap toPerReadAlleleLikelihoodMap(final int sampleIndex) { - checkSampleIndex(sampleIndex); - final PerReadAlleleLikelihoodMap result = new PerReadAlleleLikelihoodMap(); - final int alleleCount = alleles.alleleCount(); - final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; - final int sampleReadCount = sampleReads.length; - for (int a = 0; a < alleleCount; a++) { - final A allele = alleles.alleleAt(a); - final double[] readLikelihoods = valuesBySampleIndex[sampleIndex][a]; - for (int r = 0; r < sampleReadCount; r++) - result.add(sampleReads[r], allele, readLikelihoods[r]); - } - return result; - } - - /** - * Implements a likelihood matrix per sample given its index. - */ - private class SampleMatrix implements Matrix
{ - - private final int sampleIndex; - - private SampleMatrix(final int sampleIndex) { - this.sampleIndex = sampleIndex; - } - - @Override - public List reads() { - return sampleReads(sampleIndex); - } - - @Override - public List alleles() { - return ReadLikelihoods.this.alleles(); - } - - @Override - public void set(final int alleleIndex, final int readIndex, final double value) { - valuesBySampleIndex[sampleIndex][alleleIndex][readIndex] = value; - } - - @Override - public double get(final int alleleIndex, final int readIndex) { - return valuesBySampleIndex[sampleIndex][alleleIndex][readIndex]; - } - - @Override - public int alleleIndex(final A allele) { - return ReadLikelihoods.this.alleleIndex(allele); - } - - @Override - public int readIndex(final GATKSAMRecord read) { - return ReadLikelihoods.this.readIndex(sampleIndex, read); - } - - @Override - public int alleleCount() { - return alleles.alleleCount(); - } - - @Override - public int readCount() { - return readsBySampleIndex[sampleIndex].length; - } - - @Override - public A alleleAt(int alleleIndex) { - return ReadLikelihoods.this.alleleAt(alleleIndex); - } - - @Override - public GATKSAMRecord readAt(final int readIndex) { - if (readIndex < 0) - throw new IllegalArgumentException("the read-index cannot be negative"); - final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; - if (readIndex >= sampleReads.length) - throw new IllegalArgumentException("the read-index is beyond the read count of the sample"); - return sampleReads[readIndex]; - } - - @Override - public void copyAlleleLikelihoods(final int alleleIndex, final double[] dest, final int offset) { - System.arraycopy(valuesBySampleIndex[sampleIndex][alleleIndex],0,dest,offset,readCount()); - } - } - - /** - * Checks whether the provide sample index is valid. - *

- * If not, it throws an exception. - *

- * @param sampleIndex the target sample index. - * - * @throws IllegalArgumentException if {@code sampleIndex} is invalid, i.e. outside the range [0,{@link #sampleCount}). - */ - private void checkSampleIndex(final int sampleIndex) { - if (sampleIndex < 0 || sampleIndex >= samples.sampleCount()) - throw new IllegalArgumentException("invalid sample index: " + sampleIndex); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java deleted file mode 100644 index e5eee12d2..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java +++ /dev/null @@ -1,423 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.samtools.Cigar; -import htsjdk.samtools.CigarElement; -import org.apache.commons.lang.ArrayUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.sam.AlignmentUtils; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; - -import java.util.*; - -/** - * Extract simple VariantContext events from a single haplotype - * - * User: depristo - * Date: 3/27/13 - * Time: 8:35 AM - */ -public class EventMap extends TreeMap { - private final static Logger logger = Logger.getLogger(EventMap.class); - protected final static int MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION = 3; - private static final int MAX_EVENTS_PER_HAPLOTYPE = 3; - private static final int MAX_INDELS_PER_HAPLOTYPE = 2; - public final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); - - private final Haplotype haplotype; - private final byte[] ref; - private final GenomeLoc refLoc; - private final String sourceNameToAdd; - - public EventMap(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) { - super(); - this.haplotype = haplotype; - this.ref = ref; - this.refLoc = refLoc; - this.sourceNameToAdd = sourceNameToAdd; - - processCigarForInitialEvents(); - } - - /** - * For testing. 
Let's you set up a explicit configuration without having to process a haplotype and reference - * @param stateForTesting - */ - public EventMap(final Collection stateForTesting) { - haplotype = null; - ref = null; - refLoc = null; - sourceNameToAdd = null; - for ( final VariantContext vc : stateForTesting ) - addVC(vc); - } - - protected void processCigarForInitialEvents() { - final Cigar cigar = haplotype.getCigar(); - final byte[] alignment = haplotype.getBases(); - - int refPos = haplotype.getAlignmentStartHapwrtRef(); - if( refPos < 0 ) { - return; - } // Protection against SW failures - - final List proposedEvents = new ArrayList<>(); - - int alignmentPos = 0; - - for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) { - final CigarElement ce = cigar.getCigarElement(cigarIndex); - final int elementLength = ce.getLength(); - switch( ce.getOperator() ) { - case I: - { - if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig - final List insertionAlleles = new ArrayList(); - final int insertionStart = refLoc.getStart() + refPos - 1; - final byte refByte = ref[refPos-1]; - if( BaseUtils.isRegularBase(refByte) ) { - insertionAlleles.add( Allele.create(refByte, true) ); - } - if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { - // if the insertion isn't completely resolved in the haplotype, skip it - // note this used to emit SYMBOLIC_UNASSEMBLED_EVENT_ALLELE but that seems dangerous - } else { - byte[] insertionBases = new byte[]{}; - insertionBases = ArrayUtils.add(insertionBases, ref[refPos - 1]); // add the padding base - insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange(alignment, alignmentPos, alignmentPos + elementLength)); - if( BaseUtils.isAllRegularBases(insertionBases) ) { - insertionAlleles.add( Allele.create(insertionBases, false) ); - } - } - if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele - 
proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); - } - } - alignmentPos += elementLength; - break; - } - case S: - { - alignmentPos += elementLength; - break; - } - case D: - { - if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig - final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base - final List deletionAlleles = new ArrayList(); - final int deletionStart = refLoc.getStart() + refPos - 1; - final byte refByte = ref[refPos-1]; - if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { - deletionAlleles.add( Allele.create(deletionBases, true) ); - deletionAlleles.add( Allele.create(refByte, false) ); - proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); - } - } - refPos += elementLength; - break; - } - case M: - case EQ: - case X: - { - for( int iii = 0; iii < elementLength; iii++ ) { - final byte refByte = ref[refPos]; - final byte altByte = alignment[alignmentPos]; - if( refByte != altByte ) { // SNP! 
- if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) { - final List snpAlleles = new ArrayList(); - snpAlleles.add( Allele.create( refByte, true ) ); - snpAlleles.add( Allele.create( altByte, false ) ); - proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); - } - } - refPos++; - alignmentPos++; - } - break; - } - case N: - case H: - case P: - default: - throw new ReviewedGATKException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() ); - } - } - - for ( final VariantContext proposedEvent : proposedEvents ) - addVC(proposedEvent, true); - } - - /** - * Add VariantContext vc to this map, merging events with the same start sites if necessary - * @param vc the variant context to add - */ - protected void addVC(final VariantContext vc) { - addVC(vc, true); - } - - /** - * Add VariantContext vc to this map - * @param vc the variant context to add - * @param merge should we attempt to merge it with an already existing element, or should we throw an error in that case? - */ - protected void addVC(final VariantContext vc, final boolean merge) { - if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); - - if ( containsKey(vc.getStart()) ) { - if ( merge ) { - final VariantContext prev = get(vc.getStart()); - put(vc.getStart(), makeBlock(prev, vc)); - } else { - throw new IllegalStateException("Will not merge previously bound variant contexts as merge is false at " + vc); - } - } else - put(vc.getStart(), vc); - } - - /** - * Create a block substitution out of two variant contexts that start at the same position - * - * vc1 can be SNP, and vc2 can then be either a insertion or deletion. 
- * If vc1 is an indel, then vc2 must be the opposite type (vc1 deletion => vc2 must be an insertion) - * - * @param vc1 the first variant context we want to merge - * @param vc2 the second - * @return a block substitution that represents the composite substitution implied by vc1 and vc2 - */ - protected VariantContext makeBlock(final VariantContext vc1, final VariantContext vc2) { - if ( vc1.getStart() != vc2.getStart() ) throw new IllegalArgumentException("vc1 and 2 must have the same start but got " + vc1 + " and " + vc2); - if ( ! vc1.isBiallelic() ) throw new IllegalArgumentException("vc1 must be biallelic"); - if ( ! vc1.isSNP() ) { - if ( ! ((vc1.isSimpleDeletion() && vc2.isSimpleInsertion()) || (vc1.isSimpleInsertion() && vc2.isSimpleDeletion()))) - throw new IllegalArgumentException("Can only merge single insertion with deletion (or vice versa) but got " + vc1 + " merging with " + vc2); - } else if ( vc2.isSNP() ) { - throw new IllegalArgumentException("vc1 is " + vc1 + " but vc2 is a SNP, which implies there's been some terrible bug in the cigar " + vc2); - } - - final Allele ref, alt; - final VariantContextBuilder b = new VariantContextBuilder(vc1); - if ( vc1.isSNP() ) { - // we have to repair the first base, so SNP case is special cased - if ( vc1.getReference().equals(vc2.getReference()) ) { - // we've got an insertion, so we just update the alt to have the prev alt - ref = vc1.getReference(); - alt = Allele.create(vc1.getAlternateAllele(0).getDisplayString() + vc2.getAlternateAllele(0).getDisplayString().substring(1), false); - } else { - // we're dealing with a deletion, so we patch the ref - ref = vc2.getReference(); - alt = vc1.getAlternateAllele(0); - b.stop(vc2.getEnd()); - } - } else { - final VariantContext insertion = vc1.isSimpleInsertion() ? vc1 : vc2; - final VariantContext deletion = vc1.isSimpleInsertion() ? 
vc2 : vc1; - ref = deletion.getReference(); - alt = insertion.getAlternateAllele(0); - b.stop(deletion.getEnd()); - } - - return b.alleles(Arrays.asList(ref, alt)).make(); - } - - // TODO -- warning this is an O(N^3) algorithm because I'm just lazy. If it's valuable we need to reengineer it - @Requires("getNumberOfEvents() > 0") - protected void replaceClumpedEventsWithBlockSubstitutions() { - if ( getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) { - int lastStart = -1; - for ( boolean foundOne = true; foundOne; ) { - foundOne = false; - for ( final VariantContext vc : getVariantContexts() ) { - if ( vc.getStart() > lastStart ) { - lastStart = vc.getStart(); - final List neighborhood = getNeighborhood(vc, 10); - if ( updateToBlockSubstitutionIfBetter(neighborhood) ) { - foundOne = true; - break; - } - } - } - } - } - } - - protected boolean updateToBlockSubstitutionIfBetter(final List neighbors) { - if (neighbors.size() < MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) - return false; - // TODO -- need more tests to decide if this is really so good - - final VariantContext first = neighbors.get(0); - final int refStartOffset = first.getStart() - refLoc.getStart(); - final int refEndOffset = neighbors.get(neighbors.size() - 1).getEnd() - refLoc.getStart(); - - final byte[] refBases = Arrays.copyOfRange(ref, refStartOffset, refEndOffset + 1); - final byte[] hapBases = AlignmentUtils.getBasesCoveringRefInterval(refStartOffset, refEndOffset, haplotype.getBases(), haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar()); - - final VariantContextBuilder builder = new VariantContextBuilder(first); - builder.stop(first.getStart() + refBases.length - 1); - builder.alleles(Arrays.asList(Allele.create(refBases, true), Allele.create(hapBases))); - final VariantContext block = builder.make(); - - // remove all merged events - for ( final VariantContext merged : neighbors ) { - if ( remove(merged.getStart()) == null ) - throw new 
IllegalArgumentException("Expected to remove variant context from the event map but remove said there wasn't any element there: " + merged); - } - - // note must be after we remove the previous events as the treeset only allows one key per start - logger.info("Transforming into block substitution at " + block); - addVC(block, false); - - return true; - } - - /** - * Get all of the variant contexts starting at leftMost that are within maxBP of each other - * - * @param leftMost the left most (smallest position) variant context that will start the neighborhood - * @param maxBPBetweenEvents the maximum distance in BP between the end of one event the start of the next - * to be included the the resulting list - * @return a list that contains at least one element (leftMost) - */ - @Requires({"leftMost != null", "maxBPBetweenEvents >= 0"}) - @Ensures({"result != null", "! result.isEmpty()"}) - protected List getNeighborhood(final VariantContext leftMost, final int maxBPBetweenEvents) { - final List neighbors = new LinkedList(); - - VariantContext left = leftMost; - for ( final VariantContext vc : getVariantContexts() ) { - if ( vc.getStart() < leftMost.getStart() ) - continue; - - if ( vc.getStart() - left.getEnd() < maxBPBetweenEvents ) { - // this vc is within max distance to the end of the left event, so accumulate it - neighbors.add(vc); - left = vc; - } - } - - return neighbors; - } - - /** - * Get the starting positions of events in this event map - * @return - */ - public Set getStartPositions() { - return keySet(); - } - - /** - * Get the variant contexts in order of start position in this event map - * @return - */ - public Collection getVariantContexts() { - return values(); - } - - /** - * How many events do we have? 
- * @return - */ - public int getNumberOfEvents() { - return size(); - } - - @Override - public String toString() { - final StringBuilder b = new StringBuilder("EventMap{"); - for ( final VariantContext vc : getVariantContexts() ) - b.append(String.format("%s:%d-%d %s,", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles())); - b.append("}"); - return b.toString(); - } - - /** - * Build event maps for each haplotype, returning the sorted set of all of the starting positions of all - * events across all haplotypes - * - * @param haplotypes a list of haplotypes - * @param ref the reference bases - * @param refLoc the span of the reference bases - * @param debug if true, we'll emit debugging information during this operation - * @return a sorted set of start positions of all events among all haplotypes - */ - public static TreeSet buildEventMapsForHaplotypes( final List haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final boolean debug) { - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = new TreeSet(); - int hapNumber = 0; - - if( debug ) logger.info("=== Best Haplotypes ==="); - for( final Haplotype h : haplotypes ) { - // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( new EventMap( h, ref, refLoc, "HC" + hapNumber++ ) ); - startPosKeySet.addAll(h.getEventMap().getStartPositions()); - - if( debug ) { - logger.info(h.toString()); - logger.info("> Cigar = " + h.getCigar()); - logger.info(">> Events = " + h.getEventMap()); - } - } - - return startPosKeySet; - } - - private static class VariantContextComparator implements Comparator { - @Override - public int compare(VariantContext vc1, VariantContext vc2) { - return vc1.getStart() - vc2.getStart(); - } - } - - /** - * Get all of the VariantContexts in the event maps for all haplotypes, sorted by their start position - * @param haplotypes the set of haplotypes to 
grab the VCs from - * @return a sorted set of variant contexts - */ - public static TreeSet getAllVariantContexts( final List haplotypes ) { - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet vcs = new TreeSet(new VariantContextComparator()); - - for( final Haplotype h : haplotypes ) { - vcs.addAll(h.getEventMap().getVariantContexts()); - } - - return vcs; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java deleted file mode 100644 index 7b31b2a7b..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java +++ /dev/null @@ -1,343 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.haplotype; - -import com.google.java.contract.Requires; -import htsjdk.samtools.Cigar; -import htsjdk.samtools.CigarElement; -import htsjdk.samtools.CigarOperator; -import org.apache.commons.lang.ArrayUtils; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.sam.AlignmentUtils; -import org.broadinstitute.gatk.utils.sam.ReadUtils; -import htsjdk.variant.variantcontext.Allele; - -import java.util.Arrays; -import java.util.Comparator; -import java.util.LinkedHashMap; -import java.util.List; - -public class Haplotype extends Allele { - - - private GenomeLoc genomeLocation = null; - private EventMap eventMap = null; - private Cigar cigar; - private int alignmentStartHapwrtRef; - private double score = Double.NaN; - - /** - * Main constructor - * - * @param bases a non-null array of bases - * @param isRef is this the reference haplotype? - */ - public Haplotype( final byte[] bases, final boolean isRef ) { - super(bases.clone(), isRef); - } - - /** - * Create a new non-ref haplotype - * - * @param bases a non-null array of bases - */ - public Haplotype( final byte[] bases ) { - this(bases, false); - } - - /** - * Create a new haplotype with bases - * - * Requires bases.length == cigar.getReadLength() - * - * @param bases a non-null array of bases - * @param isRef is this the reference haplotype? - * @param alignmentStartHapwrtRef offset of this haplotype w.r.t. the reference - * @param cigar the cigar that maps this haplotype to the reference sequence - */ - public Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar) { - this(bases, isRef); - this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; - setCigar(cigar); - } - - /** - * Copy constructor. Note the ref state of the provided allele is ignored! 
- * - * @param allele allele to copy - */ - public Haplotype( final Allele allele ) { - super(allele, true); - } - - public Haplotype( final byte[] bases, final GenomeLoc loc ) { - this(bases, false); - this.genomeLocation = loc; - } - - /** - * Create a new Haplotype derived from this one that exactly spans the provided location - * - * Note that this haplotype must have a contain a genome loc for this operation to be successful. If no - * GenomeLoc is contained than @throws an IllegalStateException - * - * Also loc must be fully contained within this Haplotype's genomeLoc. If not an IllegalArgumentException is - * thrown. - * - * @param loc a location completely contained within this Haplotype's location - * @return a new Haplotype within only the bases spanning the provided location, or null for some reason the haplotype would be malformed if - */ - public Haplotype trim(final GenomeLoc loc) { - if ( loc == null ) throw new IllegalArgumentException("Loc cannot be null"); - if ( genomeLocation == null ) throw new IllegalStateException("Cannot trim a Haplotype without containing GenomeLoc"); - if ( ! genomeLocation.containsP(loc) ) throw new IllegalArgumentException("Can only trim a Haplotype to a containing span. 
My loc is " + genomeLocation + " but wanted trim to " + loc); - if ( getCigar() == null ) throw new IllegalArgumentException("Cannot trim haplotype without a cigar " + this); - - final int newStart = loc.getStart() - this.genomeLocation.getStart(); - final int newStop = newStart + loc.size() - 1; - final byte[] newBases = AlignmentUtils.getBasesCoveringRefInterval(newStart, newStop, getBases(), 0, getCigar()); - final Cigar newCigar = AlignmentUtils.trimCigarByReference(getCigar(), newStart, newStop); - - if ( newBases == null || AlignmentUtils.startsOrEndsWithInsertionOrDeletion(newCigar) ) - // we cannot meaningfully chop down the haplotype, so return null - return null; - - final Haplotype ret = new Haplotype(newBases, isReference()); - ret.setCigar(newCigar); - ret.setGenomeLocation(loc); - ret.setAlignmentStartHapwrtRef(newStart + getAlignmentStartHapwrtRef()); - return ret; - } - - @Override - public boolean equals( Object h ) { - return h instanceof Haplotype && Arrays.equals(getBases(), ((Haplotype) h).getBases()); - } - - @Override - public int hashCode() { - return Arrays.hashCode(getBases()); - } - - public EventMap getEventMap() { - return eventMap; - } - - public void setEventMap( final EventMap eventMap ) { - this.eventMap = eventMap; - } - - @Override - public String toString() { - return getDisplayString(); - } - - /** - * Get the span of this haplotype (may be null) - * @return a potentially null genome loc - */ - public GenomeLoc getGenomeLocation() { - return genomeLocation; - } - - public void setGenomeLocation(GenomeLoc genomeLocation) { - this.genomeLocation = genomeLocation; - } - - public long getStartPosition() { - return genomeLocation.getStart(); - } - - public long getStopPosition() { - return genomeLocation.getStop(); - } - - public int getAlignmentStartHapwrtRef() { - return alignmentStartHapwrtRef; - } - - public void setAlignmentStartHapwrtRef( final int alignmentStartHapwrtRef ) { - this.alignmentStartHapwrtRef = 
alignmentStartHapwrtRef; - } - - /** - * Get the cigar for this haplotype. Note that the cigar is guaranteed to be consolidated - * in that multiple adjacent equal operates will have been merged - * @return the cigar of this haplotype - */ - public Cigar getCigar() { - return cigar; - } - - /** - * Get the haplotype cigar extended by padSize M at the tail, consolidated into a clean cigar - * - * @param padSize how many additional Ms should be appended to the end of this cigar. Must be >= 0 - * @return a newly allocated Cigar that consolidate(getCigar + padSize + M) - */ - public Cigar getConsolidatedPaddedCigar(final int padSize) { - if ( padSize < 0 ) throw new IllegalArgumentException("padSize must be >= 0 but got " + padSize); - final Cigar extendedHaplotypeCigar = new Cigar(getCigar().getCigarElements()); - if ( padSize > 0 ) extendedHaplotypeCigar.add(new CigarElement(padSize, CigarOperator.M)); - return AlignmentUtils.consolidateCigar(extendedHaplotypeCigar); - } - - /** - * Set the cigar of this haplotype to cigar. 
- * - * Note that this function consolidates the cigar, so that 1M1M1I1M1M => 2M1I2M - * - * @param cigar a cigar whose readLength == length() - */ - public void setCigar( final Cigar cigar ) { - this.cigar = AlignmentUtils.consolidateCigar(cigar); - if ( this.cigar.getReadLength() != length() ) - throw new IllegalArgumentException("Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength() + " " + this.cigar); - } - - @Requires({"refInsertLocation >= 0"}) - public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { - // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates - final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); - final byte[] myBases = this.getBases(); - if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= myBases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype - return null; - } - - byte[] newHaplotypeBases = new byte[]{}; - newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, 0, haplotypeInsertLocation)); // bases before the variant - newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant - newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, haplotypeInsertLocation + refAllele.length(), myBases.length)); // bases after the variant - return new Haplotype(newHaplotypeBases); - } - - public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, - final int startPos, - final ReferenceContext ref, - final int haplotypeSize, - final int numPrefBases) { - - LinkedHashMap haplotypeMap = new LinkedHashMap(); - - Allele refAllele = null; - - for (Allele a:alleleList) { - 
if (a.isReference()) { - refAllele = a; - break; - } - } - - if (refAllele == null) - throw new ReviewedGATKException("BUG: no ref alleles in input to makeHaplotypeListfrom Alleles at loc: "+ startPos); - - final byte[] refBases = ref.getBases(); - - final int startIdxInReference = 1 + startPos - numPrefBases - ref.getWindow().getStart(); - final String basesBeforeVariant = new String(Arrays.copyOfRange(refBases, startIdxInReference, startIdxInReference + numPrefBases)); - - // protect against long events that overrun available reference context - final int startAfter = Math.min(startIdxInReference + numPrefBases + refAllele.getBases().length - 1, refBases.length); - final String basesAfterVariant = new String(Arrays.copyOfRange(refBases, startAfter, refBases.length)); - - // Create location for all haplotypes - final int startLoc = ref.getWindow().getStart() + startIdxInReference; - final int stopLoc = startLoc + haplotypeSize-1; - - final GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc); - - for (final Allele a : alleleList) { - - final byte[] alleleBases = a.getBases(); - // use string concatenation - String haplotypeString = basesBeforeVariant + new String(Arrays.copyOfRange(alleleBases, 1, alleleBases.length)) + basesAfterVariant; - haplotypeString = haplotypeString.substring(0,haplotypeSize); - - haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus)); - } - - return haplotypeMap; - } - - private static class Event { - public Allele ref; - public Allele alt; - public int pos; - - public Event( final Allele ref, final Allele alt, final int pos ) { - this.ref = ref; - this.alt = alt; - this.pos = pos; - } - } - - /** - * Get the score (an estimate of the support) of this haplotype - * @return a double, where higher values are better - */ - public double getScore() { - return score; - } - - /** - * Set the score (an estimate of the support) of this haplotype. 
- * - * Note that if this is the reference haplotype it is always given Double.MAX_VALUE score - * - * @param score a double, where higher values are better - */ - public void setScore(double score) { - this.score = score; - } - - /** - * Comparator used to sort haplotypes, alphanumerically. - * - *

- * If one haplotype is the prefix of the other, the shorter one comes first. - *

- */ - public static final Comparator ALPHANUMERICAL_COMPARATOR = new Comparator() { - - @Override - public int compare(final Haplotype o1, final Haplotype o2) { - if (o1 == o2) - return 0; - final byte[] bases1 = o1.getBases(); - final byte[] bases2 = o2.getBases(); - final int iLimit = Math.min(bases1.length, bases2.length); - for (int i = 0; i < iLimit; i++) { - final int cmp = Byte.compare(bases1[i], bases2[i]); - if (cmp != 0) return cmp; - } - if (bases1.length == bases2.length) return 0; - return (bases1.length > bases2.length) ? -1 : 1; // is a bit better to get the longest haplotypes first. - } - }; - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GATKDoclet.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GATKDoclet.java deleted file mode 100644 index bd03add22..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GATKDoclet.java +++ /dev/null @@ -1,576 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.help; - -import com.google.gson.ExclusionStrategy; -import com.google.gson.FieldAttributes; -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import com.google.gson.annotations.Expose; -import com.google.gson.stream.JsonWriter; -import com.sun.javadoc.ClassDoc; -import com.sun.javadoc.RootDoc; -import freemarker.template.Configuration; -import freemarker.template.DefaultObjectWrapper; -import freemarker.template.Template; -import freemarker.template.TemplateException; -import org.apache.commons.io.FileUtils; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import htsjdk.tribble.FeatureCodec; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.tools.walkers.qc.DocumentationTest; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.text.XReadLines; - -import java.io.*; -import java.util.*; - -/** - * Javadoc Doclet that combines javadoc, GATK ParsingEngine annotations, and FreeMarker - * templates to produce PHP formatted GATKDocs for walkers - * and other classes. - *

- * This document has the following workflow: - *

- * 1 -- walk the javadoc hierarchy, looking for class that have the - * DocumentedGATKFeature annotation or are in the type hierarchy in the - * static list of things to document, and are to be documented - * 2 -- construct for each a GATKDocWorkUnit, resulting in the complete - * set of things to document - * 3 -- for each unit, actually generate a PHP page documenting it - * as well as links to related features via their units. Writing - * of a specific class PHP is accomplished by a generate DocumentationHandler - * 4 -- write out an index of all units, organized by group - * 5 -- emit JSON version of GATKDocs using Google GSON (currently incomplete but workable) - *

- * The documented classes are restricted to only those with @DocumentedGATKFeature - * annotation or are in the STATIC_DOCS class. - */ -public class GATKDoclet { - final protected static Logger logger = Logger.getLogger(GATKDoclet.class); - - /** - * Where we find the help FreeMarker templates - */ - final protected static File SETTINGS_DIR = new File("settings/helpTemplates"); - - /** - * Where we write the GATKDoc PHP directory - */ - final protected static File DESTINATION_DIR = new File("gatkdocs"); - - final private static String FORUM_KEY_PATH = "/local/gsa-engineering/gatkdocs_publisher/forum.key"; - // ---------------------------------------------------------------------- - // - // Global variables that are set on the command line by javadoc - // - // ---------------------------------------------------------------------- - protected static File settingsDir = SETTINGS_DIR; - protected static File destinationDir = DESTINATION_DIR; - protected static String forumKeyPath = FORUM_KEY_PATH; - protected static String buildTimestamp = null, absoluteVersion = null; - protected static boolean showHiddenFeatures = false; - - protected static boolean testOnly = false; - - /** - * Any class that's in this list will be included in the documentation - * when the -test argument is provided. Useful for debugging. - */ - private static final List> testOnlyKeepers = Arrays.asList( - DocumentationTest.class, CommandLineGATK.class, UserException.class); - - /** - * The javadoc root doc - */ - RootDoc rootDoc; - - /** - * The set of all things we are going to document - */ - Set myWorkUnits; - - /** - * A static list of DocumentedGATKFeatureObjects. Any class that is as or extends - * one of the DocumentedGATKFeatureObjects.clazz of this collection will also - * be documented, even if it doesn't have the @DocumentedGATKFeature annotation. 
Useful - * when you want to document things that implement an interface (annotations on java - * interfaces aren't inherited) or whose base class isn't under your control (tribble - * codecs). - */ - final static Collection STATIC_DOCS = new ArrayList(); - - static { - STATIC_DOCS.add(new DocumentedGATKFeatureObject(FeatureCodec.class, - HelpConstants.DOCS_CAT_RODCODECS, - "Tribble codecs for reading reference ordered data (ROD) files such as VCF or BED", - "NA")); - } - - /** - * Extracts the contents of certain types of javadoc and adds them to an XML file. - * - * @param rootDoc The documentation root. - * @return Whether the JavaDoc run succeeded. - * @throws java.io.IOException if output can't be written. - */ - public static boolean start(RootDoc rootDoc) throws IOException { - logger.setLevel(Level.INFO); - - // load arguments - for (String[] options : rootDoc.options()) { - if (options[0].equals("-settings-dir")) - settingsDir = new File(options[1]); - if (options[0].equals("-destination-dir")) - destinationDir = new File(options[1]); - if (options[0].equals("-forum-key-path")) - forumKeyPath = options[1]; - if (options[0].equals("-build-timestamp")) - buildTimestamp = options[1]; - if (options[0].equals("-absolute-version")) - absoluteVersion = options[1]; - if (options[0].equals("-include-hidden")) - showHiddenFeatures = true; - if (options[0].equals("-test")) - testOnly = true; - } - - if (!settingsDir.exists()) - throw new RuntimeException("-settings-dir " + settingsDir.getPath() + " does not exist"); - else if (!settingsDir.isDirectory()) - throw new RuntimeException("-settings-dir " + settingsDir.getPath() + " is not a directory"); - - // process the docs - new GATKDoclet().processDocs(rootDoc); - - return true; - } - - /** - * Validate the given options against options supported by this doclet. - * - * @param option Option to validate. - * @return Number of potential parameters; 0 if not supported. 
- */ - public static int optionLength(String option) { - if (option.equals("-settings-dir") || - option.equals("-destination-dir") || - option.equals("-forum-key-path") || - option.equals("-build-timestamp") || - option.equals("-absolute-version") || - option.equals("-include-hidden")) { - return 2; - } else if (option.equals("-test")) - return 1; - else - return 0; - } - - /** - * Are we supposed to include @Hidden annotations in our documented output? - * - * @return - */ - public boolean showHiddenFeatures() { - return showHiddenFeatures; - } - - /** - * @param rootDoc - */ - private void processDocs(RootDoc rootDoc) { - // setup the global access to the root - this.rootDoc = rootDoc; - - try { - // print the Version number - FileUtils.writeByteArrayToFile(new File(destinationDir + "/current.version.txt"), getSimpleVersion(absoluteVersion).getBytes()); - - /* ------------------------------------------------------------------- */ - /* You should do this ONLY ONCE in the whole application life-cycle: */ - - Configuration cfg = new Configuration(); - // Specify the data source where the template files come from. - cfg.setDirectoryForTemplateLoading(settingsDir); - // Specify how templates will see the data-model. This is an advanced topic... 
- cfg.setObjectWrapper(new DefaultObjectWrapper()); - - myWorkUnits = computeWorkUnits(); - - List> groups = new ArrayList>(); - Set seenDocumentationFeatures = new HashSet(); - List> data = new ArrayList>(); - for (GATKDocWorkUnit workUnit : myWorkUnits) { - data.add(workUnit.indexDataMap()); - if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { - groups.add(toMap(workUnit.annotation)); - seenDocumentationFeatures.add(workUnit.annotation.groupName()); - } - } - - for (GATKDocWorkUnit workUnit : myWorkUnits) { - processDocWorkUnit(cfg, workUnit, groups, data); - } - - processIndex(cfg, new ArrayList(myWorkUnits)); - - File forumKeyFile = new File(forumKeyPath); - if (forumKeyFile.exists()) { - String forumKey = null; - // Read in a one-line file so we can do a for loop - for (String line : new XReadLines(forumKeyFile)) - forumKey = line; - updateForum(myWorkUnits, forumKey); - } - } catch (FileNotFoundException e) { - throw new RuntimeException(e); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private void updateForum(Set docWorkUnits, String forumKey) { - //first get list of posts that need to be added - List old = ForumAPIUtils.getPostedTools(forumKey); - - for (String s : old) - System.out.println(s); - - System.out.printf("Forum has %d items%n", old.size()); - System.out.printf("Docs have %d items%n", docWorkUnits.size()); - - List toAdd = new ArrayList(); - for (GATKDocWorkUnit tool : docWorkUnits) { - if (!old.contains(tool.name)) { - System.out.println("WILL POST: " + tool.name + " TO FORUM"); - toAdd.add(tool); - } - } - - //update using list - for (GATKDocWorkUnit tool : toAdd) { - //if ( tool.name.equals("ApplyRecalibration") ) - ForumAPIUtils.postToForum(tool, forumKey); - } - } - - /** - * Returns the set of all GATKDocWorkUnits that we are going to generate docs for. 
- * - * @return - */ - private Set computeWorkUnits() { - TreeSet m = new TreeSet(); - - for (ClassDoc doc : rootDoc.classes()) { - //logger.debug("Considering " + doc); - Class clazz = getClassForClassDoc(doc); - - // don't add anything that's not DocumentationTest if we are in test mode - if (clazz != null && testOnly && !testOnlyKeepers.contains(clazz)) - continue; - - //if ( clazz != null && clazz.getName().equals("org.broadinstitute.gatk.tools.walkers.annotator.AlleleBalance")) - // logger.debug("foo"); - - DocumentedGATKFeatureObject feature = getFeatureForClassDoc(doc); - DocumentedGATKFeatureHandler handler = createHandler(doc, feature); - if (handler != null && handler.includeInDocs(doc)) { - //logger.info("Generating documentation for class " + doc); - String filename = handler.getDestinationFilename(doc, clazz); - GATKDocWorkUnit unit = new GATKDocWorkUnit(doc.name(), - filename, feature.groupName(), feature, handler, doc, clazz, - buildTimestamp, absoluteVersion); - m.add(unit); - } - } - - return m; - } - - /** - * Create a handler capable of documenting the class doc according to feature. Returns - * null if no appropriate handler is found or doc shouldn't be documented at all. - * - * @param doc - * @param feature - * @return - */ - private DocumentedGATKFeatureHandler createHandler(ClassDoc doc, DocumentedGATKFeatureObject feature) { - if (feature != null) { - if (feature.enable()) { - DocumentedGATKFeatureHandler handler = new GenericDocumentationHandler(); - handler.setDoclet(this); - return handler; - } else { - logger.info("Skipping disabled Documentation for " + doc); - } - } - - return null; - } - - /** - * Returns the instantiated DocumentedGATKFeatureObject that describes the GATKDoc - * structure we will apply to Doc. 
- * - * @param doc - * @return null if this proves inappropriate or doc shouldn't be documented - */ - private DocumentedGATKFeatureObject getFeatureForClassDoc(ClassDoc doc) { - Class docClass = getClassForClassDoc(doc); - - if (docClass == null) - return null; // not annotated so it shouldn't be documented - - if (docClass.isAnnotationPresent(DocumentedGATKFeature.class)) { - DocumentedGATKFeature f = docClass.getAnnotation(DocumentedGATKFeature.class); - return new DocumentedGATKFeatureObject(docClass, f.enable(), f.groupName(), f.summary(), f.extraDocs(), f.gotoDev()); - } else { - for (DocumentedGATKFeatureObject staticDocs : STATIC_DOCS) { - if (staticDocs.getClassToDoc().isAssignableFrom(docClass)) { - return new DocumentedGATKFeatureObject(docClass, staticDocs.enable(), staticDocs.groupName(), staticDocs.summary(), staticDocs.extraDocs(), staticDocs.gotoDev()); - } - } - return null; - } - } - - /** - * Return the Java class described by the ClassDoc doc - * - * @param doc - * @return - */ - private Class getClassForClassDoc(ClassDoc doc) { - try { - // todo -- what do I need the ? extends Object to pass the compiler? - return (Class) DocletUtils.getClassForDoc(doc); - } catch (ClassNotFoundException e) { - //logger.warn("Couldn't find class for ClassDoc " + doc); - // we got a classdoc for a class we can't find. 
Maybe in a library or something - return null; - } catch (NoClassDefFoundError e) { - return null; - } catch (UnsatisfiedLinkError e) { - return null; // naughty BWA bindings - } - } - - /** - * Create the php index listing all of the GATKDocs features - * - * @param cfg - * @param indexData - * @throws IOException - */ - private void processIndex(Configuration cfg, List indexData) throws IOException { - /* Get or create a template */ - Template temp = cfg.getTemplate("generic.index.template.html"); - - /* Merge data-model with template */ - Writer out = new OutputStreamWriter(new FileOutputStream(new File(destinationDir + "/index.php"))); - try { - temp.process(groupIndexData(indexData), out); - out.flush(); - } catch (TemplateException e) { - throw new ReviewedGATKException("Failed to create GATK documentation", e); - } - } - - /** - * Helpful function to create the php index. Given all of the already run GATKDocWorkUnits, - * create the high-level grouping data listing individual features by group. - * - * @param indexData - * @return - */ - private Map groupIndexData(List indexData) { - // - // root -> data -> { summary -> y, filename -> z }, etc - // -> groups -> group1, group2, etc. 
- Map root = new HashMap(); - - - Collections.sort(indexData); - - List> groups = new ArrayList>(); - Set seenDocumentationFeatures = new HashSet(); - List> data = new ArrayList>(); - for (GATKDocWorkUnit workUnit : indexData) { - data.add(workUnit.indexDataMap()); - if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { - groups.add(toMap(workUnit.annotation)); - seenDocumentationFeatures.add(workUnit.annotation.groupName()); - } - } - - //System.out.printf(groups.toString()); - - root.put("data", data); - root.put("groups", groups); - root.put("timestamp", buildTimestamp); - root.put("version", absoluteVersion); - - return root; - } - - /** - * Trivial helper routine that returns the map of name and summary given the annotation - * AND adds a super-category so that we can custom-order the categories in the index - * - * @param annotation - * @return - */ - private static final Map toMap(DocumentedGATKFeatureObject annotation) { - Map root = new HashMap(); - root.put("id", annotation.groupName().replaceAll("\\W", "")); - root.put("name", annotation.groupName()); - root.put("summary", annotation.summary()); - - /** - * Add-on super-category definitions. The assignments depend on parsing the names - * defined in HelpConstants.java so be careful of changing anything. - * Also, the super-category value strings need to be the same as used in the - * Freemarker template. This is all fairly clunky but the best I could do without - * making major changes to the DocumentedGATKFeatureObject. Doesn't help that - * Freemarker makes any scripting horribly awkward. 
- */ - final String supercatValue; - if (annotation.groupName().endsWith(" Tools")) supercatValue = "tools"; - else if (annotation.groupName().endsWith(" Utilities")) supercatValue = "utilities"; - else if (annotation.groupName().startsWith("Engine ")) supercatValue = "engine"; - else if (annotation.groupName().endsWith(" (DevZone)")) supercatValue = "dev"; - else supercatValue = "other"; - - root.put("supercat", supercatValue); - - return root; - } - - /** - * Helper function that finding the GATKDocWorkUnit associated with class from among all of the work units - * - * @param c the class we are looking for - * @return the GATKDocWorkUnit whose .clazz.equals(c), or null if none could be found - */ - public final GATKDocWorkUnit findWorkUnitForClass(Class c) { - for (final GATKDocWorkUnit unit : this.myWorkUnits) - if (unit.clazz.equals(c)) - return unit; - return null; - } - - /** - * Return the ClassDoc associated with clazz - * - * @param clazz - * @return - */ - public ClassDoc getClassDocForClass(Class clazz) { - return rootDoc.classNamed(clazz.getName()); - } - - /** - * High-level function that processes a single DocWorkUnit unit using its handler - * - * @param cfg - * @param unit - * @param data - * @throws IOException - */ - private void processDocWorkUnit(Configuration cfg, GATKDocWorkUnit unit, List> groups, List> data) - throws IOException { - //System.out.printf("Processing documentation for class %s%n", unit.classDoc); - unit.handler.processOne(unit); - unit.forTemplate.put("groups", groups); - unit.forTemplate.put("data", data); - // Get or create a template - Template temp = cfg.getTemplate(unit.handler.getTemplateName(unit.classDoc)); - - // Merge data-model with template - File outputPath = new File(destinationDir + "/" + unit.filename); - try { - Writer out = new OutputStreamWriter(new FileOutputStream(outputPath)); - temp.process(unit.forTemplate, out); - out.flush(); - } catch (TemplateException e) { - throw new ReviewedGATKException("Failed 
to create GATK documentation", e); - } - - // Create GSON-friendly object from unit.forTemplate - GSONWorkUnit gsonworkunit = new GSONWorkUnit(); - gsonworkunit.populate( unit.forTemplate.get("summary").toString(), - unit.forTemplate.get("parallel"), - unit.forTemplate.get("activeregion"), - unit.forTemplate.get("partitiontype").toString(), - unit.forTemplate.get("walkertype").toString(), - unit.forTemplate.get("gson-arguments"), - unit.forTemplate.get("refwindow"), - unit.forTemplate.get("description").toString(), - unit.forTemplate.get("name").toString(), - unit.forTemplate.get("annotinfo").toString(), - unit.forTemplate.get("readfilters"), - unit.forTemplate.get("downsampling"), - unit.forTemplate.get("group").toString(), - unit.forTemplate.get("annotfield").toString(), - unit.forTemplate.get("annotdescript") - ); - - // Prepare to write JSON entry to file - File outputPathForJSON = new File(destinationDir + "/" + unit.filename + ".json"); - - try { - BufferedWriter outJSON = new BufferedWriter(new FileWriter(outputPathForJSON)); - // Convert object to JSON - Gson gson = new GsonBuilder() - .serializeSpecialFloatingPointValues() - .setPrettyPrinting() - .create(); - String json = gson.toJson(gsonworkunit); // was run on unit.forTemplate - outJSON.write(json); - outJSON.close(); - - } catch (Exception e) { - throw new ReviewedGATKException("Failed to create JSON entry", e); - } - } - - private static String getSimpleVersion(String absoluteVersion) { - String[] parts = absoluteVersion.split("-"); - - // by skipping i=0, there is no trailing separator - for (int i = 1; i < 2; i++) { - parts[0] = parts[0].concat("-"); - parts[0] = parts[0].concat(parts[i]); - } - - return parts[0]; - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GenericDocumentationHandler.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GenericDocumentationHandler.java deleted file mode 100644 index fea149627..000000000 
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GenericDocumentationHandler.java +++ /dev/null @@ -1,1008 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.help; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import com.sun.javadoc.ClassDoc; -import com.sun.javadoc.FieldDoc; -import com.sun.javadoc.Tag; -import org.apache.commons.lang.StringUtils; -import org.apache.log4j.Logger; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.engine.walkers.*; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.refdata.tracks.FeatureManager; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.classloader.JVMUtils; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.GATKException; - -import java.io.IOException; -import java.lang.annotation.Annotation; -import java.lang.reflect.*; -import java.util.*; - -/** - * - */ -public class GenericDocumentationHandler extends DocumentedGATKFeatureHandler { - private static Logger logger = Logger.getLogger(GenericDocumentationHandler.class); - - /** - * The max. length of the longest of --fullName -shortName argument name - * before we prefer the shorter option. 
- */ - private static final int MAX_DISPLAY_NAME = 30; - - /** - * The Class we are documenting - */ - private GATKDocWorkUnit toProcess; - - @Override - public boolean includeInDocs(ClassDoc doc) { - try { - Class type = DocletUtils.getClassForDoc(doc); - boolean hidden = !getDoclet().showHiddenFeatures() && type.isAnnotationPresent(Hidden.class); - return !hidden && JVMUtils.isConcrete(type); - } catch (ClassNotFoundException e) { - return false; - } - } - - - @Override - public String getTemplateName(ClassDoc doc) throws IOException { - return "generic.template.html"; - } - - @Override - public void processOne(GATKDocWorkUnit toProcessArg) { - this.toProcess = toProcessArg; - - //System.out.printf("%s class %s%n", toProcess.group, toProcess.classDoc); - Map root = new HashMap(); - - addHighLevelBindings(root); - addArgumentBindings(root); - addRelatedBindings(root); - root.put("group", toProcess.group); - - // Adding in retrieval of peripheral info (rf annotations etc) - getClazzAnnotations(toProcess.clazz, root); - - toProcess.setHandlerContent((String) root.get("summary"), root); - } - - /** - * Add high-level summary information about toProcess to root, such as its - * name, summary, description, version, etc. - * - * @param root - */ - protected void addHighLevelBindings(Map root) { - root.put("name", toProcess.classDoc.name()); - - // Extract overrides from the doc tags. 
- StringBuilder summaryBuilder = new StringBuilder(); - for (Tag tag : toProcess.classDoc.firstSentenceTags()) - summaryBuilder.append(tag.text()); - root.put("summary", summaryBuilder.toString()); - root.put("description", toProcess.classDoc.commentText().substring(summaryBuilder.toString().length())); - root.put("timestamp", toProcess.buildTimestamp); - root.put("version", toProcess.absoluteVersion); - - for (Tag tag : toProcess.classDoc.tags()) { - root.put(tag.name(), tag.text()); - } - - root.put("gotoDev", toProcess.annotation.gotoDev()); - } - - /** - * Add bindings describing related GATK capabilites to toProcess - * - * @param root - */ - protected void addRelatedBindings(Map root) { - List> extraDocsData = new ArrayList>(); - - // add in all of the explicitly related items - for (final Class extraDocClass : toProcess.annotation.extraDocs()) { - final GATKDocWorkUnit otherUnit = getDoclet().findWorkUnitForClass(extraDocClass); - if (otherUnit == null) - throw new ReviewedGATKException("Requested extraDocs for class without any documentation: " + extraDocClass); - extraDocsData.add( - new HashMap() {{ - put("filename", otherUnit.filename); - put("name", otherUnit.name); - }}); - } - root.put("extradocs", extraDocsData); - } - - /** - * Add information about all of the arguments available to toProcess to root - * - * @param root - */ - protected void addArgumentBindings(Map root) { - ParsingEngine parsingEngine = createStandardGATKParsingEngine(); - - Map>> args = createArgumentMap(); - root.put("arguments", args); - try { - // loop over all of the arguments according to the parsing engine - for (final ArgumentSource argumentSource : parsingEngine.extractArgumentSources(DocletUtils.getClassForDoc(toProcess.classDoc))) { - ArgumentDefinition argDef = argumentSource.createArgumentDefinitions().get(0); - FieldDoc fieldDoc = getFieldDoc(toProcess.classDoc, argumentSource.field.getName()); - Map argBindings = docForArgument(fieldDoc, argumentSource, argDef); - if 
(!argumentSource.isHidden() || getDoclet().showHiddenFeatures()) { - final String kind = docKindOfArg(argumentSource); - argBindings.put("kind", kind); - // Retrieve default value - final Object value = argumentValue(toProcess.clazz, argumentSource); - if (value != null) { - argBindings.put("defaultValue", prettyPrintValueString(value)); - } else { - argBindings.put("defaultValue", "NA"); - } - // Retrieve min and max / hard and soft value thresholds for numeric args - if (value instanceof Number) { - if (argumentSource.field.isAnnotationPresent(Argument.class)) { - argBindings.put("minValue", argumentSource.field.getAnnotation(Argument.class).minValue()); - argBindings.put("maxValue", argumentSource.field.getAnnotation(Argument.class).maxValue()); - if (argumentSource.field.getAnnotation(Argument.class).minRecommendedValue() != Double.NEGATIVE_INFINITY) { - argBindings.put("minRecValue", argumentSource.field.getAnnotation(Argument.class).minRecommendedValue()); - } else { - argBindings.put("minRecValue", "NA"); - } - if (argumentSource.field.getAnnotation(Argument.class).maxRecommendedValue() != Double.POSITIVE_INFINITY) { - argBindings.put("maxRecValue", argumentSource.field.getAnnotation(Argument.class).maxRecommendedValue()); - } else { - argBindings.put("maxRecValue", "NA"); - } - } - } else { - argBindings.put("minValue", "NA"); - argBindings.put("maxValue", "NA"); - argBindings.put("minRecValue", "NA"); - argBindings.put("maxRecValue", "NA"); - argBindings.put("defaultValue", "NA"); - } - // Finalize argument bindings - args.get(kind).add(argBindings); - args.get("all").add(argBindings); - } - } - - // sort the arguments - for (Map.Entry>> entry : args.entrySet()) { - entry.setValue(sortArguments(entry.getValue())); - } - // make a GSON-friendly map of arguments -- uses some hacky casting - List allGSONArgs = new ArrayList(); - for ( Map item : args.get("all")) { - GSONArgument itemGSONArg = new GSONArgument(); - - 
itemGSONArg.populate(item.get("summary").toString(), - item.get("name").toString(), - item.get("synonyms").toString(), - item.get("type").toString(), - item.get("required").toString(), - item.get("fulltext").toString(), - item.get("defaultValue").toString(), - item.get("minValue").toString(), - item.get("maxValue").toString(), - item.get("minRecValue").toString(), - item.get("maxRecValue").toString(), - item.get("rodTypes").toString(), - item.get("kind").toString(), - (List>)item.get("options") - ); - allGSONArgs.add(itemGSONArg); - } - root.put("gson-arguments", allGSONArgs); - - } catch (ClassNotFoundException e) { - throw new RuntimeException(e); - } - } - - /** - * Return the argument kind (required, advanced, hidden, etc) of this argumentSource - * - * @param argumentSource - * @return - */ - @Requires("argumentSource != null") - @Ensures("result != null") - private String docKindOfArg(ArgumentSource argumentSource) { - if (argumentSource.isRequired()) { - if (argumentSource.isInput()) return "required_in"; - else if (argumentSource.isOutput()) return "required_out"; - else if (argumentSource.isFlag()) return "required_flag"; - else return "required_param"; - } - else if (argumentSource.isAdvanced()) { - if (argumentSource.isInput()) return "advanced_in"; - else if (argumentSource.isOutput()) return "advanced_out"; - else if (argumentSource.isFlag()) return "advanced_flag"; - else return "advanced_param"; - } - else if (argumentSource.isHidden()) return "hidden"; - else if (argumentSource.isDeprecated()) return "deprecated"; - else { - if (argumentSource.isInput()) return "optional_in"; - else if (argumentSource.isOutput()) return "optional_out"; - else if (argumentSource.isFlag()) return "optional_flag"; - else return "optional_param"; - } - } - - /** - * Attempts to determine the value of argumentSource in an instantiated version of c - * - * @param c - * @param argumentSource - * @return value of argumentSource, or null if this isn't possible - */ - 
@Requires({"c != null", "argumentSource != null"}) - private Object argumentValue(Class c, ArgumentSource argumentSource) { - // get the value of the field - // attempt to instantiate the class - final Object instance = makeInstanceIfPossible(toProcess.clazz); - if (instance != null) { - final Object value = getFieldValue(instance, argumentSource.field.getName()); - if (value != null) - return value; - - if (argumentSource.createsTypeDefault()) { - try { // handle the case where there's an implicit default - return argumentSource.typeDefaultDocString(); - } catch (ReviewedGATKException e) { - ; // failed to create type default, don't worry about it - } - } - } - - return null; - } - - /** - * Create the argument map for holding class arguments - * - * @return - */ - private Map>> createArgumentMap() { - Map>> args = new HashMap>>(); - args.put("all", new ArrayList>()); - args.put("required_in", new ArrayList>()); - args.put("required_out", new ArrayList>()); - args.put("required_param", new ArrayList>()); - args.put("required_flag", new ArrayList>()); - args.put("optional_in", new ArrayList>()); - args.put("optional_out", new ArrayList>()); - args.put("optional_param", new ArrayList>()); - args.put("optional_flag", new ArrayList>()); - args.put("advanced_in", new ArrayList>()); - args.put("advanced_out", new ArrayList>()); - args.put("advanced_param", new ArrayList>()); - args.put("advanced_flag", new ArrayList>()); - args.put("hidden", new ArrayList>()); - args.put("deprecated", new ArrayList>()); - return args; - } - - - /** - * Sorts the individual argument list in unsorted according to CompareArgumentsByName - * - * @param unsorted - * @return - */ - private List> sortArguments(List> unsorted) { - Collections.sort(unsorted, new CompareArgumentsByName()); - return unsorted; - } - - /** - * Sort arguments by case-insensitive comparison ignoring the -- and - prefixes - */ - private class CompareArgumentsByName implements Comparator> { - public int compare(Map x, 
Map y) { - return elt(x).compareTo(elt(y)); - } - - private String elt(Map m) { - String v = m.get("name").toString().toLowerCase(); - if (v.startsWith("--")) - return v.substring(2); - else if (v.startsWith("-")) - return v.substring(1); - else - throw new RuntimeException("Expect to see arguments beginning with at least one -, but found " + v); - } - } - - /** - * Umbrella function that groups the collection of values for specific annotations applied to an - * instance of class c. Lists of collected values are added directly to the "toProcess" object. - * Requires being able to instantiate the class. - * - * @param classToProcess the object to instantiate and query for the annotation - * @param root the root of the document handler, to which we'll store collected annotations - */ - private void getClazzAnnotations(Class classToProcess, Map root) { - // - // attempt to instantiate the class - final Object instance = makeInstanceIfPossible(classToProcess); - if (instance != null) { - final Class myClass = instance.getClass(); - // Get parallelism options - final HashSet> parallelOptions = getParallelism(myClass, new HashSet>()); - root.put("parallel", parallelOptions); - // Get annotation info (what type of annotation, standard etc.) 
- final HashSet annotInfo = getAnnotInfo(myClass, new HashSet()); - root.put("annotinfo", StringUtils.join(annotInfo, ", ")); - // Get annotation field (whether it goes in INFO or FORMAT) - root.put("annotfield", getAnnotField(myClass)); - // Get walker type if applicable - root.put("walkertype", getWalkerType(myClass)); - // Get partition type if applicable - root.put("partitiontype", getPartitionType(myClass)); - // Get read filter annotations (ReadFilters) if applicable - final HashSet> bucket= getReadFilters(myClass, new HashSet>()); - root.put("readfilters", bucket); - // Get default downsampling settings - final HashMap dsSettings = getDownSamplingSettings(myClass, new HashMap()); - root.put("downsampling", dsSettings); - // Get reference window size settings - final HashMap refwindow = getRefWindow(myClass, new HashMap()); - root.put("refwindow", refwindow); - // Get ActiveRegion size settings - final HashMap activeRegion = getActiveRegion(myClass, new HashMap()); - root.put("activeregion", activeRegion); - // Get annotation header line description if applicable - final Object annotDescriptLines = getAnnotDescript(instance, myClass); - root.put("annotdescript", annotDescriptLines); - - // anything else? - } else { - // put empty items to avoid blowups - root.put("parallel", new HashSet()); - root.put("annotinfo", ""); - root.put("annotfield", ""); - root.put("walkertype", ""); - root.put("partitiontype", ""); - root.put("readfilters", new HashSet>()); - root.put("downsampling", new HashMap()); - root.put("refwindow", new HashMap()); - root.put("activeregion", new HashMap()); - root.put("annotdescript", new ArrayList>()); - } - } - - /** - * Utility function that looks up annotation descriptions if applicable. 
- * - * @param myClass the class to query - * @return a hash map of descriptions, otherwise an empty map - */ - private Object getAnnotDescript(Object instance, Class myClass) { - // - // Check if the class has the method we want - for (Method classMethod : myClass.getMethods()) { - if (classMethod.toString().contains("getDescriptions") && classMethod.toString().contains("annotator")) { - try { - return classMethod.invoke(instance); - } catch (IllegalArgumentException e) { - } catch (IllegalAccessException e) { - } catch (InvocationTargetException e) { - } - } - } - return null; - } - - /** - * Utility function that checks which parallelism options are available for an instance of class c. - * - * @param myClass the class to query for the interfaces - * @param parallelOptions an empty HashSet in which to collect the info - * @return a hash set of parallelism options, otherwise an empty set - */ - private HashSet> getParallelism(Class myClass, HashSet> parallelOptions) { - // - // Retrieve interfaces - Class[] implementedInterfaces = myClass.getInterfaces(); - for (Class intfClass : implementedInterfaces) { - final HashMap nugget = new HashMap(); - if (intfClass.getSimpleName().equals("TreeReducible")) { - nugget.put("name", intfClass.getSimpleName()); - nugget.put("arg", HelpConstants.ARG_TREEREDUCIBLE); - nugget.put("link", HelpConstants.CMDLINE_GATK_URL + "#" + HelpConstants.ARG_TREEREDUCIBLE); - } else if (intfClass.getSimpleName().equals("NanoSchedulable")) { - nugget.put("name", intfClass.getSimpleName()); - nugget.put("arg", HelpConstants.ARG_NANOSCHEDULABLE); - nugget.put("link", HelpConstants.CMDLINE_GATK_URL + "#" + HelpConstants.ARG_NANOSCHEDULABLE); - } else { - continue; - } - parallelOptions.add(nugget); - } - // Look up superclasses recursively - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Object")) { - return parallelOptions; - } - return getParallelism(mySuperClass, parallelOptions); - } - - /** - * 
Utility function that looks up whether the annotation goes in INFO or FORMAT field. - * - * @param myClass the class to query for the interfaces - * @return a String specifying the annotation field - */ - private final String getAnnotField(Class myClass) { - // - // Look up superclasses recursively until we find either - // GenotypeAnnotation or InfoFieldAnnotation - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass == InfoFieldAnnotation.class) { - return "INFO (variant-level)"; - } else if (mySuperClass == GenotypeAnnotation.class) { - return "FORMAT (sample genotype-level)"; - } else if (mySuperClass.getSimpleName().equals("Object")) { - return ""; - } - return getAnnotField(mySuperClass); - } - - /** - * Utility function that determines the annotation type for an instance of class c. - * - * @param myClass the class to query for the interfaces - * @param annotInfo an empty HashSet in which to collect the info - * @return a hash set of the annotation types, otherwise an empty set - */ - private HashSet getAnnotInfo(Class myClass, HashSet annotInfo) { - // - // Retrieve interfaces - Class[] implementedInterfaces = myClass.getInterfaces(); - for (Class intfClass : implementedInterfaces) { - if (intfClass.getName().contains("Annotation")) { - annotInfo.add(intfClass.getSimpleName()); - } - } - // Look up superclasses recursively - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Object")) { - return annotInfo; - } - return getAnnotInfo(mySuperClass, annotInfo); - } - - /** - * Utility function that determines the default downsampling settings for an instance of class c. 
- * - * @param myClass the class to query for the settings - * @param dsSettings an empty HashMap in which to collect the info - * @return a hash set of the downsampling settings, otherwise an empty set - */ - private HashMap getDownSamplingSettings(Class myClass, HashMap dsSettings) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(Downsample.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(Downsample.class); - if(thisAnnotation instanceof Downsample) { - final Downsample dsAnnotation = (Downsample) thisAnnotation; - dsSettings.put("by", dsAnnotation.by().toString()); - dsSettings.put("to_cov", dsAnnotation.toCoverage()); - } - } - return dsSettings; - } - - /** - * Utility function that determines the reference window size for an instance of class c. - * - * @param myClass the class to query for the settings - * @param refWindow an empty HashMap in which to collect the info - * @return a HashMap of the window start and stop, otherwise an empty HashMap - */ - private HashMap getRefWindow(Class myClass, HashMap refWindow) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(Reference.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(Reference.class); - if(thisAnnotation instanceof Reference) { - final Reference refAnnotation = (Reference) thisAnnotation; - refWindow.put("start", refAnnotation.window().start()); - refWindow.put("stop", refAnnotation.window().stop()); - } - } - return refWindow; - } - - /** - * Utility function that determines the ActiveRegion settings for an instance of class c. 
- * - * @param myClass the class to query for the settings - * @param activeRegion an empty HashMap in which to collect the info - * @return a HashMap of the ActiveRegion parameters, otherwise an empty HashMap - */ - private HashMap getActiveRegion(Class myClass, HashMap activeRegion) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(ActiveRegionTraversalParameters.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(ActiveRegionTraversalParameters.class); - if(thisAnnotation instanceof ActiveRegionTraversalParameters) { - final ActiveRegionTraversalParameters arAnnotation = (ActiveRegionTraversalParameters) thisAnnotation; - activeRegion.put("ext", arAnnotation.extension()); - activeRegion.put("max", arAnnotation.maxRegion()); - activeRegion.put("min", arAnnotation.minRegion()); - } - } - return activeRegion; - } - - /** - * Utility function that determines the partition type of an instance of class c. - * - * @param myClass the class to query for the annotation - * @return the partition type if applicable, otherwise an empty string - */ - private String getPartitionType(Class myClass) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(PartitionBy.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(PartitionBy.class); - if(thisAnnotation instanceof PartitionBy) { - final PartitionBy partAnnotation = (PartitionBy) thisAnnotation; - return partAnnotation.value().toString(); - } - } - return ""; - } - - /** - * Utility function that determines the type of walker subclassed by an instance of class c. 
- * - * @param myClass the class to query for the annotation - * @return the type of walker if applicable, otherwise an empty string - */ - private String getWalkerType(Class myClass) { - // - // Look up superclasses recursively until we find either Walker or Object - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Walker")) { - return myClass.getSimpleName(); - } else if (mySuperClass.getSimpleName().equals("Object")) { - return ""; - } - return getWalkerType(mySuperClass); - } - - /** - * Utility function that finds the values of ReadFilters annotation applied to an instance of class c. - * - * @param myClass the class to query for the annotation - * @param bucket a container in which we store the annotations collected - * @return a hash set of values, otherwise an empty set - */ - private HashSet> getReadFilters(Class myClass, HashSet> bucket) { - // - // Retrieve annotation - if (myClass.isAnnotationPresent(ReadFilters.class)) { - final Annotation thisAnnotation = myClass.getAnnotation(ReadFilters.class); - if(thisAnnotation instanceof ReadFilters) { - final ReadFilters rfAnnotation = (ReadFilters) thisAnnotation; - for (Class filter : rfAnnotation.value()) { - // make hashmap of simplename and url - final HashMap nugget = new HashMap(); - nugget.put("name", filter.getSimpleName()); - nugget.put("filename", GATKDocUtils.phpFilenameForClass(filter)); - bucket.add(nugget); - } - } - } - // Look up superclasses recursively - final Class mySuperClass = myClass.getSuperclass(); - if (mySuperClass.getSimpleName().equals("Object")) { - return bucket; - } - return getReadFilters(mySuperClass, bucket); - } - - - /** - * Utility function that finds the value of fieldName in any fields of ArgumentCollection fields in - * instance of class c. 
- * - * @param instance the object to query for the field value - * @param fieldName the name of the field we are looking for in instance - * @return The value assigned to field in the ArgumentCollection, otherwise null - */ - private Object getFieldValue(Object instance, String fieldName) { - // - // subtle note. If you have a field named X that is an ArgumentCollection that - // contains a field X as well, you need only consider fields in the argumentCollection, not - // matching the argument itself. - // - // @ArgumentCollection - // protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - // - - for (Field field : JVMUtils.getAllFields(instance.getClass())) { - if (field.isAnnotationPresent(ArgumentCollection.class)) { - //System.out.printf("Searching for %s in argument collection field %s%n", fieldName, field); - Object fieldValue = JVMUtils.getFieldValue(field, instance); - Object value = getFieldValue(fieldValue, fieldName); - if (value != null) - return value; - } else if (field.getName().equals(fieldName)) { - return JVMUtils.getFieldValue(field, instance); - } - } - - return null; - } - - /** - * Pretty prints value - *

- * Assumes value != null - * - * @param value - * @return - */ - private Object prettyPrintValueString(Object value) { - if (value.getClass().isArray()) { - Class type = value.getClass().getComponentType(); - if (boolean.class.isAssignableFrom(type)) - return Arrays.toString((boolean[]) value); - if (byte.class.isAssignableFrom(type)) - return Arrays.toString((byte[]) value); - if (char.class.isAssignableFrom(type)) - return Arrays.toString((char[]) value); - if (double.class.isAssignableFrom(type)) - return Arrays.toString((double[]) value); - if (float.class.isAssignableFrom(type)) - return Arrays.toString((float[]) value); - if (int.class.isAssignableFrom(type)) - return Arrays.toString((int[]) value); - if (long.class.isAssignableFrom(type)) - return Arrays.toString((long[]) value); - if (short.class.isAssignableFrom(type)) - return Arrays.toString((short[]) value); - if (Object.class.isAssignableFrom(type)) - return Arrays.toString((Object[]) value); - else - throw new RuntimeException("Unexpected array type in prettyPrintValue. Value was " + value + " type is " + type); - } else if (RodBinding.class.isAssignableFrom(value.getClass())) { - // annoying special case to handle the UnBound() constructor - return "none"; - } else if (value instanceof String) { - return value.equals("") ? "\"\"" : value; - } else { - return value.toString(); - } - } - - /** - * Attempt to instantiate class c, if possible. Returns null if this proves impossible. 
- * - * @param c - * @return - */ - private Object makeInstanceIfPossible(Class c) { - Object instance = null; - try { - // don't try to make something where we will obviously fail - if (!c.isEnum() && !c.isAnnotation() && !c.isAnonymousClass() && - !c.isArray() && !c.isPrimitive() & JVMUtils.isConcrete(c)) { - instance = c.newInstance(); - //System.out.printf("Created object of class %s => %s%n", c, instance); - return instance; - } else - return null; - } catch (IllegalAccessException e) { - } catch (InstantiationException e) { - } catch (ExceptionInInitializerError e) { - } catch (SecurityException e) { - } - // this last one is super dangerous, but some of these methods catch ClassNotFoundExceptions - // and rethrow then as RuntimeExceptions - catch (RuntimeException e) { - } - - return instance; - } - - - /** - * Create an instance of the GATK parsing engine, for argument processing with GATKDoclet - * - * @return - */ - private ParsingEngine createStandardGATKParsingEngine() { - CommandLineProgram clp = new CommandLineGATK(); - try { - CommandLineProgram.start(clp, new String[]{}, true); - return clp.parser; - } catch (Exception e) { - throw new RuntimeException(e); - } - } - - /** - * Gets the javadocs associated with field name in classDoc. Throws a - * runtime exception if this proves impossible. 
- * - * @param classDoc - * @param name - * @return - */ - private FieldDoc getFieldDoc(ClassDoc classDoc, String name) { - return getFieldDoc(classDoc, name, true); - } - - /** - * Recursive helper routine to getFieldDoc() - * - * @param classDoc - * @param name - * @param primary - * @return - */ - private FieldDoc getFieldDoc(ClassDoc classDoc, String name, boolean primary) { - //System.out.printf("Looking for %s in %s%n", name, classDoc.name()); - for (FieldDoc fieldDoc : classDoc.fields(false)) { - //System.out.printf("fieldDoc " + fieldDoc + " name " + fieldDoc.name()); - if (fieldDoc.name().equals(name)) - return fieldDoc; - - Field field = DocletUtils.getFieldForFieldDoc(fieldDoc); - if (field == null) - throw new RuntimeException("Could not find the field corresponding to " + fieldDoc + ", presumably because the field is inaccessible"); - if (field.isAnnotationPresent(ArgumentCollection.class)) { - ClassDoc typeDoc = getRootDoc().classNamed(fieldDoc.type().qualifiedTypeName()); - if (typeDoc == null) - throw new ReviewedGATKException("Tried to get javadocs for ArgumentCollection field " + fieldDoc + " but could't find the class in the RootDoc"); - else { - FieldDoc result = getFieldDoc(typeDoc, name, false); - if (result != null) - return result; - // else keep searching - } - } - } - - // if we didn't find it here, wander up to the superclass to find the field - if (classDoc.superclass() != null) { - return getFieldDoc(classDoc.superclass(), name, false); - } - - if (primary) - throw new RuntimeException("No field found for expected field " + name); - else - return null; - } - - /** - * Returns a Pair of (main, synonym) names for argument with fullName s1 and - * shortName s2. - * - * Previously we had it so the main name was selected to be the longest of the two, provided - * it didn't exceed MAX_DISPLAY_NAME, in which case the shorter was taken. 
But we now disable - * the length-based name rearrangement in order to maintain consistency in the GATKDocs table. - * - * This may cause messed up spacing in the CLI-help display but we don't care as much about that - * since more users use the online GATKDocs for looking up arguments. - * - * @param s1 the short argument name without -, or null if not provided - * @param s2 the long argument name without --, or null if not provided - * @return A pair of fully qualified names (with - or --) for the argument. The first - * element is the primary display name while the second (potentially null) is a - * synonymous name. - */ - Pair displayNames(String s1, String s2) { - s1 = s1 == null ? null : "-" + s1; - s2 = s2 == null ? null : "--" + s2; - - if (s1 == null) return new Pair(s2, null); - if (s2 == null) return new Pair(s1, null); - - return new Pair(s2, s1); - } - - /** - * Returns a human readable string that describes the Type type of a GATK argument. - *

- * This will include parameterized types, so that Set{T} shows up as Set(T) and not - * just Set in the docs. - * - * @param type - * @return - */ - protected String argumentTypeString(Type type) { - if (type instanceof ParameterizedType) { - ParameterizedType parameterizedType = (ParameterizedType) type; - List subs = new ArrayList(); - for (Type actualType : parameterizedType.getActualTypeArguments()) - subs.add(argumentTypeString(actualType)); - return argumentTypeString(((ParameterizedType) type).getRawType()) + "[" + Utils.join(",", subs) + "]"; - } else if (type instanceof GenericArrayType) { - return argumentTypeString(((GenericArrayType) type).getGenericComponentType()) + "[]"; - } else if (type instanceof WildcardType) { - throw new RuntimeException("We don't support wildcards in arguments: " + type); - } else if (type instanceof Class) { - return ((Class) type).getSimpleName(); - } else { - throw new GATKException("Unknown type: " + type); - } - } - - /** - * Helper routine that returns the Feature.class required by a RodBinding, - * either T for RodBinding{T} or List{RodBinding{T}}. Returns null if - * the Type doesn't fit either model. - * - * @param type - * @return - */ - protected Class getFeatureTypeIfPossible(Type type) { - if (type instanceof ParameterizedType) { - ParameterizedType paramType = (ParameterizedType) type; - if (RodBinding.class.isAssignableFrom((Class) paramType.getRawType())) { - return (Class) JVMUtils.getParameterizedTypeClass(type); - } else { - for (Type paramtype : paramType.getActualTypeArguments()) { - Class x = getFeatureTypeIfPossible(paramtype); - if (x != null) - return x; - } - } - } - - return null; - } - - /** - * High-level entry point for creating a FreeMarker map describing the GATK argument - * source with definition def, with associated javadoc fieldDoc. 
- * - * @param fieldDoc - * @param source - * @param def - * @return a non-null Map binding argument keys with their values - */ - protected Map docForArgument(FieldDoc fieldDoc, ArgumentSource source, ArgumentDefinition def) { - Map root = new HashMap(); - Pair names = displayNames(def.shortName, def.fullName); - - root.put("name", names.getFirst()); - - if (names.getSecond() != null) { - root.put("synonyms", names.getSecond()); - } else { - root.put("synonyms", "NA"); - } - - root.put("required", def.required ? "yes" : "no"); - - // type of the field - root.put("type", argumentTypeString(source.field.getGenericType())); - - Class featureClass = getFeatureTypeIfPossible(source.field.getGenericType()); - if (featureClass != null) { - // deal with the allowable types - FeatureManager manager = new FeatureManager(); - List rodTypes = new ArrayList(); - for (FeatureManager.FeatureDescriptor descriptor : manager.getByFeature(featureClass)) { - rodTypes.add(String.format("%s", - GATKDocUtils.phpFilenameForClass(descriptor.getCodecClass()), - descriptor.getName())); - } - - root.put("rodTypes", Utils.join(", ", rodTypes)); - } else { - root.put("rodTypes", "NA"); - } - - // summary and fulltext - root.put("summary", def.doc != null ? def.doc : ""); - root.put("fulltext", fieldDoc.commentText()); - - // What are our enum options? - if (def.validOptions != null) { - root.put("options", docForEnumArgument(source.field.getType())); - } else { - root.put("options", new ArrayList()); - } - // general attributes - List attributes = new ArrayList(); - if (def.required) attributes.add("required"); - if (source.isDeprecated()) attributes.add("deprecated"); - if (attributes.size() > 0) { - root.put("attributes", Utils.join(", ", attributes)); - } else { - root.put("attributes", "NA"); - } - return root; - } - - /** - * Helper routine that provides a FreeMarker map for an enumClass, grabbing the - * values of the enum and their associated javadoc documentation. 
- * - * @param enumClass - * @return - */ - @Requires("enumClass.isEnum()") - private List> docForEnumArgument(final Class enumClass) { - final ClassDoc doc = this.getDoclet().getClassDocForClass(enumClass); - if ( doc == null ) - throw new RuntimeException("Tried to get docs for enum " + enumClass + " but got null instead"); - - final Set enumConstantFieldNames = enumConstantsNames(enumClass); - - final List> bindings = new ArrayList>(); - for (final FieldDoc fieldDoc : doc.fields(false)) { - if (enumConstantFieldNames.contains(fieldDoc.name()) ) - bindings.add( - new HashMap() {{ - put("name", fieldDoc.name()); - put("summary", fieldDoc.commentText()); - }}); - } - - return bindings; - } - - /** - * Returns the name of the fields that are enum constants according to reflection - * - * @return a non-null set of fields that are enum constants - */ - private Set enumConstantsNames(final Class enumClass) { - final Set enumConstantFieldNames = new HashSet(); - - for ( final Field field : enumClass.getFields() ) { - if ( field.isEnumConstant() ) - enumConstantFieldNames.add(field.getName()); - } - - return enumConstantFieldNames; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/HelpConstants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/HelpConstants.java deleted file mode 100644 index 16257c6de..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/HelpConstants.java +++ /dev/null @@ -1,83 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, 
subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.help; - -public class HelpConstants { - - public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; - public final static String GATK_DOCS_URL = BASE_GATK_URL + "/tooldocs/"; - public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; - public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; - - /** - * Arguments for parallelism options - */ - public final static String ARG_TREEREDUCIBLE = "-nt"; - public final static String ARG_NANOSCHEDULABLE = "-nct"; - public final static String CMDLINE_GATK_URL = GATK_DOCS_URL + "org_broadinstitute_gatk_engine_CommandLineGATK.php"; - - /** - * Definition of the group names / categories of tools. 
- * The names get parsed to make supercategories in the doc index, - * so be careful when making big changes -- see GATKDoclet.java toMap() - */ - public final static String DOCS_CAT_DATA = "Sequence Data Processing Tools"; - public final static String DOCS_CAT_QC = "Diagnostics and Quality Control Tools"; - public final static String DOCS_CAT_ENGINE = "Engine Parameters (available to all tools)"; - public final static String DOCS_CAT_RF = "Read Filters"; - public final static String DOCS_CAT_REFUTILS = "Reference Utilities"; - public final static String DOCS_CAT_RODCODECS = "ROD Codecs"; - public final static String DOCS_CAT_USRERR = "User Exceptions (DevZone)"; - public final static String DOCS_CAT_VALIDATION = "Validation Utilities"; - public final static String DOCS_CAT_ANNOT = "Variant Annotations"; - public final static String DOCS_CAT_VARDISC = "Variant Discovery Tools"; - public final static String DOCS_CAT_VARMANIP = "Variant Evaluation and Manipulation Tools"; - public final static String DOCS_CAT_TOY = "Toy Walkers (DevZone)"; - public final static String DOCS_CAT_HELPUTILS = "Help Utilities"; - - public static String forumPost(String post) { - return GATK_FORUM_URL + post; - } - - /** - * Go-to developer name codes for tracking and display purposes. Only current team members should be in this list. - * When someone leaves, their charges should be redistributed. The actual string should be closest to the dev's - * abbreviated name or two/three-letter nickname as possible. The code can be something else if necessary to - * disambiguate from other variable. 
- */ - public final static String MC = "MC"; // Mauricio Carneiro - public final static String EB = "EB"; // Eric Banks - public final static String RP = "RP"; // Ryan Poplin - public final static String GVDA = "GG"; // Geraldine Van der Auwera - public final static String VRR = "VRR"; // Valentin Ruano-Rubio - public final static String ALM = "ALM"; // Ami Levy-Moonshine - public final static String BH = "BH"; // Bertrand Haas - public final static String JoT = "JT"; // Joel Thibault - public final static String DR = "DR"; // David Roazen - public final static String KS = "KS"; // Khalid Shakir - - -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/HelpUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/HelpUtils.java deleted file mode 100644 index 1011a4935..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/HelpUtils.java +++ /dev/null @@ -1,64 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.help; - -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotationType; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.gatk.utils.classloader.PluginManager; - -import java.util.List; - -/** - * NON-javadoc/doclet help-related utility methods should go here. Anything with a com.sun.javadoc.* dependency - * should go into DocletUtils for use only by doclets. - */ -public class HelpUtils { - - /** - * Simple method to print a list of available annotations. - */ - public static void listAnnotations() { - System.out.println("\nThis is a list of available Variant Annotations for use with tools such as UnifiedGenotyper, HaplotypeCaller and VariantAnnotator. Please see the Technical Documentation for more details about these annotations:"); - System.out.println("http://www.broadinstitute.org/gatk/tooldocs/"); - System.out.println("\nStandard annotations in the list below are marked with a '*'."); - List> infoAnnotationClasses = new PluginManager(InfoFieldAnnotation.class).getPlugins(); - System.out.println("\nAvailable annotations for the VCF INFO field:"); - for (int i = 0; i < infoAnnotationClasses.size(); i++) - System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(infoAnnotationClasses.get(i)) ? 
"*" : "") + infoAnnotationClasses.get(i).getSimpleName()); - System.out.println(); - List> genotypeAnnotationClasses = new PluginManager(GenotypeAnnotation.class).getPlugins(); - System.out.println("\nAvailable annotations for the VCF FORMAT field:"); - for (int i = 0; i < genotypeAnnotationClasses.size(); i++) - System.out.println("\t" + (StandardAnnotation.class.isAssignableFrom(genotypeAnnotationClasses.get(i)) ? "*" : "") + genotypeAnnotationClasses.get(i).getSimpleName()); - System.out.println(); - System.out.println("\nAvailable classes/groups of annotations:"); - for ( Class c : new PluginManager(AnnotationType.class).getInterfaces() ) - System.out.println("\t" + c.getSimpleName()); - System.out.println(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/ResourceBundleExtractorDoclet.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/ResourceBundleExtractorDoclet.java deleted file mode 100644 index f28130b69..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/ResourceBundleExtractorDoclet.java +++ /dev/null @@ -1,228 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.help; - -import com.sun.javadoc.*; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.Utils; - -import java.io.*; -import java.util.*; - -/** - * Extracts certain types of javadoc (specifically package and class descriptions) and makes them available - * to applications at runtime. - * - * @author mhanna - * @version 0.1 - */ -public class ResourceBundleExtractorDoclet { - /** - * Taglet for the particular version number. - */ - public static final String VERSION_TAGLET_NAME = "version"; - public static final String SUMMARY_TAGLET_NAME = "help.summary"; - public static final String DESCRIPTION_TAGLET_NAME = "help.description"; - - /** - * Maintains a collection of resources in memory as they're accumulated. - */ - protected final Properties resourceText = new Properties(); - - /** - * Maintains a collection of classes that should really be documented. - */ - protected final Set undocumentedWalkers = new HashSet(); - - protected String buildTimestamp = null, absoluteVersion = null; - - /** - * Extracts the contents of certain types of javadoc and adds them to an XML file. - * @param rootDoc The documentation root. - * @return Whether the JavaDoc run succeeded. - * @throws IOException if output can't be written. 
- */ - public static boolean start(RootDoc rootDoc) throws IOException { - ResourceBundleExtractorDoclet doclet = new ResourceBundleExtractorDoclet(); - PrintStream out = doclet.loadData(rootDoc, true); - doclet.processDocs(rootDoc, out); - return true; - } - - protected PrintStream loadData(RootDoc rootDoc, boolean overwriteResourcesFile) { - PrintStream out = System.out; - - for(String[] options: rootDoc.options()) { - if(options[0].equals("-out")) { - try { - loadExistingResourceFile(options[1], rootDoc); - if ( overwriteResourcesFile ) - out = new PrintStream(options[1]); - } catch ( FileNotFoundException e ) { - throw new RuntimeException(e); - } catch ( IOException e ) { - throw new RuntimeException(e); - } - } - if(options[0].equals("-build-timestamp")) - buildTimestamp = options[1]; - if (options[0].equals("-absolute-version")) - absoluteVersion = options[1]; - } - - resourceText.setProperty("build.timestamp",buildTimestamp); - return out; - } - - protected void processDocs(RootDoc rootDoc, PrintStream out) { - // Cache packages as we see them, since there's no direct way to iterate over packages. 
- Set packages = new HashSet(); - - for(ClassDoc currentClass: rootDoc.classes()) { - PackageDoc containingPackage = currentClass.containingPackage(); - packages.add(containingPackage); - - if(isRequiredJavadocMissing(currentClass) && isWalker(currentClass)) - undocumentedWalkers.add(currentClass.name()); - - renderHelpText(DocletUtils.getClassName(currentClass),currentClass); - } - - for(PackageDoc currentPackage: packages) - renderHelpText(currentPackage.name(),currentPackage); - - try { - resourceText.store(out,"Strings displayed by the GATK help system"); - } catch ( FileNotFoundException e ) { - throw new RuntimeException(e); - } catch ( IOException e ) { - throw new RuntimeException(e); - } - - // ASCII codes for making text blink - final String blink = "\u001B\u005B\u0035\u006D"; - final String reset = "\u001B\u005B\u006D"; - - if(undocumentedWalkers.size() > 0) - Utils.warnUser(String.format("The following walkers are currently undocumented: %s%s%s", blink, Utils.join(" ",undocumentedWalkers), reset)); - } - - /** - * Validate the given options against options supported by this doclet. - * @param option Option to validate. - * @return Number of potential parameters; 0 if not supported. - */ - public static int optionLength(String option) { - if(option.equals("-build-timestamp") || option.equals("-out") || option.equals("-absolute-version") ) { - return 2; - } - return 0; - } - - /** - * Attempts to load the contents of the resource file named by resourceFileName into - * our in-memory resource collection resourceText. If the resource file doesn't exist, - * prints a notice to the user but does not throw an exception back to the calling method, - * since we'll just create a new resource file from scratch in that case. - * @param resourceFileName name of the resource file to attempt to load. - * @param rootDoc the documentation root. 
- * @throws IOException if there is an I/O-related error other than FileNotFoundException - * while attempting to read the resource file. - */ - private void loadExistingResourceFile( String resourceFileName, RootDoc rootDoc ) throws IOException { - try { - BufferedReader resourceFile = new BufferedReader(new FileReader(resourceFileName)); - try { - resourceText.load(resourceFile); - } - finally { - resourceFile.close(); - } - } - catch ( FileNotFoundException e ) { - rootDoc.printNotice("Resource file not found -- generating a new one from scratch."); - } - } - - /** - * Determine whether a given class is a walker. - * @param classDoc the type of the given class. - * @return True if the class of the given name is a walker. False otherwise. - */ - protected static boolean isWalker(ClassDoc classDoc) { - return DocletUtils.assignableToClass(classDoc, Walker.class, true); - } - - /** - * Is the javadoc for the given class missing? - * @param classDoc Class for which to inspect the JavaDoc. - * @return True if the JavaDoc is missing. False otherwise. - */ - private static boolean isRequiredJavadocMissing(ClassDoc classDoc) { - return classDoc.commentText().length() == 0 || classDoc.commentText().contains("Created by IntelliJ"); - } - - /** - * Renders all the help text required for a given name. - * @param elementName element name to use as the key - * @param element Doc element to process. 
- */ - private void renderHelpText(String elementName, Doc element) { - StringBuilder summaryBuilder = new StringBuilder(); - for(Tag tag: element.firstSentenceTags()) - summaryBuilder.append(tag.text()); - String summary = summaryBuilder.toString(); - String description = element.commentText(); - - // this might seem unnecessary, but the GATK command line program uses this tag to determine the version when running - if(absoluteVersion != null) - resourceText.setProperty(String.format("%s.%s",elementName,VERSION_TAGLET_NAME),absoluteVersion); - - // Write out an alternate element summary, if exists. - resourceText.setProperty(String.format("%s.%s",elementName,SUMMARY_TAGLET_NAME),formatText(summary)); - - // Write out an alternate description, if present. - resourceText.setProperty(String.format("%s.%s",elementName,DESCRIPTION_TAGLET_NAME),formatText(description)); - } - - /** - * Format text for consumption by the properties file. - * @param text Text to format. - * @return Formatted text; string trimmed, newlines removed. 
- */ - private static String formatText(String text) { - Scanner scanner = new Scanner(text); - StringBuilder output = new StringBuilder(); - - while(scanner.hasNextLine()) { - if(output.length() > 0) - output.append(' '); - output.append(scanner.nextLine().trim()); - } - - return output.toString(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java deleted file mode 100644 index 7fffb12e2..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/interval/IntervalUtils.java +++ /dev/null @@ -1,890 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.interval; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.samtools.util.Interval; -import htsjdk.samtools.util.IntervalList; -import htsjdk.samtools.SAMFileHeader; -import org.apache.log4j.Logger; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.commandline.IntervalArgumentCollection; -import org.broadinstitute.gatk.utils.commandline.IntervalBinding; -import org.broadinstitute.gatk.engine.datasources.reference.ReferenceDataSource; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.text.XReadLines; - -import java.io.File; -import java.io.IOException; -import java.util.*; - -/** - * Parse text representations of interval strings that - * can appear in GATK-based applications. - * - * @author mhanna - * @version 0.1 - */ -public class IntervalUtils { - private static Logger logger = Logger.getLogger(IntervalUtils.class); - - /** - * Turns a set of strings describing intervals into a parsed set of intervals. Valid string elements can be files, - * intervals in samtools notation (chrA:B-C), or some combination of the above separated by semicolons. Additionally, - * 'all' can be supplied to indicate all possible intervals, but 'all' must be exclusive of all other interval - * specifications. - * - * @param parser Genome loc parser. - * @param argList A list of strings containing interval data. - * @return an unsorted, unmerged representation of the given intervals. Null is used to indicate that all intervals should be used. 
- */ - public static List parseIntervalArguments(GenomeLocParser parser, List argList) { - List rawIntervals = new ArrayList(); // running list of raw GenomeLocs - - if (argList != null) { // now that we can be in this function if only the ROD-to-Intervals was provided, we need to - // ensure that the arg list isn't null before looping. - for (String argument : argList) { - rawIntervals.addAll(parseIntervalArguments(parser, argument)); - } - } - - return rawIntervals; - } - - public static List parseIntervalArguments(GenomeLocParser parser, String arg) { - List rawIntervals = new ArrayList(); // running list of raw GenomeLocs - - if ( arg.indexOf(';') != -1 ) { - throw new UserException.BadArgumentValue("-L " + arg, "The legacy -L \"interval1;interval2\" syntax " + - "is no longer supported. Please use one -L argument for each " + - "interval or an interval file instead."); - } - - // if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'. - if (isUnmapped(arg)) - rawIntervals.add(GenomeLoc.UNMAPPED); - // if it's a file, add items to raw interval list - else if (isIntervalFile(arg)) { - try { - rawIntervals.addAll(intervalFileToList(parser, arg)); - } - catch ( UserException.MalformedGenomeLoc e ) { - throw e; - } - catch ( Exception e ) { - throw new UserException.MalformedFile(arg, "Interval file could not be parsed in any supported format.", e); - } - } - // otherwise treat as an interval -> parse and add to raw interval list - else { - rawIntervals.add(parser.parseGenomeLoc(arg)); - } - - return rawIntervals; - } - - /** - * Read a file of genome locations to process. The file may be in BED, Picard, - * or GATK interval format. 
- * - * @param glParser GenomeLocParser - * @param file_name interval file - * @return List List of Genome Locs that have been parsed from file - */ - public static List intervalFileToList(final GenomeLocParser glParser, final String file_name) { - // try to open file - File inputFile = new File(file_name); - List ret = new ArrayList(); - - // case: BED file - if ( file_name.toUpperCase().endsWith(".BED") ) { - // this is now supported in Tribble - throw new ReviewedGATKException("BED files must be parsed through Tribble; parsing them as intervals through the GATK engine is no longer supported"); - } - else { - /** - * IF not a BED file: - * first try to read it as a Picard interval file since that's well structured - * we'll fail quickly if it's not a valid file. - */ - boolean isPicardInterval = false; - try { - // Note: Picard will skip over intervals with contigs not in the sequence dictionary - IntervalList il = IntervalList.fromFile(inputFile); - isPicardInterval = true; - - int nInvalidIntervals = 0; - for (Interval interval : il.getIntervals()) { - if ( glParser.isValidGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true)) - ret.add(glParser.createGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true)); - else { - nInvalidIntervals++; - } - } - if ( nInvalidIntervals > 0 ) - logger.warn("Ignoring " + nInvalidIntervals + " invalid intervals from " + inputFile); - } - - // if that didn't work, try parsing file as a GATK interval file - catch (Exception e) { - if ( isPicardInterval ) // definitely a picard file, but we failed to parse - throw new UserException.CouldNotReadInputFile(inputFile, e); - else { - try { - XReadLines reader = new XReadLines(new File(file_name)); - for(String line: reader) { - if ( line.trim().length() > 0 ) { - ret.add(glParser.parseGenomeLoc(line)); - } - } - reader.close(); - } - catch (IOException e2) { - throw new UserException.CouldNotReadInputFile(inputFile, e2); - } - } - } - } 
- - return ret; - } - - /** - * Returns true if the interval string is the "unmapped" interval - * @param interval Interval to check - * @return true if the interval string is the "unmapped" interval - */ - public static boolean isUnmapped(String interval) { - return (interval != null && interval.trim().toLowerCase().equals("unmapped")); - } - - /** - * merge two interval lists, using an interval set rule - * @param setOne a list of genomeLocs, in order (cannot be NULL) - * @param setTwo a list of genomeLocs, also in order (cannot be NULL) - * @param rule the rule to use for merging, i.e. union, intersection, etc - * @return a list, correctly merged using the specified rule - */ - public static List mergeListsBySetOperator(List setOne, List setTwo, IntervalSetRule rule) { - // shortcut, if either set is zero, return the other set - if (setOne == null || setOne.size() == 0 || setTwo == null || setTwo.size() == 0) - return Collections.unmodifiableList((setOne == null || setOne.size() == 0) ? setTwo : setOne); - - // our master list, since we can't guarantee removal time in a generic list - LinkedList retList = new LinkedList(); - - // if we're set to UNION, just add them all - if (rule == null || rule == IntervalSetRule.UNION) { - retList.addAll(setOne); - retList.addAll(setTwo); - return Collections.unmodifiableList(retList); - } - - // else we're INTERSECTION, create two indexes into the lists - int iOne = 0; - int iTwo = 0; - - // merge the second into the first using the rule - while (iTwo < setTwo.size() && iOne < setOne.size()) - // if the first list is ahead, drop items off the second until we overlap - if (setTwo.get(iTwo).isBefore(setOne.get(iOne))) - iTwo++; - // if the second is ahead, drop intervals off the first until we overlap - else if (setOne.get(iOne).isBefore(setTwo.get(iTwo))) - iOne++; - // we overlap, intersect the two intervals and add the result. Then remove the interval that ends first. 
- else { - retList.add(setOne.get(iOne).intersect(setTwo.get(iTwo))); - if (setOne.get(iOne).getStop() < setTwo.get(iTwo).getStop()) iOne++; - else iTwo++; - } - - //if we have an empty list, throw an exception. If they specified intersection and there are no items, this is bad. - if (retList.size() == 0) - throw new UserException.BadInput("The INTERSECTION of your -L options produced no intervals."); - - // we don't need to add the rest of remaining locations, since we know they don't overlap. return what we have - return Collections.unmodifiableList(retList); - } - - /** - * Sorts and merges an interval list. Multiple techniques are available for merging: ALL, which combines - * all overlapping and abutting intervals into an interval that spans the union of all covered bases, and - * OVERLAPPING_ONLY, which unions overlapping intervals but keeps abutting intervals separate. - * - * @param parser Genome loc parser for the intervals. - * @param intervals A collection of intervals to merge. - * @param mergingRule A descriptor for the type of merging to perform. - * @return A sorted, merged version of the intervals passed in. - */ - public static GenomeLocSortedSet sortAndMergeIntervals(GenomeLocParser parser, List intervals, IntervalMergingRule mergingRule) { - // Make a copy of the (potentially unmodifiable) list to be sorted - intervals = new ArrayList(intervals); - // sort raw interval list - Collections.sort(intervals); - // now merge raw interval list - intervals = mergeIntervalLocations(intervals, mergingRule); - - return GenomeLocSortedSet.createSetFromList(parser,intervals); - } - - /** - * computes whether the test interval list is equivalent to master. To be equivalent, test must - * contain GenomeLocs covering every base in master, exactly once. Note that this algorithm - * assumes that master genomelocs are all discontiguous (i.e., we don't have locs like 1-3 and 4-6 but - * rather just 1-6). 
In order to use this algorithm with contiguous genomelocs first merge them. The algorithm - * doesn't assume that test has discontinuous genomelocs. - * - * Returns a null string if there are no differences, otherwise returns a string describing the difference - * (useful for UnitTests). Assumes both lists are sorted - * - * @param masterArg sorted master genome locs - * @param testArg sorted test genome locs - * @return null string if there are no difference, otherwise a string describing the difference - */ - public static String equateIntervals(List masterArg, List testArg) { - LinkedList master = new LinkedList(masterArg); - LinkedList test = new LinkedList(testArg); - - while ( ! master.isEmpty() ) { // there's still unchecked bases in master - final GenomeLoc masterHead = master.pop(); - final GenomeLoc testHead = test.pop(); - - if ( testHead.overlapsP(masterHead) ) { - // remove the parts of test that overlap master, and push the remaining - // parts onto master for further comparison. - for ( final GenomeLoc masterPart : Utils.reverse(masterHead.subtract(testHead)) ) { - master.push(masterPart); - } - } else { - // testHead is incompatible with masterHead, so we must have extra bases in testHead - // that aren't in master - return "Incompatible locs detected masterHead=" + masterHead + ", testHead=" + testHead; - } - } - - if ( test.isEmpty() ) // everything is equal - return null; // no differences - else - return "Remaining elements found in test: first=" + test.peek(); - } - - - /** - * Check if string argument was intented as a file - * Accepted file extensions: .bed .list, .picard, .interval_list, .intervals. - * @param str token to identify as a filename. - * @return true if the token looks like a filename, or false otherwise. 
- */ - public static boolean isIntervalFile(String str) { - return isIntervalFile(str, true); - } - - /** - * Check if string argument was intented as a file - * Accepted file extensions: .bed .list, .picard, .interval_list, .intervals. - * @param str token to identify as a filename. - * @param checkExists if true throws an exception if the file doesn't exist. - * @return true if the token looks like a filename, or false otherwise. - */ - public static boolean isIntervalFile(String str, boolean checkExists) { - // should we define list of file extensions as a public array somewhere? - // is regex or endsiwth better? - File file = new File(str); - if (str.toUpperCase().endsWith(".BED") || str.toUpperCase().endsWith(".LIST") || - str.toUpperCase().endsWith(".PICARD") || str.toUpperCase().endsWith(".INTERVAL_LIST") - || str.toUpperCase().endsWith(".INTERVALS")) { - if (!checkExists) - return true; - else if (file.exists()) - return true; - else - throw new UserException.CouldNotReadInputFile(file, "The interval file does not exist."); - } - - if(file.exists()) - throw new UserException.CouldNotReadInputFile(file, String.format("The interval file %s does not have one of " + - "the supported extensions (.bed, .list, .picard, .interval_list, or .intervals). " + - "Please rename your file with the appropriate extension. If %s is NOT supposed to be a file, " + - "please move or rename the file at location %s", str, str, file.getAbsolutePath())); - - else return false; - } - - /** - * Returns a map of contig names with their sizes. - * @param reference The reference for the intervals. - * @return A map of contig names with their sizes. 
- */ - public static Map getContigSizes(File reference) { - ReferenceDataSource referenceSource = new ReferenceDataSource(reference); - List locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSource.getReference().getSequenceDictionary()).toList(); - Map lengths = new LinkedHashMap(); - for (GenomeLoc loc: locs) - lengths.put(loc.getContig(), loc.size()); - return lengths; - } - - /** - * Splits an interval list into multiple files. - * @param fileHeader The sam file header. - * @param locs The genome locs to split. - * @param scatterParts The output interval lists to write to. - */ - public static void scatterContigIntervals(SAMFileHeader fileHeader, List locs, List scatterParts) { - - // Contract: must divide locs up so that each of scatterParts gets a sublist such that: - // (a) all locs concerning a particular contig go to the same part - // (b) locs are not split or combined, and remain in the same order (so scatterParts[0] + ... + scatterParts[n] == locs) - - // Locs are already sorted. - - long totalBases = 0; - for(GenomeLoc loc : locs) - totalBases += loc.size(); - - long idealBasesPerPart = totalBases / scatterParts.size(); - if(idealBasesPerPart == 0) - throw new UserException.BadInput(String.format("Genome region is too short (%d bases) to split into %d parts", totalBases, scatterParts.size())); - - // Find the indices in locs where we switch from one contig to the next. 
- ArrayList contigStartLocs = new ArrayList(); - String prevContig = null; - - for(int i = 0; i < locs.size(); ++i) { - - GenomeLoc loc = locs.get(i); - if(prevContig == null || !loc.getContig().equals(prevContig)) - contigStartLocs.add(i); - prevContig = loc.getContig(); - - } - - if(contigStartLocs.size() < scatterParts.size()) - throw new UserException.BadInput(String.format("Input genome region has too few contigs (%d) to split into %d parts", contigStartLocs.size(), scatterParts.size())); - - long thisPartBases = 0; - int partIdx = 0; - IntervalList outList = new IntervalList(fileHeader); - - for(int i = 0; i < locs.size(); ++i) { - - GenomeLoc loc = locs.get(i); - thisPartBases += loc.getStop() - loc.getStart(); - - outList.add(toInterval(loc, i)); - - boolean partMustStop = false; - - if(partIdx < (scatterParts.size() - 1)) { - - // If there are n contigs and n parts remaining then we must split here, - // otherwise we will run out of contigs. - - int nextPart = partIdx + 1; - int nextPartMustStartBy = contigStartLocs.get(nextPart + (contigStartLocs.size() - scatterParts.size())); - if(i + 1 == nextPartMustStartBy) - partMustStop = true; - - } - else if(i == locs.size() - 1) { - - // We're done! Write the last scatter file. - partMustStop = true; - - } - - if(partMustStop || thisPartBases > idealBasesPerPart) { - - // Ideally we would split here. However, we must make sure to do so - // on a contig boundary. Test always passes with partMustStop == true - // since that indicates we're at a contig boundary. - - GenomeLoc nextLoc = null; - if((i + 1) < locs.size()) - nextLoc = locs.get(i+1); - - if(nextLoc == null || !nextLoc.getContig().equals(loc.getContig())) { - - // Write out this part: - outList.write(scatterParts.get(partIdx)); - - // Reset. If this part ran long, leave the excess in thisPartBases - // and the next will be a little shorter to compensate. 
- outList = new IntervalList(fileHeader); - thisPartBases -= idealBasesPerPart; - ++partIdx; - - } - - } - - } - - } - - /** - * Splits an interval list into multiple sublists. - * @param locs The genome locs to split. - * @param splits The stop points for the genome locs returned by splitFixedIntervals. - * @return A list of lists of genome locs, split according to splits - */ - public static List> splitIntervalsToSubLists(List locs, List splits) { - int start = 0; - List> sublists = new ArrayList>(splits.size()); - for (Integer stop: splits) { - List curList = new ArrayList(); - for (int i = start; i < stop; i++) - curList.add(locs.get(i)); - start = stop; - sublists.add(curList); - } - - return sublists; - } - - - /** - * Splits an interval list into multiple files. - * @param fileHeader The sam file header. - * @param splits Pre-divided genome locs returned by splitFixedIntervals. - * @param scatterParts The output interval lists to write to. - */ - public static void scatterFixedIntervals(SAMFileHeader fileHeader, List> splits, List scatterParts) { - if (splits.size() != scatterParts.size()) - throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); - - int fileIndex = 0; - int locIndex = 1; - for (final List split : splits) { - IntervalList intervalList = new IntervalList(fileHeader); - for (final GenomeLoc loc : split) - intervalList.add(toInterval(loc, locIndex++)); - intervalList.write(scatterParts.get(fileIndex++)); - } - } - - /** - * Splits the genome locs up by size. - * @param locs Genome locs to split. - * @param numParts Number of parts to split the locs into. - * @return The stop points to split the genome locs. 
- */ - public static List> splitFixedIntervals(List locs, int numParts) { - if (locs.size() < numParts) - throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); - final long locsSize = intervalSize(locs); - final List splitPoints = new ArrayList(); - addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); - Collections.sort(splitPoints); - splitPoints.add(locs.size()); - return splitIntervalsToSubLists(locs, splitPoints); - } - - @Requires({"locs != null", "numParts > 0"}) - @Ensures("result != null") - public static List> splitLocusIntervals(List locs, int numParts) { - // the ideal size of each split - final long bp = IntervalUtils.intervalSize(locs); - final long idealSplitSize = Math.max((long)Math.floor(bp / (1.0*numParts)), 1); - - // algorithm: - // split = () - // set size = 0 - // pop the head H off locs. - // If size + size(H) < splitSize: - // add H to split, continue - // If size + size(H) == splitSize: - // done with split, put in splits, restart - // if size + size(H) > splitSize: - // cut H into two pieces, first of which has splitSize - size bp - // push both pieces onto locs, continue - // The last split is special -- when you have only one split left, it gets all of the remaining locs - // to deal with rounding issues - final List> splits = new ArrayList>(numParts); - - LinkedList locsLinkedList = new LinkedList(locs); - while ( ! 
locsLinkedList.isEmpty() ) { - if ( splits.size() + 1 == numParts ) { - // the last one gets all of the remaining parts - splits.add(new ArrayList(locsLinkedList)); - locsLinkedList.clear(); - } else { - final SplitLocusRecursive one = splitLocusIntervals1(locsLinkedList, idealSplitSize); - splits.add(one.split); - locsLinkedList = one.remaining; - } - } - - return splits; - } - - @Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"}) - @Ensures({"result != null"}) - static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { - final List split = new ArrayList(); - long size = 0; - - while ( ! remaining.isEmpty() ) { - GenomeLoc head = remaining.pop(); - final long newSize = size + head.size(); - - if ( newSize == idealSplitSize ) { - split.add(head); - break; // we are done - } else if ( newSize > idealSplitSize ) { - final long remainingBp = idealSplitSize - size; - final long cutPoint = head.getStart() + remainingBp; - GenomeLoc[] parts = head.split((int)cutPoint); - remaining.push(parts[1]); - remaining.push(parts[0]); - // when we go around, head.size' = idealSplitSize - size - // so newSize' = splitSize + head.size' = size + (idealSplitSize - size) = idealSplitSize - } else { - split.add(head); - size = newSize; - } - } - - return new SplitLocusRecursive(split, remaining); - } - - /** - * Setup the intervals to be processed - */ - public static GenomeLocSortedSet parseIntervalBindings( - final ReferenceDataSource referenceDataSource, - final List> intervals, - final IntervalSetRule intervalSetRule, final IntervalMergingRule intervalMergingRule, final int intervalPadding, - final List> excludeIntervals) { - - Pair includeExcludePair = parseIntervalBindingsPair( - referenceDataSource, intervals, intervalSetRule, intervalMergingRule, intervalPadding, excludeIntervals); - - GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); - GenomeLocSortedSet excludeSortedSet = 
includeExcludePair.getSecond(); - - if (excludeSortedSet != null) { - return includeSortedSet.subtractRegions(excludeSortedSet); - } else { - return includeSortedSet; - } - } - - public static GenomeLocSortedSet parseIntervalArguments(final ReferenceDataSource referenceDataSource, IntervalArgumentCollection argCollection) { - GenomeLocSortedSet intervals = null; - - // return if no interval arguments at all - if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) - return intervals; - - // Note that the use of '-L all' is no longer supported. - - // if include argument isn't given, create new set of all possible intervals - - final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( - referenceDataSource, - argCollection.intervals, - argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, - argCollection.excludeIntervals); - - final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); - final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); - - // if no exclude arguments, can return parseIntervalArguments directly - if ( excludeSortedSet == null ) - intervals = includeSortedSet; - - // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets - else { - intervals = includeSortedSet.subtractRegions(excludeSortedSet); - - // logging messages only printed when exclude (-XL) arguments are given - final long toPruneSize = includeSortedSet.coveredSize(); - final long toExcludeSize = excludeSortedSet.coveredSize(); - final long intervalSize = intervals.coveredSize(); - logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); - logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", - toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); - } - - logger.info(String.format("Processing %d bp from 
intervals", intervals.coveredSize())); - return intervals; - } - - public static Pair parseIntervalBindingsPair( - final ReferenceDataSource referenceDataSource, - final List> intervals, - final IntervalSetRule intervalSetRule, final IntervalMergingRule intervalMergingRule, final int intervalPadding, - final List> excludeIntervals) { - GenomeLocParser genomeLocParser = new GenomeLocParser(referenceDataSource.getReference()); - - // if include argument isn't given, create new set of all possible intervals - GenomeLocSortedSet includeSortedSet = ((intervals == null || intervals.size() == 0) ? - GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()) : - loadIntervals(intervals, intervalSetRule, intervalMergingRule, intervalPadding, genomeLocParser)); - - GenomeLocSortedSet excludeSortedSet = null; - if (excludeIntervals != null && excludeIntervals.size() > 0) { - excludeSortedSet = loadIntervals(excludeIntervals, IntervalSetRule.UNION, intervalMergingRule, 0, genomeLocParser); - } - return new Pair(includeSortedSet, excludeSortedSet); - } - - public static GenomeLocSortedSet loadIntervals( - final List> intervalBindings, - final IntervalSetRule rule, final IntervalMergingRule intervalMergingRule, final int padding, - final GenomeLocParser genomeLocParser) { - List allIntervals = new ArrayList(); - for ( IntervalBinding intervalBinding : intervalBindings) { - @SuppressWarnings("unchecked") - List intervals = intervalBinding.getIntervals(genomeLocParser); - - if ( intervals.isEmpty() ) { - logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); - } - - if ( padding > 0 ) { - intervals = getIntervalsWithFlanks(genomeLocParser, intervals, padding); - } - - allIntervals = mergeListsBySetOperator(intervals, allIntervals, rule); - } - - return sortAndMergeIntervals(genomeLocParser, allIntervals, intervalMergingRule); - } - - private final static class 
SplitLocusRecursive { - final List split; - final LinkedList remaining; - - @Requires({"split != null", "remaining != null"}) - private SplitLocusRecursive(final List split, final LinkedList remaining) { - this.split = split; - this.remaining = remaining; - } - } - - public static List flattenSplitIntervals(List> splits) { - final List locs = new ArrayList(); - for ( final List split : splits ) - locs.addAll(split); - return locs; - } - - private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { - if (numParts < 2) - return; - int halfParts = (numParts + 1) / 2; - Pair splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts, numParts - halfParts); - int splitIndex = splitPoint.first; - long splitSize = splitPoint.second; - splitPoints.add(splitIndex); - addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts); - addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts); - } - - private static Pair getFixedSplit(List locs, long locsSize, int startIndex, int stopIndex, int minLocs, int maxLocs) { - int splitIndex = startIndex; - long splitSize = 0; - for (int i = 0; i < minLocs; i++) { - splitSize += locs.get(splitIndex).size(); - splitIndex++; - } - long halfSize = locsSize / 2; - while (splitIndex < (stopIndex - maxLocs) && splitSize < halfSize) { - splitSize += locs.get(splitIndex).size(); - splitIndex++; - } - return new Pair(splitIndex, splitSize); - } - - /** - * Converts a GenomeLoc to a picard interval. - * @param loc The GenomeLoc. - * @param locIndex The loc index for use in the file. - * @return The picard interval. 
- */ - private static htsjdk.samtools.util.Interval toInterval(GenomeLoc loc, int locIndex) { - return new htsjdk.samtools.util.Interval(loc.getContig(), loc.getStart(), loc.getStop(), false, "interval_" + locIndex); - } - - /** - * merge a list of genome locs that may be overlapping, returning the list of unique genomic locations - * - * @param raw the unchecked genome loc list - * @param rule the merging rule we're using - * - * @return the list of merged locations - */ - public static List mergeIntervalLocations(final List raw, IntervalMergingRule rule) { - if (raw.size() <= 1) - return Collections.unmodifiableList(raw); - else { - ArrayList merged = new ArrayList(); - Iterator it = raw.iterator(); - GenomeLoc prev = it.next(); - while (it.hasNext()) { - GenomeLoc curr = it.next(); - if (prev.overlapsP(curr)) { - prev = prev.merge(curr); - } else if (prev.contiguousP(curr) && (rule == null || rule == IntervalMergingRule.ALL)) { - prev = prev.merge(curr); - } else { - merged.add(prev); - prev = curr; - } - } - merged.add(prev); - return Collections.unmodifiableList(merged); - } - } - - public static long intervalSize(final List locs) { - long size = 0; - for ( final GenomeLoc loc : locs ) - size += loc.size(); - return size; - } - - public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) { - ReferenceDataSource referenceDataSource = new ReferenceDataSource(reference); - GenomeLocParser parser = new GenomeLocParser(referenceDataSource.getReference()); - List originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath()); - - if (originalList.isEmpty()) - throw new UserException.MalformedFile(inputIntervals, "File contains no intervals"); - - List flankingList = getFlankingIntervals(parser, originalList, basePairs); - - if (flankingList.isEmpty()) - throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals"); - - SAMFileHeader samFileHeader = new 
SAMFileHeader(); - samFileHeader.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); - IntervalList intervalList = new IntervalList(samFileHeader); - int i = 0; - for (GenomeLoc loc: flankingList) - intervalList.add(toInterval(loc, ++i)); - intervalList.write(flankingIntervals); - } - - /** - * Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs. - * @param parser A genome loc parser for creating the new intervals - * @param locs Original genome locs - * @param basePairs Number of base pairs on each side of loc - * @return The list of intervals between the locs - */ - public static List getFlankingIntervals(final GenomeLocParser parser, final List locs, final int basePairs) { - List sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList(); - - if (sorted.size() == 0) - return Collections.emptyList(); - - LinkedHashMap> locsByContig = splitByContig(sorted); - List expanded = new ArrayList(); - for (Map.Entry> contig: locsByContig.entrySet()) { - List contigLocs = contig.getValue(); - int contigLocsSize = contigLocs.size(); - - GenomeLoc startLoc, stopLoc; - - // Create loc at start of the list - startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs); - if (startLoc != null) - expanded.add(startLoc); - - // Create locs between each loc[i] and loc[i+1] - for (int i = 0; i < contigLocsSize - 1; i++) { - stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs); - startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs); - if (stopLoc.getStop() + 1 >= startLoc.getStart()) { - // NOTE: This is different than GenomeLoc.merge() - // merge() returns a loc which covers the entire range of stop and start, - // possibly returning positions inside loc(i) or loc(i+1) - // We want to make sure that the start of the stopLoc is used, and the stop of the startLoc - GenomeLoc merged = parser.createGenomeLoc( - stopLoc.getContig(), stopLoc.getStart(), 
startLoc.getStop()); - expanded.add(merged); - } else { - expanded.add(stopLoc); - expanded.add(startLoc); - } - } - - // Create loc at the end of the list - stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs); - if (stopLoc != null) - expanded.add(stopLoc); - } - return expanded; - } - - /** - * Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs. - * @param parser A genome loc parser for creating the new intervals - * @param locs Original genome locs - * @param basePairs Number of base pairs on each side of loc - * @return The list of intervals between the locs - */ - public static List getIntervalsWithFlanks(final GenomeLocParser parser, final List locs, final int basePairs) { - - if (locs.size() == 0) - return Collections.emptyList(); - - final List expanded = new ArrayList(); - for ( final GenomeLoc loc : locs ) { - expanded.add(parser.createPaddedGenomeLoc(loc, basePairs)); - } - - return sortAndMergeIntervals(parser, expanded, IntervalMergingRule.ALL).toList(); - } - - private static LinkedHashMap> splitByContig(List sorted) { - LinkedHashMap> splits = new LinkedHashMap>(); - GenomeLoc last = null; - List contigLocs = null; - for (GenomeLoc loc: sorted) { - if (GenomeLoc.isUnmapped(loc)) - continue; - if (last == null || !last.onSameContig(loc)) { - contigLocs = new ArrayList(); - splits.put(loc.getContig(), contigLocs); - } - contigLocs.add(loc); - last = loc; - } - return splits; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java deleted file mode 100644 index d9b158f85..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java +++ /dev/null @@ -1,198 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of 
charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.locusiterator; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileReader; -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecordIterator; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; -import org.broadinstitute.gatk.utils.commandline.Input; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.iterators.GATKSAMRecordIterator; -import org.broadinstitute.gatk.utils.*; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.GATKSamRecordFactory; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.*; - -/** - * Caliper microbenchmark of fragment pileup - */ -public class LIBSPerformance extends CommandLineProgram { - private static Logger logger = Logger.getLogger(LIBSPerformance.class); - - @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = true) - public File samFile = null; - - @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = true) - public File referenceFile = null; - - @Argument(fullName = "L", shortName = "L", doc = "Query location", required = false) - public String location = null; - - @Argument(fullName = "dt", shortName = "dt", doc = "Enable downsampling", required = false) - public boolean downsample = false; - - @Override - public int execute() throws IOException { - final IndexedFastaSequenceFile reference = new CachingIndexedFastaSequenceFile(referenceFile); - final 
GenomeLocParser genomeLocParser = new GenomeLocParser(reference); - - final SAMFileReader reader = new SAMFileReader(samFile); - reader.setSAMRecordFactory(new GATKSamRecordFactory()); - - SAMRecordIterator rawIterator; - if ( location == null ) - rawIterator = reader.iterator(); - else { - final GenomeLoc loc = genomeLocParser.parseGenomeLoc(location); - rawIterator = reader.query(loc.getContig(), loc.getStart(), loc.getStop(), false); - } - - final GATKSAMRecordIterator iterator = new GATKSAMRecordIterator(rawIterator); - - final Set samples = new HashSet(); - for ( final SAMReadGroupRecord rg : reader.getFileHeader().getReadGroups() ) - samples.add(rg.getSample()); - - final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(downsample, 250); - - final LocusIteratorByState libs = - new LocusIteratorByState( - iterator, - ds, - true, - genomeLocParser, - samples, - false); - - final SimpleTimer timer = new SimpleTimer().start(); - int bp = 0; - double lastElapsed = 0; - while ( libs.hasNext() ) { - AlignmentContext context = libs.next(); - bp++; - if ( timer.getElapsedTime() - lastElapsed > 10 ) { - logger.info(bp + " iterations at " + context.getLocation()); - lastElapsed = timer.getElapsedTime(); - } - } - logger.info(String.format("runtime in seconds: %.2f", timer.getElapsedTime())); - - return 0; - } - -// private void syntheticTests() { -// final int readLength = 101; -// final int nReads = 10000; -// final int locus = 1; -// -// SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); -// final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); -// -// int nIterations = 0; -// for ( final String cigar : Arrays.asList("101M", "50M10I40M", "50M10D40M") ) { -// GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, readLength); -// read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); -// final byte[] quals = new byte[readLength]; -// for ( int i = 0; i < 
readLength; i++ ) -// quals[i] = (byte)(i % QualityUtils.MAX_SAM_QUAL_SCORE); -// read.setBaseQualities(quals); -// read.setCigarString(cigar); -// -// for ( int j = 0; j < nReads; j++ ) { -// for ( int i = 0; i < rep; i++ ) { -// switch ( op ) { -// case NEW_STATE: -// { -// final AlignmentStateMachine alignmentStateMachine = new AlignmentStateMachine(read); -// while ( alignmentStateMachine.stepForwardOnGenome() != null ) { -// nIterations++; -// } -// } -// break; -//// case OLD_STATE: -//// { -//// final SAMRecordAlignmentState alignmentStateMachine = new SAMRecordAlignmentState(read); -//// while ( alignmentStateMachine.stepForwardOnGenome() != null ) { -//// alignmentStateMachine.getRead(); -//// nIterations++; -//// } -//// } -//// break; -// case NEW_LIBS: -// { -// final List reads = Collections.nCopies(30, read); -// final org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState libs = -// new org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState( -// new LocusIteratorByStateBaseTest.FakeCloseableIterator(reads.iterator()), -// LocusIteratorByStateBaseTest.createTestReadProperties(), -// genomeLocParser, -// LocusIteratorByState.sampleListForSAMWithoutReadGroups()); -// -// while ( libs.hasNext() ) { -// AlignmentContext context = libs.next(); -// } -// } -// } -// } -// } -// } -// -// System.out.printf("iterations %d%n", nIterations); -// } - - /** - * Required main method implementation. - * @param argv Command-line argument text. - * @throws Exception on error. 
- */ - public static void main(String[] argv) throws Exception { - int returnCode = 0; - try { - LIBSPerformance instance = new LIBSPerformance(); - start(instance, argv); - returnCode = 0; - } catch(Exception ex) { - returnCode = 1; - ex.printStackTrace(); - throw ex; - } finally { - System.exit(returnCode); - } - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIterator.java deleted file mode 100644 index 72764e4df..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIterator.java +++ /dev/null @@ -1,62 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.locusiterator; - -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; - -import java.util.Iterator; - -/** - * Iterator that traverses a SAM File, accumulating information on a per-locus basis - */ -public abstract class LocusIterator implements Iterable, CloseableIterator { - public Iterator iterator() { - return this; - } - - public void close() { - //this.it.close(); - } - - public abstract boolean hasNext(); - public abstract AlignmentContext next(); - - /** - * Get, if possible, the underlying LocusIteratorByState from this LocusIterator. - * - * @throws UnsupportedOperationException if we don't support this operation - * - * @return a non-null locus iterator by state - */ - public LocusIteratorByState getLIBS() { - throw new UnsupportedOperationException("This locus iterator does not support getting the underlying LocusIteratorByState"); - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByState.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByState.java deleted file mode 100644 index aaf61900e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByState.java +++ /dev/null @@ -1,454 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, 
subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.locusiterator; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.samtools.CigarOperator; -import htsjdk.samtools.SAMFileReader; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.CloseableIterator; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.iterators.GATKSAMRecordIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.SampleUtils; -import org.broadinstitute.gatk.utils.pileup.PileupElement; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.ReadUtils; - -import java.util.*; - -/** - * Iterator that traverses a SAM File, accumulating information on a per-locus basis - * - * Produces AlignmentContext objects, that contain ReadBackedPileups of PileupElements. This - * class has its core job of converting an iterator of ordered SAMRecords into those - * RBPs. 
- * - * There are a few constraints on required and ensured by LIBS: - * - * -- Requires the Iterator to returns reads in coordinate sorted order, consistent with the ordering - * defined by the SAM file format. That that for performance reasons this constraint isn't actually enforced. - * The behavior of LIBS is undefined in the case where the reads are badly ordered. - * -- The reads in the ReadBackedPileup are themselves in the order of appearance of the reads from the iterator. - * That is, the pileup is ordered in a way consistent with the SAM coordinate ordering - * -- Only aligned reads with at least one on-genomic cigar operator are passed on in the pileups. That is, - * unmapped reads or reads that are all insertions (10I) or soft clipped (10S) are not passed on. - * -- LIBS can perform per-sample downsampling of a variety of kinds. - * -- Because of downsampling there's no guarantee that: - * -- A read that could be aligned to a position will actually occur in the pileup (downsampled away) - * -- A read that appears in a previous pileup that could align to a future position will actually occur - * in that pileup. That is, a read might show up at position i but be downsampled away in the pileup at j - * -- LIBS can optionally capture all of the reads that come off the iterator, before any leveling downsampling - * occurs, if requested. 
This allows users of LIBS to see both a ReadBackedPileup view of the data as well as - * a stream of unique, sorted reads - */ -public final class LocusIteratorByState extends LocusIterator { - /** Indicates that we shouldn't do any downsampling */ - public final static LIBSDownsamplingInfo NO_DOWNSAMPLING = new LIBSDownsamplingInfo(false, -1); - - /** - * our log, which we want to capture anything from this class - */ - private final static Logger logger = Logger.getLogger(LocusIteratorByState.class); - - // ----------------------------------------------------------------------------------------------------------------- - // - // member fields - // - // ----------------------------------------------------------------------------------------------------------------- - - /** - * Used to create new GenomeLocs as needed - */ - private final GenomeLocParser genomeLocParser; - - /** - * A complete list of all samples that may come out of the reads. Must be - * comprehensive. - */ - private final ArrayList samples; - - /** - * The system that maps incoming reads from the iterator to their pileup states - */ - private final ReadStateManager readStates; - - /** - * Should we include reads in the pileup which are aligned with a deletion operator to the reference? - */ - private final boolean includeReadsWithDeletionAtLoci; - - /** - * The next alignment context. A non-null value means that a - * context is waiting from hasNext() for sending off to the next next() call. 
A null - * value means that either hasNext() has not been called at all or that - * the underlying iterator is exhausted - */ - private AlignmentContext nextAlignmentContext; - - // ----------------------------------------------------------------------------------------------------------------- - // - // constructors and other basic operations - // - // ----------------------------------------------------------------------------------------------------------------- - - /** - * Create a new LocusIteratorByState - * - * @param samIterator the iterator of reads to process into pileups. Reads must be ordered - * according to standard coordinate-sorted BAM conventions - * @param readInformation meta-information about how to process the reads (i.e., should we do downsampling?) - * @param genomeLocParser used to create genome locs - * @param samples a complete list of samples present in the read groups for the reads coming from samIterator. - * This is generally just the set of read group sample fields in the SAMFileHeader. This - * list of samples may contain a null element, and all reads without read groups will - * be mapped to this null sample - */ - public LocusIteratorByState(final Iterator samIterator, - final ReadProperties readInformation, - final GenomeLocParser genomeLocParser, - final Collection samples) { - this(samIterator, - toDownsamplingInfo(readInformation), - readInformation.includeReadsWithDeletionAtLoci(), - genomeLocParser, - samples, - readInformation.keepUniqueReadListInLIBS()); - } - - /** - * Create a new LocusIteratorByState based on a SAMFileReader using reads in an iterator it - * - * Simple constructor that uses the samples in the reader, doesn't do any downsampling, - * and makes a new GenomeLocParser using the reader. This constructor will be slow(ish) - * if you continually invoke this constructor, but it's easy to make. 
- * - * @param reader a non-null reader - * @param it an iterator from reader that has the reads we want to use to create ReadBackPileups - */ - public LocusIteratorByState(final SAMFileReader reader, final CloseableIterator it) { - this(new GATKSAMRecordIterator(it), - new LIBSDownsamplingInfo(false, 0), - true, - new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()), - SampleUtils.getSAMFileSamples(reader.getFileHeader()), - false); - } - - /** - * Create a new LocusIteratorByState - * - * @param samIterator the iterator of reads to process into pileups. Reads must be ordered - * according to standard coordinate-sorted BAM conventions - * @param downsamplingInfo meta-information about how to downsampling the reads - * @param genomeLocParser used to create genome locs - * @param samples a complete list of samples present in the read groups for the reads coming from samIterator. - * This is generally just the set of read group sample fields in the SAMFileHeader. This - * list of samples may contain a null element, and all reads without read groups will - * be mapped to this null sample - * @param maintainUniqueReadsList if true, we will keep the unique reads from off the samIterator and make them - * available via the transferReadsFromAllPreviousPileups interface - */ - public LocusIteratorByState(final Iterator samIterator, - final LIBSDownsamplingInfo downsamplingInfo, - final boolean includeReadsWithDeletionAtLoci, - final GenomeLocParser genomeLocParser, - final Collection samples, - final boolean maintainUniqueReadsList) { - if ( samIterator == null ) throw new IllegalArgumentException("samIterator cannot be null"); - if ( downsamplingInfo == null ) throw new IllegalArgumentException("downsamplingInfo cannot be null"); - if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); - if ( samples == null ) throw new IllegalArgumentException("Samples cannot be null"); - - // currently the GATK expects this 
LocusIteratorByState to accept empty sample lists, when - // there's no read data. So we need to throw this error only when samIterator.hasNext() is true - if (samples.isEmpty() && samIterator.hasNext()) { - throw new IllegalArgumentException("samples list must not be empty"); - } - - this.genomeLocParser = genomeLocParser; - this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci; - this.samples = new ArrayList(samples); - this.readStates = new ReadStateManager(samIterator, this.samples, downsamplingInfo, maintainUniqueReadsList); - } - - @Override - public Iterator iterator() { - return this; - } - - /** - * Get the current location (i.e., the bp of the center of the pileup) of the pileup, or null if not anywhere yet - * - * Assumes that read states is updated to reflect the current pileup position, but not advanced to the - * next location. - * - * @return the location of the current pileup, or null if we're after all reads - */ - private GenomeLoc getLocation() { - return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser); - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // next() routine and associated collection operations - // - // ----------------------------------------------------------------------------------------------------------------- - - /** - * Is there another pileup available? - * @return - */ - @Override - public boolean hasNext() { - lazyLoadNextAlignmentContext(); - return nextAlignmentContext != null; - } - - /** - * Get the next AlignmentContext available from the reads. - * - * @return a non-null AlignmentContext of the pileup after to the next genomic position covered by - * at least one read. 
- */ - @Override - public AlignmentContext next() { - lazyLoadNextAlignmentContext(); - if (!hasNext()) - throw new NoSuchElementException("LocusIteratorByState: out of elements."); - AlignmentContext currentAlignmentContext = nextAlignmentContext; - nextAlignmentContext = null; - return currentAlignmentContext; - } - - /** - * Move this LIBS until we are over position - * - * Will return null if cannot reach position (because we run out of data in the locus) - * - * @param position the start position of the AlignmentContext we want back - * @param stopAtFirstNonEmptySiteAfterPosition if true, we will stop as soon as we find a context with data with - * position >= position, otherwise we will return a null value - * and consume the data for the next position. This means that without - * specifying this value the LIBS will be in an indeterminate state - * after calling this function, and should be reconstructed from scratch - * for subsequent use - * @return a AlignmentContext at position, or null if this isn't possible - */ - public AlignmentContext advanceToLocus(final int position, final boolean stopAtFirstNonEmptySiteAfterPosition) { - while ( hasNext() ) { - final AlignmentContext context = next(); - - if ( context == null ) - // we ran out of data - return null; - - if ( context.getPosition() == position ) - return context; - - if ( context.getPosition() > position) - return stopAtFirstNonEmptySiteAfterPosition ? context : null; - } - - return null; - } - - /** - * Creates the next alignment context from the given state. Note that this is implemented as a - * lazy load method. nextAlignmentContext MUST BE null in order for this method to advance to the - * next entry. 
- */ - private void lazyLoadNextAlignmentContext() { - while (nextAlignmentContext == null && readStates.hasNext()) { - readStates.collectPendingReads(); - - final GenomeLoc location = getLocation(); - final Map fullPileup = new HashMap(); - - for (final Map.Entry sampleStatePair : readStates ) { - final String sample = sampleStatePair.getKey(); - final PerSampleReadStateManager readState = sampleStatePair.getValue(); - final Iterator iterator = readState.iterator(); - final List pile = new ArrayList(readState.size()); - - while (iterator.hasNext()) { - // state object with the read/offset information - final AlignmentStateMachine state = iterator.next(); - final GATKSAMRecord read = state.getRead(); - final CigarOperator op = state.getCigarOperator(); - - if (op == CigarOperator.N) // N's are never added to any pileup - continue; - - if (!dontIncludeReadInPileup(read, location.getStart())) { - if ( ! includeReadsWithDeletionAtLoci && op == CigarOperator.D ) { - continue; - } - - pile.add(state.makePileupElement()); - } - } - - if (! 
pile.isEmpty() ) // if this pileup added at least one base, add it to the full pileup - fullPileup.put(sample, new ReadBackedPileupImpl(location, pile)); - } - - readStates.updateReadStates(); // critical - must be called after we get the current state offsets and location - if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done - nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), false); - } - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // getting the list of reads - // - // ----------------------------------------------------------------------------------------------------------------- - - /** - * Transfer current list of all unique reads that have ever been used in any pileup, clearing old list - * - * This list is guaranteed to only contain unique reads, even across calls to the this function. It is - * literally the unique set of reads ever seen. - * - * The list occurs in the same order as they are encountered in the underlying iterator. - * - * Takes the maintained list of submitted reads, and transfers it to the caller of this - * function. The old list of set to a new, cleanly allocated list so the caller officially - * owns the list returned by this call. This is the only way to clear the tracking - * of submitted reads, if enabled. - * - * The purpose of this function is allow users of LIBS to keep track of all of the reads pulled off the - * underlying GATKSAMRecord iterator and that appeared at any point in the list of SAMRecordAlignmentState for - * any reads. This function is intended to allow users to efficiently reconstruct the unique set of reads - * used across all pileups. This is necessary for LIBS to handle because attempting to do - * so from the pileups coming out of LIBS is extremely expensive. 
- * - * This functionality is only available if LIBS was created with the argument to track the reads - * - * @throws UnsupportedOperationException if called when keepingSubmittedReads is false - * - * @return the current list - */ - @Ensures("result != null") - public List transferReadsFromAllPreviousPileups() { - return readStates.transferSubmittedReads(); - } - - /** - * Get the underlying list of tracked reads. For testing only - * @return a non-null list - */ - @Ensures("result != null") - protected List getReadsFromAllPreviousPileups() { - return readStates.getSubmittedReads(); - } - - // ----------------------------------------------------------------------------------------------------------------- - // - // utility functions - // - // ----------------------------------------------------------------------------------------------------------------- - - /** - * Should this read be excluded from the pileup? - * - * Generic place to put per-base filters appropriate to LocusIteratorByState - * - * @param rec the read to potentially exclude - * @param pos the genomic position of the current alignment - * @return true if the read should be excluded from the pileup, false otherwise - */ - @Requires({"rec != null", "pos > 0"}) - private boolean dontIncludeReadInPileup(final GATKSAMRecord rec, final long pos) { - return ReadUtils.isBaseInsideAdaptor(rec, pos); - } - - /** - * Create a LIBSDownsamplingInfo object from the requested info in ReadProperties - * - * LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're - * downsampling to coverage by sample. SAMDataSource will have refrained from applying - * any downsamplers to the read stream in this case, in the expectation that LIBS will - * manage the downsampling. The reason for this is twofold: performance (don't have to - * split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling - * of reads (eg., using half of a read, and throwing the rest away). 
- * - * @param readInfo GATK engine information about what should be done to the reads - * @return a LIBS specific info holder about downsampling only - */ - @Requires("readInfo != null") - @Ensures("result != null") - private static LIBSDownsamplingInfo toDownsamplingInfo(final ReadProperties readInfo) { - final boolean performDownsampling = readInfo.getDownsamplingMethod() != null && - readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && - readInfo.getDownsamplingMethod().toCoverage != null; - final int coverage = performDownsampling ? readInfo.getDownsamplingMethod().toCoverage : 0; - - return new LIBSDownsamplingInfo(performDownsampling, coverage); - } - - /** - * Create a pileup element for read at offset - * - * offset must correspond to a valid read offset given the read's cigar, or an IllegalStateException will be throw - * - * @param read a read - * @param offset the offset into the bases we'd like to use in the pileup - * @return a valid PileupElement with read and at offset - */ - @Ensures("result != null") - public static PileupElement createPileupForReadAndOffset(final GATKSAMRecord read, final int offset) { - if ( read == null ) throw new IllegalArgumentException("read cannot be null"); - if ( offset < 0 || offset >= read.getReadLength() ) throw new IllegalArgumentException("Invalid offset " + offset + " outside of bounds 0 and " + read.getReadLength()); - - final AlignmentStateMachine stateMachine = new AlignmentStateMachine(read); - - while ( stateMachine.stepForwardOnGenome() != null ) { - if ( stateMachine.getReadOffset() == offset ) - return stateMachine.makePileupElement(); - } - - throw new IllegalStateException("Tried to create a pileup for read " + read + " with offset " + offset + - " but we never saw such an offset in the alignment state machine"); - } - - /** - * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list - * for the system. 
- */ - public static List sampleListForSAMWithoutReadGroups() { - List samples = new ArrayList(); - samples.add(null); - return samples; - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/PerSampleReadStateManager.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/PerSampleReadStateManager.java deleted file mode 100644 index e6d49c354..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/PerSampleReadStateManager.java +++ /dev/null @@ -1,261 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.locusiterator; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import com.google.java.contract.Requires; -import htsjdk.samtools.CigarOperator; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.downsampling.Downsampler; -import org.broadinstitute.gatk.engine.downsampling.LevelingDownsampler; - -import java.util.Iterator; -import java.util.LinkedList; -import java.util.List; - -/** - * ReadStateManager for a single sample - * - * User: depristo - * Date: 1/13/13 - * Time: 12:28 PM - */ -@Invariant({ - "readStartsAreWellOrdered()", - "! isDownsampling() || downsamplingTarget > 0", - "nSites >= 0", - "nSitesNeedingDownsampling >= 0", - "nSitesNeedingDownsampling <= nSites" -}) -final class PerSampleReadStateManager implements Iterable { - private final static Logger logger = Logger.getLogger(ReadStateManager.class); - private final static boolean CAPTURE_DOWNSAMPLING_STATS = false; - - /** - * A list (potentially empty) of alignment state machines. - * - * The state machines must be ordered by the alignment start of their underlying reads, with the - * lowest alignment starts on the left, and the largest on the right - */ - private LinkedList readStatesByAlignmentStart = new LinkedList(); - - private final Downsampler> levelingDownsampler; - private final int downsamplingTarget; - - /** - * The number of sites where downsampling has been invoked - */ - private int nSitesNeedingDownsampling = 0; - - /** - * The number of sites we've visited - */ - private int nSites = 0; - - /** - * Create a new PerSampleReadStateManager with downsampling parameters as requested by LIBSDownsamplingInfo - * @param LIBSDownsamplingInfo the downsampling params we want to use - */ - public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { - this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? 
LIBSDownsamplingInfo.getToCoverage() : -1; - this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() - ? new LevelingDownsampler, AlignmentStateMachine>(LIBSDownsamplingInfo.getToCoverage()) - : null; - } - - /** - * Group the underlying readStatesByAlignmentStart into a list of list of alignment state machines, - * where each list contains machines with a unique genome site. The outer list is ordered - * by alignment start. - * - * For example, if the flat list has alignment starts [10, 10, 11, 12, 12, 13] then - * the resulting grouping will be [[10, 10], [11], [12, 12], [13]]. - * - * @return a non-null list of lists - */ - @Ensures("result != null") - private List> groupByAlignmentStart() { - final LinkedList> grouped = new LinkedList>(); - - AlignmentStateMachine last = null; - for ( final AlignmentStateMachine stateMachine : readStatesByAlignmentStart ) { - if ( last == null || stateMachine.getGenomeOffset() != last.getGenomeOffset() ) { - // we've advanced to a place where the state machine has a different state, - // so start a new list - grouped.add(new LinkedList()); - last = stateMachine; - } - grouped.getLast().add(stateMachine); - } - - return grouped; - } - - /** - * Flattens the grouped list of list of alignment state machines into a single list in order - * @return a non-null list contains the state machines - */ - @Ensures("result != null") - private LinkedList flattenByAlignmentStart(final List> grouped) { - final LinkedList flat = new LinkedList(); - for ( final List l : grouped ) - flat.addAll(l); - return flat; - } - - /** - * Test that the reads are ordered by their alignment starts - * @return true if well ordered, false otherwise - */ - private boolean readStartsAreWellOrdered() { - int lastStart = -1; - for ( final AlignmentStateMachine machine : readStatesByAlignmentStart ) { - if ( lastStart > machine.getRead().getAlignmentStart() ) - return false; - lastStart = machine.getRead().getAlignmentStart(); - } - return true; - } 
- - /** - * Assumes it can just keep the states linked lists without making a copy - * @param states the new states to add to this manager - * @return The change in the number of states, after including states and potentially downsampling. Note - * that this return result might be negative, if downsampling is enabled, as we might drop - * more sites than have been added by the downsampler - */ - @Requires("states != null") - public int addStatesAtNextAlignmentStart(final LinkedList states) { - if ( states.isEmpty() ) { - return 0; - } - - readStatesByAlignmentStart.addAll(states); - int nStatesAdded = states.size(); - - if ( isDownsampling() && readStatesByAlignmentStart.size() > downsamplingTarget ) { - // only go into the downsampling branch if we are downsampling and the coverage > the target - captureDownsamplingStats(); - levelingDownsampler.submit(groupByAlignmentStart()); - levelingDownsampler.signalEndOfInput(); - - nStatesAdded -= levelingDownsampler.getNumberOfDiscardedItems(); - - // use returned List directly rather than make a copy, for efficiency's sake - readStatesByAlignmentStart = flattenByAlignmentStart(levelingDownsampler.consumeFinalizedItems()); - levelingDownsampler.resetStats(); - } - - return nStatesAdded; - } - - /** - * Is downsampling enabled for this manager? - * @return true if we are downsampling, false otherwise - */ - private boolean isDownsampling() { - return levelingDownsampler != null; - } - - /** - * Get the leftmost alignment state machine, or null if the read states is empty - * @return a potentially null AlignmentStateMachine - */ - public AlignmentStateMachine getFirst() { - return isEmpty() ? 
null : readStatesByAlignmentStart.getFirst(); - } - - /** - * Capture some statistics about the behavior of the downsampling, but only if CAPTURE_DOWNSAMPLING_STATS is true - */ - @Requires("isDownsampling()") - private void captureDownsamplingStats() { - if ( CAPTURE_DOWNSAMPLING_STATS ) { - nSites++; - final int loc = getFirst().getGenomePosition(); - String message = "Pass through"; - final boolean downsampling = size() > downsamplingTarget; - if ( downsampling ) { - nSitesNeedingDownsampling++; - message = "Downsampling"; - } - - if ( downsampling || nSites % 10000 == 0 ) - logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", - message, loc, size(), downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); - } - } - - /** - * Is there at least one alignment for this sample in this manager? - * @return true if there's at least one alignment, false otherwise - */ - public boolean isEmpty() { - return readStatesByAlignmentStart.isEmpty(); - } - - /** - * Get the number of read states currently in this manager - * @return the number of read states - */ - @Ensures("result >= 0") - public int size() { - return readStatesByAlignmentStart.size(); - } - - /** - * Advances all read states forward by one element, removing states that are - * no long aligned to the current position. - * @return the number of states we're removed after advancing - */ - public int updateReadStates() { - int nRemoved = 0; - final Iterator it = iterator(); - while (it.hasNext()) { - final AlignmentStateMachine state = it.next(); - final CigarOperator op = state.stepForwardOnGenome(); - if (op == null) { - // we discard the read only when we are past its end AND indel at the end of the read (if any) was - // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe - // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. 
- it.remove(); // we've stepped off the end of the object - nRemoved++; - } - } - - return nRemoved; - } - - /** - * Iterate over the AlignmentStateMachine in this manager in alignment start order. - * @return a valid iterator - */ - @Ensures("result != null") - public Iterator iterator() { - return readStatesByAlignmentStart.iterator(); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/SamplePartitioner.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/SamplePartitioner.java deleted file mode 100644 index 825cb350a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/SamplePartitioner.java +++ /dev/null @@ -1,172 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.locusiterator; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.downsampling.Downsampler; -import org.broadinstitute.gatk.engine.downsampling.PassThroughDownsampler; -import org.broadinstitute.gatk.engine.downsampling.ReservoirDownsampler; - -import java.util.*; - -/** - * Divides reads by sample and (if requested) does a preliminary downsampling pass - * with a ReservoirDownsampler. - * - * Note: stores reads by sample ID string, not by sample object - */ -class SamplePartitioner { - /** - * Map from sample name (as a string) to a downsampler of reads for that sample - */ - final private Map> readsBySample; - - /** - * Are we in a state where we're done submitting reads and have semi-finalized the - * underlying per sample downsampler? - */ - boolean doneSubmittingReads = false; - - /** - * Create a new SamplePartitioner capable of splitting reads up into buckets of reads for - * each sample in samples, and perform a preliminary downsampling of these reads - * (separately for each sample) if downsampling is requested in LIBSDownsamplingInfo - * - * Note that samples must be comprehensive, in that all reads every submitted to this - * partitioner must come from one of the samples provided here. If not, submitRead - * will throw an exception. Duplicates in the list of samples will be ignored - * - * @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage? - * @param samples the complete list of samples we're going to partition reads into. Can be - * empty, but in that case this code cannot function properly if you - * attempt to add data to it. 
- */ - @Ensures({ - "readsBySample != null", - "readsBySample.size() == new HashSet(samples).size()" - }) - public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List samples) { - if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null"); - if ( samples == null ) throw new IllegalArgumentException("samples must be a non-null list"); - - readsBySample = new LinkedHashMap>(samples.size()); - for ( final String sample : samples ) { - readsBySample.put(sample, createDownsampler(LIBSDownsamplingInfo)); - } - } - - /** - * Create a new, ready to use downsampler based on the parameters in LIBSDownsamplingInfo - * @param LIBSDownsamplingInfo the parameters to use in creating the downsampler - * @return a downsampler appropriate for LIBSDownsamplingInfo. If no downsampling is requested, - * uses the PassThroughDownsampler, which does nothing at all. - */ - @Requires("LIBSDownsamplingInfo != null") - @Ensures("result != null") - private Downsampler createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { - return LIBSDownsamplingInfo.isPerformDownsampling() - ? new ReservoirDownsampler(LIBSDownsamplingInfo.getToCoverage(), true) - : new PassThroughDownsampler(); - } - - /** - * Offer this read to the partitioner, putting it into the bucket of reads for the sample - * of read (obtained via the read's read group). - * - * If the read group is missing, uses the special "null" read group - * - * @throws IllegalStateException if the sample of read wasn't present in the original - * set of samples provided to this SamplePartitioner at construction - * - * @param read the read to add to the sample's list of reads - */ - @Requires("read != null") - @Ensures("doneSubmittingReads == false") - public void submitRead(final T read) { - final String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; - final Downsampler downsampler = readsBySample.get(sampleName); - if ( downsampler == null ) - throw new IllegalStateException("Offered read with sample name " + sampleName + " to SamplePartitioner " + - "but this sample wasn't provided as one of possible samples at construction"); - - downsampler.submit(read); - doneSubmittingReads = false; - } - - /** - * Tell this partitioner that all reads in this cycle have been submitted, so that we - * can finalize whatever downsampling is required by each sample. - * - * Note that we *must* call this function before getReadsForSample, or else that - * function will exception out. - */ - @Ensures("doneSubmittingReads == true") - public void doneSubmittingReads() { - for ( final Downsampler downsampler : readsBySample.values() ) { - downsampler.signalEndOfInput(); - } - doneSubmittingReads = true; - } - - /** - * Get the final collection of reads for this sample for this cycle - * - * The cycle is defined as all of the reads that occur between - * the first call to submitRead until doneSubmittingReads is called. At that - * point additional downsampling may occur (depending on construction arguments) - * and that set of reads is returned here. - * - * Note that this function can only be called once per cycle, as underlying - * collection of reads is cleared. - * - * @param sampleName the sample we want reads for, must be present in the original samples - * @return a non-null collection of reads for sample in this cycle - */ - @Ensures("result != null") - public Collection getReadsForSample(final String sampleName) { - if ( ! 
doneSubmittingReads ) throw new IllegalStateException("getReadsForSample called before doneSubmittingReads was called"); - - final Downsampler downsampler = readsBySample.get(sampleName); - if ( downsampler == null ) throw new NoSuchElementException("Sample name not found"); - - return downsampler.consumeFinalizedItems(); - } - - /** - * Resets this SamplePartitioner, indicating that we're starting a new - * cycle of adding reads to each underlying downsampler. - */ - @Ensures("doneSubmittingReads == false") - public void reset() { - for ( final Downsampler downsampler : readsBySample.values() ) { - downsampler.clearItems(); - downsampler.resetStats(); - } - doneSubmittingReads = false; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java deleted file mode 100644 index 6c4460cb3..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java +++ /dev/null @@ -1,357 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.pairhmm; - -import com.google.java.contract.Requires; -import htsjdk.variant.variantcontext.Allele; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; -import org.broadinstitute.gatk.utils.haplotype.Haplotype; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.Arrays; -import java.util.Collection; -import java.util.List; -import java.util.Map; -/** - * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. - * - * User: rpoplin - * Date: 10/16/12 - */ -public abstract class PairHMM { - protected final static Logger logger = Logger.getLogger(PairHMM.class); - - protected boolean constantsAreInitialized = false; - - protected byte[] previousHaplotypeBases; - protected int hapStartIndex; - - public enum HMM_IMPLEMENTATION { - /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ - EXACT, - /* PairHMM as implemented for the UnifiedGenotyper. 
Uses log10 sum functions accurate to only 1E-4 */ - ORIGINAL, - /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ - LOGLESS_CACHING, - /* Optimized AVX implementation of LOGLESS_CACHING called through JNI */ - VECTOR_LOGLESS_CACHING, - /* Debugging for vector implementation of LOGLESS_CACHING */ - DEBUG_VECTOR_LOGLESS_CACHING, - /* Logless caching PairHMM that stores computations in 1D arrays instead of matrices, and which proceeds diagonally over the (read x haplotype) intersection matrix */ - ARRAY_LOGLESS - } - - protected int maxHaplotypeLength, maxReadLength; - protected int paddedMaxReadLength, paddedMaxHaplotypeLength; - protected int paddedReadLength, paddedHaplotypeLength; - protected boolean initialized = false; - - // only used for debugging purposes - protected boolean doNotUseTristateCorrection = false; - protected void doNotUseTristateCorrection() { doNotUseTristateCorrection = true; } - - //debug array - protected double[] mLikelihoodArray; - - //profiling information - protected static Boolean doProfiling = true; - protected static long pairHMMComputeTime = 0; - protected long threadLocalPairHMMComputeTimeDiff = 0; - protected long startTime = 0; - - /** - * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths - * - * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. 
- * - * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM - * @param readMaxLength the max length of reads we want to use with this PairHMM - */ - public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { - if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); - if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); - - maxHaplotypeLength = haplotypeMaxLength; - maxReadLength = readMaxLength; - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - paddedMaxReadLength = readMaxLength + 1; - paddedMaxHaplotypeLength = haplotypeMaxLength + 1; - - previousHaplotypeBases = null; - - constantsAreInitialized = false; - initialized = true; - } - - /** - * Called at the end of PairHMM for a region - mostly used by the JNI implementations - */ - public void finalizeRegion() - { - ; - } - - /** - * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths - * This function is used by the JNI implementations to transfer all data once to the native code - * @param haplotypes the list of haplotypes - * @param perSampleReadList map from sample name to list of reads - * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM - * @param readMaxLength the max length of reads we want to use with this PairHMM - */ - public void initialize( final List haplotypes, final Map> perSampleReadList, final int readMaxLength, final int haplotypeMaxLength ) { - initialize(readMaxLength, haplotypeMaxLength); - } - - private int findMaxReadLength(final GATKSAMRecord ... 
reads) { - int max = 0; - for (final GATKSAMRecord read : reads) { - final int readLength = read.getReadLength(); - if (max < readLength) - max = readLength; - } - return max; - } - - private int findMaxAlleleLength(final List alleles) { - int max = 0; - for (final Allele allele : alleles) { - final int alleleLength = allele.length(); - if (max < alleleLength) - max = alleleLength; - } - return max; - } - - protected int findMaxReadLength(final List reads) { - int listMaxReadLength = 0; - for(GATKSAMRecord read : reads){ - final int readLength = read.getReadLength(); - if( readLength > listMaxReadLength ) { listMaxReadLength = readLength; } - } - return listMaxReadLength; - } - - protected int findMaxHaplotypeLength(final Collection haplotypes) { - int listMaxHaplotypeLength = 0; - for( final Haplotype h : haplotypes) { - final int haplotypeLength = h.getBases().length; - if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } - } - return listMaxHaplotypeLength; - } - - /** - * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from - * each haplotype given base substitution, insertion, and deletion probabilities. - * - * @param processedReads reads to analyze instead of the ones present in the destination read-likelihoods. - * @param likelihoods where to store the likelihoods where position [a][r] is reserved for the likelihood of {@code reads[r]} - * conditional to {@code alleles[a]}. - * @param gcp penalty for gap continuations base array map for processed reads. - * - * @throws IllegalArgumentException - * - * @return never {@code null}. 
- */ - public void computeLikelihoods(final ReadLikelihoods.Matrix likelihoods, - final List processedReads, - final Map gcp) { - if (processedReads.isEmpty()) - return; - if(doProfiling) - startTime = System.nanoTime(); - // (re)initialize the pairHMM only if necessary - final int readMaxLength = findMaxReadLength(processedReads); - final int haplotypeMaxLength = findMaxAlleleLength(likelihoods.alleles()); - if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) - initialize(readMaxLength, haplotypeMaxLength); - - final int readCount = processedReads.size(); - final List alleles = likelihoods.alleles(); - final int alleleCount = alleles.size(); - mLikelihoodArray = new double[readCount * alleleCount]; - int idx = 0; - int readIndex = 0; - for(final GATKSAMRecord read : processedReads){ - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); - final byte[] readInsQuals = read.getBaseInsertionQualities(); - final byte[] readDelQuals = read.getBaseDeletionQualities(); - final byte[] overallGCP = gcp.get(read); - - // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) - final boolean isFirstHaplotype = true; - for (int a = 0; a < alleleCount; a++) { - final Allele allele = alleles.get(a); - final byte[] alleleBases = allele.getBases(); - final byte[] nextAlleleBases = a == alleles.size() - 1 ? 
null : alleles.get(a + 1).getBases(); - final double lk = computeReadLikelihoodGivenHaplotypeLog10(alleleBases, - readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextAlleleBases); - likelihoods.set(a, readIndex, lk); - mLikelihoodArray[idx++] = lk; - } - readIndex++; - } - if(doProfiling) { - threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime); - //synchronized(doProfiling) - { - pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff; - } - } - } - - /** - * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion - * probabilities. - * - * Note on using hapStartIndex. This allows you to compute the exact true likelihood of a full haplotypes - * given a read, assuming that the previous calculation read over a full haplotype, recaching the read values, - * starting only at the place where the new haplotype bases and the previous haplotype bases different. This - * index is 0-based, and can be computed with findFirstPositionWhereHaplotypesDiffer given the two haplotypes. - * Note that this assumes that the read and all associated quals values are the same. - * - * @param haplotypeBases the full sequence (in standard SAM encoding) of the haplotype, must be >= than read bases in length - * @param readBases the bases (in standard encoding) of the read, must be <= haplotype bases in length - * @param readQuals the phred-scaled per base substitution quality scores of read. Must be the same length as readBases - * @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases - * @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases - * @param overallGCP the phred-scaled gap continuation penalties scores of read. 
Must be the same length as readBases - * @param recacheReadValues if false, we don't recalculate any cached results, assuming that readBases and its associated - * parameters are the same, and only the haplotype bases are changing underneath us - * @return the log10 probability of read coming from the haplotype under the provided error model - */ - protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final boolean recacheReadValues, - final byte[] nextHaploytpeBases) { - - if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); - if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); - if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); - if ( readBases == null ) throw new IllegalArgumentException("readBases cannot be null"); - if ( readBases.length > maxReadLength ) throw new IllegalArgumentException("readBases is too long, got " + readBases.length + " but max is " + maxReadLength); - if ( readQuals.length != readBases.length ) throw new IllegalArgumentException("Read bases and read quals aren't the same size: " + readBases.length + " vs " + readQuals.length); - if ( insertionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read insertion quals aren't the same size: " + readBases.length + " vs " + insertionGOP.length); - if ( deletionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read deletion quals aren't the same size: " + readBases.length + " vs " + deletionGOP.length); - if ( overallGCP.length != readBases.length ) throw new IllegalArgumentException("Read bases 
and overall GCP aren't the same size: " + readBases.length + " vs " + overallGCP.length); - - paddedReadLength = readBases.length + 1; - paddedHaplotypeLength = haplotypeBases.length + 1; - - hapStartIndex = (recacheReadValues) ? 0 : hapStartIndex; - - // Pre-compute the difference between the current haplotype and the next one to be run - // Looking ahead is necessary for the ArrayLoglessPairHMM implementation - final int nextHapStartIndex = (nextHaploytpeBases == null || haplotypeBases.length != nextHaploytpeBases.length) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, nextHaploytpeBases); - - double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues, nextHapStartIndex); - - if ( result > 0.0) - throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f, PairHMM: %s", new String(haplotypeBases), new String(readBases), result, this.getClass().getSimpleName())); - else if (!MathUtils.goodLog10Probability(result)) - throw new IllegalStateException("Invalid Log Probability: " + result); - - // Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of). - // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. - previousHaplotypeBases = haplotypeBases; - - // For the next iteration, the hapStartIndex for the next haploytpe becomes the index for the current haplotype - // The array implementation has to look ahead to the next haplotype to store caching info. It cannot do this if nextHapStart is before hapStart - hapStartIndex = (nextHapStartIndex < hapStartIndex) ? 
0: nextHapStartIndex; - - return result; - } - - /** - * To be overloaded by subclasses to actually do calculation for #computeReadLikelihoodGivenHaplotypeLog10 - */ - @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", - "readBases.length == overallGCP.length", "matchMatrix!=null", "insertionMatrix!=null", "deletionMatrix!=null"}) - protected abstract double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, - final byte[] readBases, - final byte[] readQuals, - final byte[] insertionGOP, - final byte[] deletionGOP, - final byte[] overallGCP, - final int hapStartIndex, - final boolean recacheReadValues, - final int nextHapStartIndex); - - /** - * Compute the first position at which two haplotypes differ - * - * If the haplotypes are exact copies of each other, returns the min length of the two haplotypes. - * - * @param haplotype1 the first haplotype1 - * @param haplotype2 the second haplotype1 - * @return the index of the first position in haplotype1 and haplotype2 where the byte isn't the same - */ - public static int findFirstPositionWhereHaplotypesDiffer(final byte[] haplotype1, final byte[] haplotype2) { - if ( haplotype1 == null || haplotype1.length == 0 ) throw new IllegalArgumentException("Haplotype1 is bad " + Arrays.toString(haplotype1)); - if ( haplotype2 == null || haplotype2.length == 0 ) throw new IllegalArgumentException("Haplotype2 is bad " + Arrays.toString(haplotype2)); - - for( int iii = 0; iii < haplotype1.length && iii < haplotype2.length; iii++ ) { - if( haplotype1[iii] != haplotype2[iii] ) { - return iii; - } - } - - return Math.min(haplotype1.length, haplotype2.length); - } - - /** - * Use number of threads to set doProfiling flag - doProfiling iff numThreads == 1 - * This function should be called only during initialization phase - single thread phase of HC - */ - public static void setNumberOfThreads(final int numThreads) - { - 
doProfiling = (numThreads == 1); - if(numThreads > 1) - logger.info("Performance profiling for PairHMM is disabled because HaplotypeCaller is being run with multiple threads (-nct>1) option\nProfiling is enabled only when running in single thread mode\n"); - } - - /** - * Return the results of the computeLikelihoods function - */ - public double[] getLikelihoodArray() { return mLikelihoodArray; } - /** - * Called at the end of the program to close files, print profiling information etc - */ - public void close() - { - if(doProfiling) - System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(pairHMMComputeTime*1e-9)); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMReadyHaplotypes.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMReadyHaplotypes.java deleted file mode 100644 index 29484048e..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMReadyHaplotypes.java +++ /dev/null @@ -1,182 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.pairhmm; - -import java.util.*; - -/** - * Collection of haplotypes sorted in a conveniently way to be run efficiently by the PairHMM. - * - * TODO not yet in use but likely to be as part of making graph-base likelihood run faster. - * TODO this could be extended to the classical PairHMM implementation simplifyling the PairHMM API. - */ -public class PairHMMReadyHaplotypes implements Iterable { - - - public class Entry { - - private final byte[] bases; - - private double likelihood = Double.NaN; - - protected Entry(final byte[] bases) { - this.bases = bases; - } - - protected byte[] getBases() { - return bases; - } - - public void setLikelihood(final double lk) { - likelihood = lk; - } - - public double getLikelihood() { - return likelihood; - } - - } - - private Map> commonPrefixLength; - - private SortedSet entries; - - private int capacity; - - private final Comparator comparator = new Comparator() { - @Override - public int compare(final Entry o1, final Entry o2) { - final byte[] b1 = o1.bases; - final byte[] b2 = o2.bases; - Map b1map = commonPrefixLength.get(o1); - if (b1map == null) - commonPrefixLength.put(o1, b1map = new HashMap<>(capacity)); - Map b2map = commonPrefixLength.get(o2); - if (b2map == null) - commonPrefixLength.put(o2, b2map = new HashMap<>(capacity)); - final Integer previousI = b1map.get(o2) == null ? 
null : b1map.get(o2); - int i; - int result; - final int iLimit = Math.min(b1.length,b2.length); - if (previousI == null) { - for (i = 0; i < iLimit; i++) - if (b1[i] != b2[i]) - break; - b1map.put(o2,i); - b2map.put(o1,i); - } else - i = previousI; - - if (i < iLimit) - result = Byte.compare(b1[i],b2[i]); - else if (b1.length == b2.length) - result = 0; - else - result = b1.length < b2.length ? -1 : 1; - return result; - } - }; - - public PairHMMReadyHaplotypes(final int capacity) { - commonPrefixLength = new HashMap<>(capacity); - entries = new TreeSet<>(comparator); - } - - public void add(final byte[] bases) { - final Entry entry = new Entry(bases); - entries.add(entry); - } - - public int size() { - return entries.size(); - } - - @Override - public Iterator iterator() { - return new Iterator(); - } - - public class Iterator implements java.util.Iterator { - - private java.util.Iterator actualIterator; - private Entry previousEntry; - private Entry currentEntry; - private int startIndex; - private int cmp; - - private Iterator() { - actualIterator = entries.iterator(); - } - - public boolean hasNext() { - return actualIterator.hasNext(); - } - - public Entry next() { - previousEntry = currentEntry; - final Entry result = currentEntry = actualIterator.next(); - startIndex = -1; - return result; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - public byte[] bases() { - if (currentEntry == null) - throw new NoSuchElementException(); - return currentEntry.bases; - } - - public int startIndex() { - if (startIndex >= 0) - return startIndex; - else if (previousEntry == null) - return startIndex = 0; - else { - // The comparator will make sure the common-prefix-length is updated. - // The result in a field so that we avoid dead code elimination. - // perhaps I a bit paranohic but it does not harm to prevent. 
- cmp = comparator.compare(previousEntry,currentEntry); - return startIndex = commonPrefixLength.get(previousEntry).get(currentEntry); - } - } - - @Override - public String toString() { - return super.toString() + " cmp = " + cmp; - } - - public void setLikelihood(final double likelihood) { - if (currentEntry == null) - throw new NoSuchElementException(); - currentEntry.setLikelihood(likelihood); - } - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileupImpl.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileupImpl.java deleted file mode 100644 index 840fbebd1..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileupImpl.java +++ /dev/null @@ -1,1040 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.pileup; - -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.fragments.FragmentCollection; -import org.broadinstitute.gatk.utils.fragments.FragmentUtils; -import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.BaseUtils; - -import java.util.*; - -public class ReadBackedPileupImpl implements ReadBackedPileup { - protected final GenomeLoc loc; - protected final PileupElementTracker pileupElementTracker; - - private final static int UNINITIALIZED_CACHED_INT_VALUE = -1; - - /** - * Different then number of elements due to reduced reads - */ - private int depthOfCoverage = UNINITIALIZED_CACHED_INT_VALUE; - private int nDeletions = UNINITIALIZED_CACHED_INT_VALUE; // cached value of the number of deletions - private int nMQ0Reads = UNINITIALIZED_CACHED_INT_VALUE; // cached value of the number of MQ0 reads - - /** - * Create a new version of a read backed pileup at loc, using the reads and their corresponding - * offsets. This pileup will contain a list, in order of the reads, of the piled bases at - * reads[i] for all i in offsets. Does not make a copy of the data, so it's not safe to - * go changing the reads. 
- * - * @param loc The genome loc to associate reads wotj - * @param reads - * @param offsets - */ - public ReadBackedPileupImpl(GenomeLoc loc, List reads, List offsets) { - this.loc = loc; - this.pileupElementTracker = readsOffsets2Pileup(reads, offsets); - } - - - /** - * Create a new version of a read backed pileup at loc without any aligned reads - */ - public ReadBackedPileupImpl(GenomeLoc loc) { - this(loc, new UnifiedPileupElementTracker()); - } - - /** - * Create a new version of a read backed pileup at loc, using the reads and their corresponding - * offsets. This lower level constructure assumes pileup is well-formed and merely keeps a - * pointer to pileup. Don't go changing the data in pileup. - */ - public ReadBackedPileupImpl(GenomeLoc loc, List pileup) { - if (loc == null) throw new ReviewedGATKException("Illegal null genomeloc in ReadBackedPileup"); - if (pileup == null) throw new ReviewedGATKException("Illegal null pileup in ReadBackedPileup"); - - this.loc = loc; - this.pileupElementTracker = new UnifiedPileupElementTracker(pileup); - } - - /** - * Optimization of above constructor where all of the cached data is provided - * - * @param loc - * @param pileup - */ - @Deprecated - public ReadBackedPileupImpl(GenomeLoc loc, List pileup, int size, int nDeletions, int nMQ0Reads) { - this(loc, pileup); - } - - protected ReadBackedPileupImpl(GenomeLoc loc, PileupElementTracker tracker) { - this.loc = loc; - this.pileupElementTracker = tracker; - } - - public ReadBackedPileupImpl(GenomeLoc loc, Map pileupsBySample) { - this.loc = loc; - PerSamplePileupElementTracker tracker = new PerSamplePileupElementTracker(); - for (Map.Entry pileupEntry : pileupsBySample.entrySet()) { - tracker.addElements(pileupEntry.getKey(), pileupEntry.getValue().pileupElementTracker); - } - this.pileupElementTracker = tracker; - } - - public ReadBackedPileupImpl(GenomeLoc loc, List reads, int offset) { - this.loc = loc; - this.pileupElementTracker = readsOffsets2Pileup(reads, 
offset); - } - - /** - * Helper routine for converting reads and offset lists to a PileupElement list. - * - * @param reads - * @param offsets - * @return - */ - private PileupElementTracker readsOffsets2Pileup(List reads, List offsets) { - if (reads == null) throw new ReviewedGATKException("Illegal null read list in UnifiedReadBackedPileup"); - if (offsets == null) throw new ReviewedGATKException("Illegal null offsets list in UnifiedReadBackedPileup"); - if (reads.size() != offsets.size()) - throw new ReviewedGATKException("Reads and offset lists have different sizes!"); - - UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); - for (int i = 0; i < reads.size(); i++) { - GATKSAMRecord read = reads.get(i); - int offset = offsets.get(i); - pileup.add(createNewPileupElement(read, offset)); // only used to create fake pileups for testing so ancillary information is not important - } - - return pileup; - } - - /** - * Helper routine for converting reads and a single offset to a PileupElement list. 
- * - * @param reads - * @param offset - * @return - */ - private PileupElementTracker readsOffsets2Pileup(List reads, int offset) { - if (reads == null) throw new ReviewedGATKException("Illegal null read list in UnifiedReadBackedPileup"); - if (offset < 0) throw new ReviewedGATKException("Illegal offset < 0 UnifiedReadBackedPileup"); - - UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); - for (GATKSAMRecord read : reads) { - pileup.add(createNewPileupElement(read, offset)); // only used to create fake pileups for testing so ancillary information is not important - } - - return pileup; - } - - protected ReadBackedPileupImpl createNewPileup(GenomeLoc loc, PileupElementTracker tracker) { - return new ReadBackedPileupImpl(loc, tracker); - } - - protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset) { - return LocusIteratorByState.createPileupForReadAndOffset(read, offset); - } - - // -------------------------------------------------------- - // - // Special 'constructors' - // - // -------------------------------------------------------- - - /** - * Returns a new ReadBackedPileup that is free of deletion spanning reads in this pileup. Note that this - * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy - * of the pileup (just returns this) if there are no deletions in the pileup. 
- * - * @return - */ - @Override - public ReadBackedPileupImpl getPileupWithoutDeletions() { - if (getNumberOfDeletions() > 0) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupWithoutDeletions(); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return createNewPileup(loc, filteredTracker); - - } else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : tracker) { - if (!p.isDeletion()) { - filteredTracker.add(p); - } - } - return createNewPileup(loc, filteredTracker); - } - } else { - return this; - } - } - - /** - * Returns a new ReadBackedPileup where only one read from an overlapping read - * pair is retained. If the two reads in question disagree to their basecall, - * neither read is retained. If they agree on the base, the read with the higher - * base quality observation is retained - * - * @return the newly filtered pileup - */ - @Override - public ReadBackedPileup getOverlappingFragmentFilteredPileup() { - return getOverlappingFragmentFilteredPileup(true, true); - } - - /** - * Returns a new ReadBackedPileup where only one read from an overlapping read - * pair is retained. If discardDiscordant and the two reads in question disagree to their basecall, - * neither read is retained. 
Otherwise, the read with the higher - * quality (base or mapping, depending on baseQualNotMapQual) observation is retained - * - * @return the newly filtered pileup - */ - @Override - public ReadBackedPileupImpl getOverlappingFragmentFilteredPileup(boolean discardDiscordant, boolean baseQualNotMapQual) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return createNewPileup(loc, filteredTracker); - } else { - Map filteredPileup = new HashMap(); - - for (PileupElement p : pileupElementTracker) { - String readName = p.getRead().getReadName(); - - // if we've never seen this read before, life is good - if (!filteredPileup.containsKey(readName)) { - filteredPileup.put(readName, p); - } else { - PileupElement existing = filteredPileup.get(readName); - - // if the reads disagree at this position, throw them both out. 
Otherwise - // keep the element with the higher quality score - if (discardDiscordant && existing.getBase() != p.getBase()) { - filteredPileup.remove(readName); - } else { - if (baseQualNotMapQual) { - if (existing.getQual() < p.getQual()) - filteredPileup.put(readName, p); - } - else { - if (existing.getMappingQual() < p.getMappingQual()) - filteredPileup.put(readName, p); - } - } - } - } - - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement filteredElement : filteredPileup.values()) - filteredTracker.add(filteredElement); - - return createNewPileup(loc, filteredTracker); - } - } - - - /** - * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this - * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy - * of the pileup (just returns this) if there are no MQ0 reads in the pileup. - * - * @return - */ - @Override - public ReadBackedPileupImpl getPileupWithoutMappingQualityZeroReads() { - if (getNumberOfMappingQualityZeroReads() > 0) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupWithoutMappingQualityZeroReads(); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return createNewPileup(loc, filteredTracker); - - } else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : tracker) { - if (p.getRead().getMappingQuality() > 0) { - 
filteredTracker.add(p); - } - } - return createNewPileup(loc, filteredTracker); - } - } else { - return this; - } - } - - public ReadBackedPileupImpl getPositiveStrandPileup() { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPositiveStrandPileup(); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return createNewPileup(loc, filteredTracker); - } else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : tracker) { - if (!p.getRead().getReadNegativeStrandFlag()) { - filteredTracker.add(p); - } - } - return createNewPileup(loc, filteredTracker); - } - } - - /** - * Gets the pileup consisting of only reads on the negative strand. - * - * @return A read-backed pileup consisting only of reads on the negative strand. 
- */ - public ReadBackedPileupImpl getNegativeStrandPileup() { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getNegativeStrandPileup(); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return createNewPileup(loc, filteredTracker); - } else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : tracker) { - if (p.getRead().getReadNegativeStrandFlag()) { - filteredTracker.add(p); - } - } - return createNewPileup(loc, filteredTracker); - } - } - - /** - * Gets a pileup consisting of all those elements passed by a given filter. - * - * @param filter Filter to use when testing for elements. - * @return a pileup without the given filtered elements. 
- */ - public ReadBackedPileupImpl getFilteredPileup(PileupElementFilter filter) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getFilteredPileup(filter); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - - return createNewPileup(loc, filteredTracker); - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : pileupElementTracker) { - if (filter.allow(p)) - filteredTracker.add(p); - } - - return createNewPileup(loc, filteredTracker); - } - } - - /** - * Returns subset of this pileup that contains only bases with quality >= minBaseQ, coming from - * reads with mapping qualities >= minMapQ. This method allocates and returns a new instance of ReadBackedPileup. 
- * - * @param minBaseQ - * @param minMapQ - * @return - */ - @Override - public ReadBackedPileupImpl getBaseAndMappingFilteredPileup(int minBaseQ, int minMapQ) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getBaseAndMappingFilteredPileup(minBaseQ, minMapQ); - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - - return createNewPileup(loc, filteredTracker); - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - for (PileupElement p : pileupElementTracker) { - if (p.getRead().getMappingQuality() >= minMapQ && (p.isDeletion() || p.getQual() >= minBaseQ)) { - filteredTracker.add(p); - } - } - - return createNewPileup(loc, filteredTracker); - } - } - - /** - * Returns subset of this pileup that contains only bases with quality >= minBaseQ. - * This method allocates and returns a new instance of ReadBackedPileup. - * - * @param minBaseQ - * @return - */ - @Override - public ReadBackedPileup getBaseFilteredPileup(int minBaseQ) { - return getBaseAndMappingFilteredPileup(minBaseQ, -1); - } - - /** - * Returns subset of this pileup that contains only bases coming from reads with mapping quality >= minMapQ. - * This method allocates and returns a new instance of ReadBackedPileup. - * - * @param minMapQ - * @return - */ - @Override - public ReadBackedPileup getMappingFilteredPileup(int minMapQ) { - return getBaseAndMappingFilteredPileup(-1, minMapQ); - } - - /** - * Gets a list of the read groups represented in this pileup. 
- * - * @return - */ - @Override - public Collection getReadGroups() { - Set readGroups = new HashSet(); - for (PileupElement pileupElement : this) - readGroups.add(pileupElement.getRead().getReadGroup().getReadGroupId()); - return readGroups; - } - - /** - * Gets the pileup for a given read group. Horrendously inefficient at this point. - * - * @param targetReadGroupId Identifier for the read group. - * @return A read-backed pileup containing only the reads in the given read group. - */ - @Override - public ReadBackedPileupImpl getPileupForReadGroup(String targetReadGroupId) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroup(targetReadGroupId); - if (pileup != null) - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement p : pileupElementTracker) { - GATKSAMRecord read = p.getRead(); - if (targetReadGroupId != null) { - if (read.getReadGroup() != null && targetReadGroupId.equals(read.getReadGroup().getReadGroupId())) - filteredTracker.add(p); - } else { - if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } - } - - /** - * Gets the pileup for a set of read groups. Horrendously inefficient at this point. - * - * @param rgSet List of identifiers for the read groups. 
- * @return A read-backed pileup containing only the reads in the given read groups. - */ - @Override - public ReadBackedPileupImpl getPileupForReadGroups(final HashSet rgSet) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroups(rgSet); - if (pileup != null) - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement p : pileupElementTracker) { - GATKSAMRecord read = p.getRead(); - if (rgSet != null && !rgSet.isEmpty()) { - if (read.getReadGroup() != null && rgSet.contains(read.getReadGroup().getReadGroupId())) - filteredTracker.add(p); - } else { - if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size() > 0 ? 
createNewPileup(loc, filteredTracker) : null; - } - } - - @Override - public ReadBackedPileupImpl getPileupForLane(String laneID) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForLane(laneID); - if (pileup != null) - filteredTracker.addElements(sample, pileup.pileupElementTracker); - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement p : pileupElementTracker) { - GATKSAMRecord read = p.getRead(); - if (laneID != null) { - if (read.getReadGroup() != null && - (read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different - (read.getReadGroup().getReadGroupId().equals(laneID))) // in case there is no sample identifier, they have to be exactly the same - filteredTracker.add(p); - } else { - if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } - } - - public Collection getSamples() { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - return new HashSet(tracker.getSamples()); - } else { - Collection sampleNames = new HashSet(); - for (PileupElement p : this) { - GATKSAMRecord read = p.getRead(); - String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; - sampleNames.add(sampleName); - } - return sampleNames; - } - } - - /** - * Returns a pileup randomly downsampled to the desiredCoverage. - * - * TODO: delete this once the experimental downsampler stabilizes - * - * @param desiredCoverage - * @return - */ - @Override - public ReadBackedPileup getDownsampledPileup(int desiredCoverage) { - if (getNumberOfElements() <= desiredCoverage) - return this; - - // randomly choose numbers corresponding to positions in the reads list - TreeSet positions = new TreeSet(); - for (int i = 0; i < desiredCoverage; /* no update */) { - if (positions.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(getNumberOfElements()))) - i++; - } - - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - - int current = 0; - UnifiedPileupElementTracker filteredPileup = new UnifiedPileupElementTracker(); - for (PileupElement p : perSampleElements) { - if (positions.contains(current)) - filteredPileup.add(p); - current++; - - } - filteredTracker.addElements(sample, filteredPileup); - } - - return createNewPileup(loc, filteredTracker); - } else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - - Iterator positionIter = positions.iterator(); - - while (positionIter.hasNext()) { - int nextReadToKeep = (Integer) positionIter.next(); - filteredTracker.add(tracker.get(nextReadToKeep)); - } - - return createNewPileup(getLocation(), filteredTracker); - } - } - - @Override - public ReadBackedPileup getPileupForSamples(Collection sampleNames) { - if 
(pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PileupElementTracker filteredElements = tracker.getElements(sampleNames); - return filteredElements != null ? createNewPileup(loc, filteredElements) : null; - } else { - HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement p : pileupElementTracker) { - GATKSAMRecord read = p.getRead(); - if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. - if (read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) - filteredTracker.add(p); - } else { - if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size() > 0 ? 
createNewPileup(loc, filteredTracker) : null; - } - } - - @Override - public Map getPileupsForSamples(Collection sampleNames) { - Map result = new HashMap(); - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - for (String sample : sampleNames) { - PileupElementTracker filteredElements = tracker.getElements(sample); - if (filteredElements != null) - result.put(sample, createNewPileup(loc, filteredElements)); - } - } else { - Map> trackerMap = new HashMap>(); - - for (String sample : sampleNames) { // initialize pileups for each sample - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - trackerMap.put(sample, filteredTracker); - } - for (PileupElement p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup - GATKSAMRecord read = p.getRead(); - if (read.getReadGroup() != null) { - String sample = read.getReadGroup().getSample(); - UnifiedPileupElementTracker tracker = trackerMap.get(sample); - if (tracker != null) // we only add the pileup the requested samples. Completely ignore the rest - tracker.add(p); - } - } - for (Map.Entry> entry : trackerMap.entrySet()) // create the ReadBackedPileup for each sample - result.put(entry.getKey(), createNewPileup(loc, entry.getValue())); - } - return result; - } - - - @Override - public ReadBackedPileup getPileupForSample(String sampleName) { - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - PileupElementTracker filteredElements = tracker.getElements(sampleName); - return filteredElements != null ? 
createNewPileup(loc, filteredElements) : null; - } else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for (PileupElement p : pileupElementTracker) { - GATKSAMRecord read = p.getRead(); - if (sampleName != null) { - if (read.getReadGroup() != null && sampleName.equals(read.getReadGroup().getSample())) - filteredTracker.add(p); - } else { - if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; - } - } - - // -------------------------------------------------------- - // - // iterators - // - // -------------------------------------------------------- - - /** - * The best way to access PileupElements where you only care about the bases and quals in the pileup. - *

- * for (PileupElement p : this) { doSomething(p); } - *

- * Provides efficient iteration of the data. - * - * @return - */ - @Override - public Iterator iterator() { - return new Iterator() { - private final Iterator wrappedIterator = pileupElementTracker.iterator(); - - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - public PileupElement next() { - return wrappedIterator.next(); - } - - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a pileup element iterator"); - } - }; - } - - /** - * The best way to access PileupElements where you only care not only about bases and quals in the pileup - * but also need access to the index of the pileup element in the pile. - * - * for (ExtendedPileupElement p : this) { doSomething(p); } - * - * Provides efficient iteration of the data. - * - * @return - */ - - /** - * Simple useful routine to count the number of deletion bases in this pileup - * - * @return - */ - @Override - public int getNumberOfDeletions() { - if ( nDeletions == UNINITIALIZED_CACHED_INT_VALUE ) { - nDeletions = 0; - for (PileupElement p : pileupElementTracker.unorderedIterable() ) { - if (p.isDeletion()) { - nDeletions++; - } - } - } - return nDeletions; - } - - @Override - public int getNumberOfMappingQualityZeroReads() { - if ( nMQ0Reads == UNINITIALIZED_CACHED_INT_VALUE ) { - nMQ0Reads = 0; - - for (PileupElement p : pileupElementTracker.unorderedIterable()) { - if (p.getRead().getMappingQuality() == 0) { - nMQ0Reads++; - } - } - } - - return nMQ0Reads; - } - - /** - * @return the number of physical elements in this pileup - */ - @Override - public int getNumberOfElements() { - return pileupElementTracker.size(); - } - - /** - * @return the number of abstract elements in this pileup - */ - @Override - public int depthOfCoverage() { - if (depthOfCoverage == UNINITIALIZED_CACHED_INT_VALUE) { - depthOfCoverage = pileupElementTracker.size(); - } - return depthOfCoverage; - } - - /** - * @return true if there are 0 elements in the pileup, false otherwise - 
*/ - @Override - public boolean isEmpty() { - return getNumberOfElements() == 0; - } - - - /** - * @return the location of this pileup - */ - @Override - public GenomeLoc getLocation() { - return loc; - } - - /** - * Get counts of A, C, G, T in order, which returns a int[4] vector with counts according - * to BaseUtils.simpleBaseToBaseIndex for each base. - * - * @return - */ - @Override - public int[] getBaseCounts() { - int[] counts = new int[4]; - - // TODO -- can be optimized with .unorderedIterable() - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - for (final String sample : tracker.getSamples()) { - int[] countsBySample = createNewPileup(loc, tracker.getElements(sample)).getBaseCounts(); - for (int i = 0; i < counts.length; i++) - counts[i] += countsBySample[i]; - } - } else { - for (PileupElement pile : this) { - // skip deletion sites - if (!pile.isDeletion()) { - int index = BaseUtils.simpleBaseToBaseIndex((char) pile.getBase()); - if (index != -1) - counts[index]++; - } - } - } - - return counts; - } - - @Override - public String getPileupString(Character ref) { - // In the pileup format, each line represents a genomic position, consisting of chromosome name, - // coordinate, reference base, read bases, read qualities and alignment mapping qualities. - return String.format("%s %s %c %s %s", - getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate - ref, // reference base - new String(getBases()), - getQualsString()); - } - - // -------------------------------------------------------- - // - // Convenience functions that may be slow - // - // -------------------------------------------------------- - - /** - * Returns a list of the reads in this pileup. 
Note this call costs O(n) and allocates fresh lists each time - * - * @return - */ - @Override - public List getReads() { - List reads = new ArrayList(getNumberOfElements()); - for (PileupElement pile : this) { - reads.add(pile.getRead()); - } - return reads; - } - - @Override - public int getNumberOfDeletionsAfterThisElement() { - int count = 0; - for (PileupElement p : pileupElementTracker.unorderedIterable()) { - if (p.isBeforeDeletionStart()) - count++; - } - return count; - } - - @Override - public int getNumberOfInsertionsAfterThisElement() { - int count = 0; - for (PileupElement p : pileupElementTracker.unorderedIterable()) { - if (p.isBeforeInsertion()) - count++; - } - return count; - - } - /** - * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time - * - * @return - */ - @Override - public List getOffsets() { - List offsets = new ArrayList(getNumberOfElements()); - for (PileupElement pile : pileupElementTracker.unorderedIterable()) { - offsets.add(pile.getOffset()); - } - return offsets; - } - - /** - * Returns an array of the bases in this pileup. Note this call costs O(n) and allocates fresh array each time - * - * @return - */ - @Override - public byte[] getBases() { - byte[] v = new byte[getNumberOfElements()]; - int pos = 0; - for (PileupElement pile : pileupElementTracker) { - v[pos++] = pile.getBase(); - } - return v; - } - - /** - * Returns an array of the quals in this pileup. 
Note this call costs O(n) and allocates fresh array each time - * - * @return - */ - @Override - public byte[] getQuals() { - byte[] v = new byte[getNumberOfElements()]; - int pos = 0; - for (PileupElement pile : pileupElementTracker) { - v[pos++] = pile.getQual(); - } - return v; - } - - /** - * Get an array of the mapping qualities - * - * @return - */ - @Override - public int[] getMappingQuals() { - final int[] v = new int[getNumberOfElements()]; - int pos = 0; - for ( final PileupElement pile : pileupElementTracker ) { - v[pos++] = pile.getRead().getMappingQuality(); - } - return v; - } - - static String quals2String(byte[] quals) { - StringBuilder qualStr = new StringBuilder(); - for (int qual : quals) { - qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea - char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 - qualStr.append(qualChar); - } - - return qualStr.toString(); - } - - private String getQualsString() { - return quals2String(getQuals()); - } - - /** - * Returns a new ReadBackedPileup that is sorted by start coordinate of the reads. - * - * @return - */ - @Override - public ReadBackedPileup getStartSortedPileup() { - - final TreeSet sortedElements = new TreeSet(new Comparator() { - @Override - public int compare(PileupElement element1, PileupElement element2) { - final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); - return difference != 0 ? 
difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); - } - }); - - if (pileupElementTracker instanceof PerSamplePileupElementTracker) { - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; - - for (final String sample : tracker.getSamples()) { - PileupElementTracker perSampleElements = tracker.getElements(sample); - for (PileupElement pile : perSampleElements) - sortedElements.add(pile); - } - } - else { - UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; - for (PileupElement pile : tracker) - sortedElements.add(pile); - } - - UnifiedPileupElementTracker sortedTracker = new UnifiedPileupElementTracker(); - for (PileupElement pile : sortedElements) - sortedTracker.add(pile); - - return createNewPileup(loc, sortedTracker); - } - - @Override - public FragmentCollection toFragments() { - return FragmentUtils.create(this); - } - - @Override - public ReadBackedPileup copy() { - return new ReadBackedPileupImpl(loc, pileupElementTracker.copy()); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRArgumentSet.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRArgumentSet.java deleted file mode 100644 index cc41bc5c6..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRArgumentSet.java +++ /dev/null @@ -1,85 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* 
conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; - -import java.io.File; - -public class BQSRArgumentSet { - // declare public, STL-style for easier and more efficient access: - private File BQSR_RECAL_FILE; - private int quantizationLevels; - private boolean disableIndelQuals; - private boolean emitOriginalQuals; - private int PRESERVE_QSCORES_LESS_THAN; - private double globalQScorePrior; - - public BQSRArgumentSet(final GATKArgumentCollection args) { - this.BQSR_RECAL_FILE = args.BQSR_RECAL_FILE; - this.quantizationLevels = args.quantizationLevels; - this.disableIndelQuals = args.disableIndelQuals; - this.emitOriginalQuals = args.emitOriginalQuals; - this.PRESERVE_QSCORES_LESS_THAN = args.PRESERVE_QSCORES_LESS_THAN; - this.globalQScorePrior = args.globalQScorePrior; - } - - public File getRecalFile() { return BQSR_RECAL_FILE; } - - public int getQuantizationLevels() { return quantizationLevels; } - - public boolean shouldDisableIndelQuals() { return disableIndelQuals; } - - public boolean shouldEmitOriginalQuals() { return emitOriginalQuals; } - - public int getPreserveQscoresLessThan() { return PRESERVE_QSCORES_LESS_THAN; } - - public double getGlobalQScorePrior() { return globalQScorePrior; } - - public void setRecalFile(final File BQSR_RECAL_FILE) { - 
this.BQSR_RECAL_FILE = BQSR_RECAL_FILE; - } - - public void setQuantizationLevels(final int quantizationLevels) { - this.quantizationLevels = quantizationLevels; - } - - public void setDisableIndelQuals(final boolean disableIndelQuals) { - this.disableIndelQuals = disableIndelQuals; - } - - public void setEmitOriginalQuals(final boolean emitOriginalQuals) { - this.emitOriginalQuals = emitOriginalQuals; - } - - public void setPreserveQscoresLessThan(final int PRESERVE_QSCORES_LESS_THAN) { - this.PRESERVE_QSCORES_LESS_THAN = PRESERVE_QSCORES_LESS_THAN; - } - - public void setGlobalQScorePrior(final double globalQScorePrior) { - this.globalQScorePrior = globalQScorePrior; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRMode.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRMode.java deleted file mode 100644 index a742ed452..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/recalibration/BQSRMode.java +++ /dev/null @@ -1,55 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.recalibration; - -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; - -import java.lang.annotation.*; - -/** - * User: hanna - * Date: May 14, 2009 - * Time: 1:51:22 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * Allows the walker to indicate what type of data it wants to consume. - */ - -@Documented -@Inherited -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.TYPE) -public @interface BQSRMode { - public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialGATKSAMFileWriter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialGATKSAMFileWriter.java deleted file mode 100644 index 0821f4604..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialGATKSAMFileWriter.java +++ /dev/null @@ -1,130 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* 
copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.sam; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.ProgressLoggerInterface; -import org.broadinstitute.gatk.engine.io.GATKSAMFileWriter; - -import java.util.ArrayList; -import java.util.List; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - *

- * Class ArtificialGATKSAMFileWriter - *

- * generates a fake samwriter, that you can get the output reads - * from when you're done. - */ -public class ArtificialGATKSAMFileWriter implements GATKSAMFileWriter { - - // are we closed - private boolean closed = false; - - // the SAMRecords we've added to this writer - List records = new ArrayList(); - - public void addAlignment( SAMRecord alignment ) { - records.add(alignment); - } - - public SAMFileHeader getFileHeader() { - if (records.size() > 0) { - return records.get(0).getHeader(); - } - return null; - } - - /** not much to do when we're fake */ - public void close() { - closed = true; - } - - /** - * are we closed? - * - * @return true if we're closed - */ - public boolean isClosed() { - return closed; - } - - /** - * get the records we've seen - * @return - */ - public List getRecords() { - return records; - } - - @Override - public void writeHeader(SAMFileHeader header) { - } - - @Override - public void setPresorted(boolean presorted) { - } - - @Override - public void setMaxRecordsInRam(int maxRecordsInRam) { - } - - /** - * @throws java.lang.UnsupportedOperationException No progress logging in this implementation. 
- */ - @Override - public void setProgressLogger(final ProgressLoggerInterface logger) { - throw new UnsupportedOperationException("Progress logging not supported"); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialMultiSampleReadStream.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialMultiSampleReadStream.java deleted file mode 100644 index 84978c1ef..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialMultiSampleReadStream.java +++ /dev/null @@ -1,87 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.sam; - -import htsjdk.samtools.MergingSamRecordIterator; -import htsjdk.samtools.SamFileHeaderMerger; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileReader; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIteratorAdapter; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.*; - -/** - * Simple wrapper class that multiplexes multiple ArtificialSingleSampleReadStreams into a single stream of reads - * - * @author David Roazen - */ -public class ArtificialMultiSampleReadStream implements Iterable { - - private Collection perSampleArtificialReadStreams; - private MergingSamRecordIterator mergingIterator; - - public ArtificialMultiSampleReadStream( Collection perSampleArtificialReadStreams ) { - if ( perSampleArtificialReadStreams == null || perSampleArtificialReadStreams.isEmpty() ) { - throw new ReviewedGATKException("Can't create an ArtificialMultiSampleReadStream out of 0 ArtificialSingleSampleReadStreams"); - } - - this.perSampleArtificialReadStreams = perSampleArtificialReadStreams; - } - - public Iterator iterator() { - // lazy initialization to prevent reads from being created until they're needed - initialize(); - - return mergingIterator; - } - - public GATKSAMIterator getGATKSAMIterator() { - // lazy initialization to prevent reads from being created until they're needed - initialize(); - - return GATKSAMIteratorAdapter.adapt(mergingIterator); - } - - private void initialize() { - Collection perSampleSAMReaders = new ArrayList(perSampleArtificialReadStreams.size()); - Collection headers = new ArrayList(perSampleArtificialReadStreams.size()); - - for ( ArtificialSingleSampleReadStream readStream : perSampleArtificialReadStreams ) { - Collection thisStreamReads = readStream.makeReads(); - - SAMFileReader reader = new 
ArtificialSAMFileReader(readStream.getHeader(), - thisStreamReads.toArray(new SAMRecord[thisStreamReads.size()])); - perSampleSAMReaders.add(reader); - headers.add(reader.getFileHeader()); - } - - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); - mergingIterator = new MergingSamRecordIterator(headerMerger, perSampleSAMReaders, true); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialPatternedSAMIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialPatternedSAMIterator.java deleted file mode 100644 index 8434e158d..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialPatternedSAMIterator.java +++ /dev/null @@ -1,172 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.sam; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - * - * Class ArtificialPatternedSAMIterator - * - * This class allows you to pattern the artificial sam iterator, asking for reads - * in order or out of order. 
- */ -public class ArtificialPatternedSAMIterator extends ArtificialSAMIterator { - - /** the pattern we're implementing */ - public enum PATTERN { - RANDOM_READS, IN_ORDER_READS; - } - - // our pattern - private final PATTERN mPattern; - - /** - * this is pretty heavy (and it could be extremely heavy, given the amount of reads they request, but it - * allows us to give them each read once, reguardless of the order specified - */ - private final int[] reads; - private final int readCount; - - /** - * create the fake iterator, given the mapping of chromosomes and read counts. If pattern - * is specified to be random, it will generate reads that are randomly placed on the current chromosome - * - * @param startingChr the starting chromosome - * @param endingChr the ending chromosome - * @param readCount the number of reads in each chromosome - * @param header the associated header - * @param pattern the pattern to implement - */ - ArtificialPatternedSAMIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount, SAMFileHeader header, PATTERN pattern ) { - super(startingChr, endingChr, readCount, unmappedReadCount, header); - mPattern = pattern; - this.readCount = readCount; - reads = new int[readCount]; - - for (int x = 0; x < readCount; x++) { - reads[x] = x+1; - } - if (pattern == PATTERN.RANDOM_READS) { - // scramble a bunch of the reads - for (int y = 0; y < readCount; y++) { - int ranOne = (int) Math.round(Math.random() * ( readCount - 1 )); - int ranTwo = (int) Math.round(Math.random() * ( readCount - 1 )); - int temp = reads[ranOne]; - reads[ranOne] = reads[ranTwo]; - reads[ranTwo] = temp; - } - /** - * up to this point there's no garauntee that the random() has made the reads out of order (though it's - * extremely extremely unlikely it's failed). 
Let's make sure there at least out of order: - */ - if (this.reads[0] < this.reads[reads.length - 1]) { - int temp = reads[0]; - reads[0] = reads[reads.length - 1]; - reads[reads.length - 1] = temp; - } - - } - - } - - /** - * override the default ArtificialSAMIterator createNextRead method, which creates the next read - * - * @return - */ - protected boolean createNextRead() { - if (currentRead > rCount) { - currentChromo++; - currentRead = 1; - } - // check for end condition, have we finished the chromosome listing, and have no unmapped reads - if (currentChromo >= eChromosomeCount) { - if (unmappedRemaining < 1) { - this.next = null; - return false; - } else { - ++totalReadCount; - this.next = ArtificialSAMUtils.createArtificialRead(this.header, - String.valueOf(totalReadCount), - SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, - SAMRecord.NO_ALIGNMENT_START, - 50); - --unmappedRemaining; - return true; - } - } - ++totalReadCount; - this.next = getNextRecord(currentRead); - - ++currentRead; - return true; - } - - - /** - * get the next read, given it's index in the chromosome - * - * @param read the read index in the chromosome - * - * @return a SAMRecord - */ - private SAMRecord getNextRecord( int read ) { - if (read > this.readCount) { - return ArtificialSAMUtils.createArtificialRead(this.header, String.valueOf(reads[readCount - 1]), currentChromo, reads[readCount - 1], 50); - } - return ArtificialSAMUtils.createArtificialRead(this.header, String.valueOf(reads[read-1]), currentChromo, reads[read-1], 50); - } - -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialReadsTraversal.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialReadsTraversal.java deleted file mode 100644 index 54c2b873a..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialReadsTraversal.java +++ /dev/null @@ -1,140 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* 
-* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.sam; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.providers.ShardDataProvider; -import org.broadinstitute.gatk.engine.traversals.TraversalEngine; -import org.broadinstitute.gatk.engine.walkers.ReadWalker; -import org.broadinstitute.gatk.engine.walkers.Walker; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - * - * this class acts as a fake reads traversal engine for testing out reads based traversals. 
- */ -public class ArtificialReadsTraversal extends TraversalEngine,ShardDataProvider> { - - public int startingChr = 1; - public int endingChr = 5; - public int readsPerChr = 100; - public int unMappedReads = 1000; - private int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; - private ArtificialPatternedSAMIterator iter; - /** our log, which we want to capture anything from this class */ - protected static Logger logger = Logger.getLogger(ArtificialReadsTraversal.class); - - /** Creates a new, uninitialized ArtificialReadsTraversal */ - public ArtificialReadsTraversal() { - } - - // what read ordering are we using - private ArtificialPatternedSAMIterator.PATTERN readOrder = ArtificialPatternedSAMIterator.PATTERN.IN_ORDER_READS; - - - /** - * set the read ordering of the reads given to the walker - * - * @param readOrdering - */ - public void setReadOrder( ArtificialPatternedSAMIterator.PATTERN readOrdering ) { - readOrder = readOrdering; - } - - @Override - public String getTraversalUnits() { - return "reads"; - } - - /** - * Traverse by reads, given the data and the walker - * - * @param walker the walker to traverse with - * @param dataProvider the provider of the reads data - * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function - * - * @return the reduce variable of the read walker - */ - public T traverse( Walker walker, - ShardDataProvider dataProvider, - T sum ) { - - if (!( walker instanceof ReadWalker )) - throw new IllegalArgumentException("Walker isn't a read walker!"); - - ReadWalker readWalker = (ReadWalker) walker; - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readsPerChr + DEFAULT_READ_LENGTH); - iter = new ArtificialPatternedSAMIterator(this.startingChr, - this.endingChr, - this.readsPerChr, - this.unMappedReads, - header, - this.readOrder); - - // while we still have more reads - for (SAMRecord read : iter) { - - // an 
array of characters that represent the reference - ReferenceContext refSeq = null; - - final boolean keepMeP = readWalker.filter(refSeq, (GATKSAMRecord) read); - if (keepMeP) { - M x = readWalker.map(refSeq, (GATKSAMRecord) read, null); // TODO: fix me at some point, it would be nice to fake out ROD data too - sum = readWalker.reduce(x, sum); - } - } - return sum; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMIterator.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMIterator.java deleted file mode 100644 index b133e9c55..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMIterator.java +++ /dev/null @@ -1,212 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.sam; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; - -import java.util.Iterator; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -/** this fake iterator allows us to look at how specific piles of reads are handled */ -public class ArtificialSAMIterator implements GATKSAMIterator { - - - protected int currentChromo = 0; - protected int currentRead = 1; - protected int totalReadCount = 0; - protected int unmappedRemaining = 0; - protected boolean done = false; - // the next record - protected SAMRecord next = null; - protected SAMFileHeader header = null; - - // the passed in parameters - protected final int sChr; - protected final int eChromosomeCount; - protected final int rCount; - protected final int unmappedReadCount; - - // let us know to make a read, we need this to help out the fake sam query iterator - private boolean initialized = false; - - /** - * Is this iterator currently open or closed? Closed iterators can be reused. - */ - protected boolean open = false; - - /** - * create the fake iterator, given the mapping of chromosomes and read counts - * - * @param startingChr the starting chromosome - * @param endingChr the ending chromosome - * @param readCount the number of reads in each chromosome - * @param header the associated header - */ - ArtificialSAMIterator( int startingChr, int endingChr, int readCount, SAMFileHeader header ) { - sChr = startingChr; - eChromosomeCount = (endingChr - startingChr) + 1; - rCount = readCount; - this.header = header; - unmappedReadCount = 0; - reset(); - } - - protected void reset() { - this.currentChromo = 0; - this.currentRead = 1; - this.totalReadCount = 0; - this.done = false; - this.next = null; - this.initialized = false; - this.unmappedRemaining = unmappedReadCount; - } - - /** - * create the fake iterator, given the mapping of chromosomes and read counts - * - * @param startingChr the starting chromosome - * @param endingChr the ending chromosome - * @param readCount the number of reads in each chromosome - * @param header the associated header - */ - ArtificialSAMIterator( int startingChr, int endingChr, int readCount, int 
unmappedReadCount, SAMFileHeader header ) { - sChr = startingChr; - eChromosomeCount = (endingChr - startingChr) + 1; - rCount = readCount; - this.header = header; - this.currentChromo = 0; - this.unmappedReadCount = unmappedReadCount; - reset(); - } - - public void close() { - open = false; - } - - public boolean hasNext() { - open = true; - - if (!initialized){ - initialized = true; - createNextRead(); - } - if (this.next != null) { - return true; - } - return false; - } - - protected boolean createNextRead() { - if (currentRead > rCount) { - currentChromo++; - currentRead = 1; - } - // check for end condition, have we finished the chromosome listing, and have no unmapped reads - if (currentChromo >= eChromosomeCount) { - if (unmappedRemaining < 1) { - this.next = null; - return false; - } else { - ++totalReadCount; - this.next = ArtificialSAMUtils.createArtificialRead(this.header, - String.valueOf(totalReadCount), - SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, - SAMRecord.NO_ALIGNMENT_START, - 50); - --unmappedRemaining; - return true; - } - } - ++totalReadCount; - this.next = ArtificialSAMUtils.createArtificialRead(this.header, String.valueOf(totalReadCount), currentChromo, currentRead, 50); - ++currentRead; - return true; - } - - - public SAMRecord next() { - open = true; - - SAMRecord ret = next; - createNextRead(); - return ret; - } - - public void remove() { - throw new UnsupportedOperationException("You've tried to remove on a GATKSAMIterator (unsupported), not to mention that this is a fake iterator."); - } - - /** - * return this iterator, for the iterable interface - * @return - */ - public Iterator iterator() { - return this; - } - - /** - * some instrumentation methods - */ - public int readsTaken() { - return totalReadCount; - } - - /** - * peek at the next sam record - * - * @return - */ - public SAMRecord peek() { - return this.next; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtils.java 
b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtils.java deleted file mode 100644 index 7fb43efab..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtils.java +++ /dev/null @@ -1,484 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.sam; - -import htsjdk.samtools.*; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.gatk.utils.pileup.PileupElement; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; - -import java.io.File; -import java.util.*; - -/** - * @author aaron - * @version 1.0 - */ -public class ArtificialSAMUtils { - public static final int DEFAULT_READ_LENGTH = 50; - - /** - * create an artificial sam file - * - * @param filename the filename to write to - * @param numberOfChromosomes the number of chromosomes - * @param startingChromosome where to start counting - * @param chromosomeSize how large each chromosome is - * @param readsPerChomosome how many reads to make in each chromosome. 
They'll be aligned from position 1 to x (which is the number of reads) - */ - public static void createArtificialBamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { - SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); - File outFile = new File(filename); - - SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile); - - for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { - for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) { - out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH)); - } - } - - out.close(); - } - - /** - * create an artificial sam file - * - * @param filename the filename to write to - * @param numberOfChromosomes the number of chromosomes - * @param startingChromosome where to start counting - * @param chromosomeSize how large each chromosome is - * @param readsPerChomosome how many reads to make in each chromosome. 
They'll be aligned from position 1 to x (which is the number of reads) - */ - public static void createArtificialSamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { - SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); - File outFile = new File(filename); - - SAMFileWriter out = new SAMFileWriterFactory().makeSAMWriter(header, false, outFile); - - for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { - for (int readNumber = 1; readNumber <= readsPerChomosome; readNumber++) { - out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, 100)); - } - } - - out.close(); - } - - /** - * Creates an artificial sam header, matching the parameters, chromosomes which will be labeled chr1, chr2, etc - * - * @param numberOfChromosomes the number of chromosomes to create - * @param startingChromosome the starting number for the chromosome (most likely set to 1) - * @param chromosomeSize the length of each chromosome - * @return - */ - public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) { - SAMFileHeader header = new SAMFileHeader(); - header.setSortOrder(htsjdk.samtools.SAMFileHeader.SortOrder.coordinate); - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - // make up some sequence records - for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { - SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */); - rec.setSequenceLength(chromosomeSize); - dict.addSequence(rec); - } - header.setSequenceDictionary(dict); - return header; - } - - /** - * Creates an artificial sam header based on the sequence dictionary dict - * - * @return a new sam header - */ - public static SAMFileHeader createArtificialSamHeader(final SAMSequenceDictionary dict) { - 
SAMFileHeader header = new SAMFileHeader(); - header.setSortOrder(htsjdk.samtools.SAMFileHeader.SortOrder.coordinate); - header.setSequenceDictionary(dict); - return header; - } - - /** - * Creates an artificial sam header with standard test parameters - * - * @return the sam header - */ - public static SAMFileHeader createArtificialSamHeader() { - return createArtificialSamHeader(1, 1, 1000000); - } - - /** - * setup a default read group for a SAMFileHeader - * - * @param header the header to set - * @param readGroupID the read group ID tag - * @param sampleName the sample name - * @return the adjusted SAMFileHeader - */ - public static SAMFileHeader createDefaultReadGroup(SAMFileHeader header, String readGroupID, String sampleName) { - SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupID); - rec.setSample(sampleName); - List readGroups = new ArrayList(); - readGroups.add(rec); - header.setReadGroups(readGroups); - return header; - } - - /** - * setup read groups for the specified read groups and sample names - * - * @param header the header to set - * @param readGroupIDs the read group ID tags - * @param sampleNames the sample names - * @return the adjusted SAMFileHeader - */ - public static SAMFileHeader createEnumeratedReadGroups(SAMFileHeader header, List readGroupIDs, List sampleNames) { - if (readGroupIDs.size() != sampleNames.size()) { - throw new ReviewedGATKException("read group count and sample name count must be the same"); - } - - List readGroups = new ArrayList(); - - int x = 0; - for (; x < readGroupIDs.size(); x++) { - SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupIDs.get(x)); - rec.setSample(sampleNames.get(x)); - readGroups.add(rec); - } - header.setReadGroups(readGroups); - return header; - } - - - /** - * Create an artificial read based on the parameters. 
The cigar string will be *M, where * is the length of the read - * - * @param header the SAM header to associate the read with - * @param name the name of the read - * @param refIndex the reference index, i.e. what chromosome to associate it with - * @param alignmentStart where to start the alignment - * @param length the length of the read - * @return the artificial read - */ - public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { - if ((refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || - (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START)) - throw new ReviewedGATKException("Invalid alignment start for artificial read, start = " + alignmentStart); - GATKSAMRecord record = new GATKSAMRecord(header); - record.setReadName(name); - record.setReferenceIndex(refIndex); - record.setAlignmentStart(alignmentStart); - List elements = new ArrayList(); - elements.add(new CigarElement(length, CigarOperator.characterToEnum('M'))); - record.setCigar(new Cigar(elements)); - record.setProperPairFlag(false); - - // our reads and quals are all 'A's by default - byte[] c = new byte[length]; - byte[] q = new byte[length]; - for (int x = 0; x < length; x++) - c[x] = q[x] = 'A'; - record.setReadBases(c); - record.setBaseQualities(q); - - if (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { - record.setReadUnmappedFlag(true); - } - - return record; - } - - /** - * Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read - * - * @param header the SAM header to associate the read with - * @param name the name of the read - * @param refIndex the reference index, i.e. 
what chromosome to associate it with - * @param alignmentStart where to start the alignment - * @param bases the sequence of the read - * @param qual the qualities of the read - * @return the artificial read - */ - public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual) { - if (bases.length != qual.length) { - throw new ReviewedGATKException("Passed in read string is different length then the quality array"); - } - GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length); - rec.setReadBases(bases); - rec.setBaseQualities(qual); - rec.setReadGroup(new GATKSAMReadGroupRecord("x")); - if (refIndex == -1) { - rec.setReadUnmappedFlag(true); - } - - return rec; - } - - /** - * Create an artificial read based on the parameters - * - * @param header the SAM header to associate the read with - * @param name the name of the read - * @param refIndex the reference index, i.e. 
what chromosome to associate it with - * @param alignmentStart where to start the alignment - * @param bases the sequence of the read - * @param qual the qualities of the read - * @param cigar the cigar string of the read - * @return the artificial read - */ - public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar) { - GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases, qual); - rec.setCigarString(cigar); - return rec; - } - - /** - * Create an artificial read with the following default parameters : - * header: - * numberOfChromosomes = 1 - * startingChromosome = 1 - * chromosomeSize = 1000000 - * read: - * name = "default_read" - * refIndex = 0 - * alignmentStart = 1 - * - * @param bases the sequence of the read - * @param qual the qualities of the read - * @param cigar the cigar string of the read - * @return the artificial read - */ - public static GATKSAMRecord createArtificialRead(byte[] bases, byte[] qual, String cigar) { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); - return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar); - } - - public static GATKSAMRecord createArtificialRead(Cigar cigar) { - int length = cigar.getReadLength(); - byte [] base = {'A'}; - byte [] qual = {30}; - byte [] bases = Utils.arrayFromArrayWithLength(base, length); - byte [] quals = Utils.arrayFromArrayWithLength(qual, length); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); - return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, quals, cigar.toString()); - } - - - public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { - GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, 
readLen); - GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); - - left.setReadPairedFlag(true); - right.setReadPairedFlag(true); - - left.setProperPairFlag(true); - right.setProperPairFlag(true); - - left.setFirstOfPairFlag(leftIsFirst); - right.setFirstOfPairFlag(!leftIsFirst); - - left.setReadNegativeStrandFlag(leftIsNegative); - left.setMateNegativeStrandFlag(!leftIsNegative); - right.setReadNegativeStrandFlag(!leftIsNegative); - right.setMateNegativeStrandFlag(leftIsNegative); - - left.setMateAlignmentStart(right.getAlignmentStart()); - right.setMateAlignmentStart(left.getAlignmentStart()); - - left.setMateReferenceIndex(0); - right.setMateReferenceIndex(0); - - int isize = rightStart + readLen - leftStart; - left.setInferredInsertSize(isize); - right.setInferredInsertSize(-isize); - - return Arrays.asList(left, right); - } - - /** - * Create a collection of identical artificial reads based on the parameters. The cigar string for each - * read will be *M, where * is the length of the read. - * - * Useful for testing things like positional downsampling where you care only about the position and - * number of reads, and not the other attributes. - * - * @param stackSize number of identical reads to create - * @param header the SAM header to associate each read with - * @param name name associated with each read - * @param refIndex the reference index, i.e. 
what chromosome to associate them with - * @param alignmentStart where to start each alignment - * @param length the length of each read - * - * @return a collection of stackSize reads all sharing the above properties - */ - public static Collection createStackOfIdenticalArtificialReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { - Collection stack = new ArrayList(stackSize); - for ( int i = 1; i <= stackSize; i++ ) { - stack.add(createArtificialRead(header, name, refIndex, alignmentStart, length)); - } - return stack; - } - - /** - * create an iterator containing the specified read piles - * - * @param startingChr the chromosome (reference ID) to start from - * @param endingChr the id to end with - * @param readCount the number of reads per chromosome - * @return GATKSAMIterator representing the specified amount of fake data - */ - public static GATKSAMIterator mappedReadIterator(int startingChr, int endingChr, int readCount) { - SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - - return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); - } - - /** - * create an iterator containing the specified read piles - * - * @param startingChr the chromosome (reference ID) to start from - * @param endingChr the id to end with - * @param readCount the number of reads per chromosome - * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file - * @return GATKSAMIterator representing the specified amount of fake data - */ - public static GATKSAMIterator mappedAndUnmappedReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { - SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - - return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 
unmappedReadCount, header); - } - - /** - * create an ArtificialSAMQueryIterator containing the specified read piles - * - * @param startingChr the chromosome (reference ID) to start from - * @param endingChr the id to end with - * @param readCount the number of reads per chromosome - * @return GATKSAMIterator representing the specified amount of fake data - */ - public static ArtificialSAMQueryIterator queryReadIterator(int startingChr, int endingChr, int readCount) { - SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - - return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); - } - - /** - * create an ArtificialSAMQueryIterator containing the specified read piles - * - * @param startingChr the chromosome (reference ID) to start from - * @param endingChr the id to end with - * @param readCount the number of reads per chromosome - * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file - * @return GATKSAMIterator representing the specified amount of fake data - */ - public static GATKSAMIterator queryReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { - SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - - return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); - } - - /** - * Create an iterator containing the specified reads - * - * @param reads the reads - * @return iterator for the reads - */ - public static GATKSAMIterator createReadIterator(SAMRecord... 
reads) { - return createReadIterator(Arrays.asList(reads)); - } - - /** - * Create an iterator containing the specified reads - * - * @param reads the reads - * @return iterator for the reads - */ - public static GATKSAMIterator createReadIterator(List reads) { - final Iterator iter = reads.iterator(); - return new GATKSAMIterator() { - @Override public void close() {} - @Override public Iterator iterator() { return iter; } - @Override public boolean hasNext() { return iter.hasNext(); } - @Override public SAMRecord next() { return iter.next(); } - @Override public void remove() { iter.remove(); } - }; - } - - private final static int ranIntInclusive(Random ran, int start, int stop) { - final int range = stop - start; - return ran.nextInt(range) + start; - } - - /** - * Creates a read backed pileup containing up to pileupSize reads at refID 0 from header at loc with - * reads created that have readLen bases. Pairs are sampled from a gaussian distribution with mean insert - * size of insertSize and variation of insertSize / 10. The first read will be in the pileup, and the second - * may be, depending on where this sampled insertSize puts it. 
- * - * @param header - * @param loc - * @param readLen - * @param insertSize - * @param pileupSize - * @return - */ - public static ReadBackedPileup createReadBackedPileup(final SAMFileHeader header, final GenomeLoc loc, final int readLen, final int insertSize, final int pileupSize) { - final Random ran = new Random(); - final boolean leftIsFirst = true; - final boolean leftIsNegative = false; - final int insertSizeVariation = insertSize / 10; - final int pos = loc.getStart(); - - final List pileupElements = new ArrayList(); - for (int i = 0; i < pileupSize / 2; i++) { - final String readName = "read" + i; - final int leftStart = ranIntInclusive(ran, 1, pos); - final int fragmentSize = (int) (ran.nextGaussian() * insertSizeVariation + insertSize); - final int rightStart = leftStart + fragmentSize - readLen; - - if (rightStart <= 0) continue; - - List pair = createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); - final GATKSAMRecord left = pair.get(0); - final GATKSAMRecord right = pair.get(1); - - pileupElements.add(LocusIteratorByState.createPileupForReadAndOffset(left, pos - leftStart)); - - if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) { - pileupElements.add(LocusIteratorByState.createPileupForReadAndOffset(right, pos - rightStart)); - } - } - - Collections.sort(pileupElements); - return new ReadBackedPileupImpl(loc, pileupElements); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStream.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStream.java deleted file mode 100644 index 27e25d39c..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStream.java +++ /dev/null @@ -1,213 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy 
of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.sam; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIteratorAdapter; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; - -/** - * An artificial stream of reads from a single read group/sample with configurable characteristics - * such as: - * - * -the number of contigs that the reads should be distributed across - * -number of "stacks" of reads sharing the same alignment start position per contig - * -the min/max number of reads in each stack (exact values chosen randomly from this range) - * -the min/max distance between stack start positions (exact values chosen randomly from this range) - * -the min/max length of each read (exact 
values chosen randomly from this range) - * -the number of unmapped reads - * - * The cigar string for all reads will be *M, where * is the length of the read. - * - * @author David Roazen - */ -public class ArtificialSingleSampleReadStream implements Iterable { - private SAMFileHeader header; - private String readGroupID; - private int numContigs; - private int numStacksPerContig; - private int minReadsPerStack; - private int maxReadsPerStack; - private int minDistanceBetweenStacks; - private int maxDistanceBetweenStacks; - private int minReadLength; - private int maxReadLength; - private int numUnmappedReads; - - private static final String READ_GROUP_TAG = "RG"; - - public ArtificialSingleSampleReadStream( SAMFileHeader header, - String readGroupID, - int numContigs, - int numStacksPerContig, - int minReadsPerStack, - int maxReadsPerStack, - int minDistanceBetweenStacks, - int maxDistanceBetweenStacks, - int minReadLength, - int maxReadLength, - int numUnmappedReads ) { - this.header = header; - this.readGroupID = readGroupID; - this.numContigs = numContigs; - this.numStacksPerContig = numStacksPerContig; - this.minReadsPerStack = minReadsPerStack; - this.maxReadsPerStack = maxReadsPerStack; - this.minDistanceBetweenStacks = minDistanceBetweenStacks; - this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; - this.minReadLength = minReadLength; - this.maxReadLength = maxReadLength; - this.numUnmappedReads = numUnmappedReads; - - validateStreamParameters(); - } - - private void validateStreamParameters() { - if ( header == null || readGroupID == null ) { - throw new ReviewedGATKException("null SAMFileHeader or read group ID") ; - } - - if ( header.getReadGroup(readGroupID) == null ) { - throw new ReviewedGATKException(String.format("Read group %s not found in SAMFileHeader", readGroupID)); - } - - if ( numContigs < 0 || numStacksPerContig < 0 || minReadsPerStack < 0 || maxReadsPerStack < 0 || - minDistanceBetweenStacks < 0 || maxDistanceBetweenStacks < 0 || 
minReadLength < 0 || maxReadLength < 0 || - numUnmappedReads < 0 ) { - throw new ReviewedGATKException("Read stream parameters must be >= 0"); - } - - if ( (numContigs == 0 && numStacksPerContig != 0) || (numContigs != 0 && numStacksPerContig == 0) ) { - throw new ReviewedGATKException("numContigs and numStacksPerContig must either both be > 0, or both be 0"); - } - - if ( minReadsPerStack > maxReadsPerStack ) { - throw new ReviewedGATKException("minReadsPerStack > maxReadsPerStack"); - } - - if ( minDistanceBetweenStacks > maxDistanceBetweenStacks ) { - throw new ReviewedGATKException("minDistanceBetweenStacks > maxDistanceBetweenStacks"); - } - - if ( minReadLength > maxReadLength ) { - throw new ReviewedGATKException("minReadLength > maxReadLength"); - } - } - - public Iterator iterator() { - return makeReads().iterator(); - } - - public GATKSAMIterator getGATKSAMIterator() { - return GATKSAMIteratorAdapter.adapt(iterator()); - } - - public Collection makeReads() { - Collection reads = new ArrayList(numContigs * numStacksPerContig * maxReadsPerStack); - - for ( int contig = 0; contig < numContigs; contig++ ) { - int alignmentStart = 1; - - for ( int stack = 0; stack < numStacksPerContig; stack++ ) { - reads.addAll(makeReadStack(contig, alignmentStart, MathUtils.randomIntegerInRange(minReadsPerStack, maxReadsPerStack))); - alignmentStart += MathUtils.randomIntegerInRange(minDistanceBetweenStacks, maxDistanceBetweenStacks); - } - } - - if ( numUnmappedReads > 0 ) { - reads.addAll(makeReadStack(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START, numUnmappedReads)); - } - - return reads; - } - - private Collection makeReadStack( int contig, int alignmentStart, int stackSize ) { - Collection readStack = new ArrayList(stackSize); - - for ( int i = 0; i < stackSize; i++ ) { - SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, - "foo", - contig, - alignmentStart, - MathUtils.randomIntegerInRange(minReadLength, maxReadLength)); - 
read.setAttribute(READ_GROUP_TAG, readGroupID); - readStack.add(read); - } - - return readStack; - } - - public SAMFileHeader getHeader() { - return header; - } - - public String getReadGroupID() { - return readGroupID; - } - - public int getNumContigs() { - return numContigs; - } - - public int getNumStacksPerContig() { - return numStacksPerContig; - } - - public int getMinReadsPerStack() { - return minReadsPerStack; - } - - public int getMaxReadsPerStack() { - return maxReadsPerStack; - } - - public int getMinDistanceBetweenStacks() { - return minDistanceBetweenStacks; - } - - public int getMaxDistanceBetweenStacks() { - return maxDistanceBetweenStacks; - } - - public int getMinReadLength() { - return minReadLength; - } - - public int getMaxReadLength() { - return maxReadLength; - } - - public int getNumUnmappedReads() { - return numUnmappedReads; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/BySampleSAMFileWriter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/BySampleSAMFileWriter.java deleted file mode 100644 index e212bd909..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/BySampleSAMFileWriter.java +++ /dev/null @@ -1,70 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.sam; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMProgramRecord; -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.util.HashMap; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. - * User: carneiro - * Date: Nov 13 - */ -public class BySampleSAMFileWriter extends NWaySAMFileWriter{ - - private final Map sampleToWriterMap; - - public BySampleSAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { - super(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, pRecord, keep_records); - - sampleToWriterMap = new HashMap(toolkit.getSAMFileHeader().getReadGroups().size() * 2); - - for (SAMReaderID readerID : toolkit.getReadsDataSource().getReaderIDs()) { - for (SAMReadGroupRecord rg : toolkit.getReadsDataSource().getHeader(readerID).getReadGroups()) { - String sample = rg.getSample(); - if (sampleToWriterMap.containsKey(sample) && sampleToWriterMap.get(sample) != readerID) { - throw new ReviewedGATKException("The same sample appears in multiple files, this input cannot be multiplexed using the BySampleSAMFileWriter, try 
NWaySAMFileWriter instead."); - } - else { - sampleToWriterMap.put(sample, readerID); - } - } - } - } - - @Override - public void addAlignment(SAMRecord samRecord) { - super.addAlignment(samRecord, sampleToWriterMap.get(samRecord.getReadGroup().getSample())); - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/MisencodedBaseQualityReadTransformer.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/MisencodedBaseQualityReadTransformer.java deleted file mode 100644 index 35146f0b8..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/MisencodedBaseQualityReadTransformer.java +++ /dev/null @@ -1,94 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.sam; - -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -/** - * Checks for and errors out (or fixes if requested) when it detects reads with base qualities that are not encoded with - * phred-scaled quality scores. Q0 == ASCII 33 according to the SAM specification, whereas Illumina encoding starts at - * Q64. The idea here is simple: if we are asked to fix the scores then we just subtract 31 from every quality score. - * Otherwise, we randomly sample reads (for efficiency) and error out if we encounter a qual that's too high. - */ -public class MisencodedBaseQualityReadTransformer extends ReadTransformer { - - private static final int samplingFrequency = 1000; // sample 1 read for every 1000 encountered - private static final int encodingFixValue = 31; // Illumina_64 - PHRED_33 - - private boolean disabled; - private boolean fixQuals; - protected static int currentReadCounter = 0; - - @Override - public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { - fixQuals = engine.getArguments().FIX_MISENCODED_QUALS; - disabled = !fixQuals && engine.getArguments().ALLOW_POTENTIALLY_MISENCODED_QUALS; - - return ReadTransformer.ApplicationTime.ON_INPUT; - } - - @Override - public boolean enabled() { - return !disabled; - } - - @Override - public GATKSAMRecord apply(final GATKSAMRecord read) { - if ( fixQuals ) - return fixMisencodedQuals(read); - - checkForMisencodedQuals(read); - return read; - } - - protected static GATKSAMRecord fixMisencodedQuals(final GATKSAMRecord read) { - final byte[] quals = read.getBaseQualities(); - for ( int i = 0; i < quals.length; i++ ) { - quals[i] -= encodingFixValue; - if ( quals[i] < 0 ) - throw new 
UserException.BadInput("while fixing mis-encoded base qualities we encountered a read that was correctly encoded; we cannot handle such a mixture of reads so unfortunately the BAM must be fixed with some other tool"); - } - read.setBaseQualities(quals); - return read; - } - - protected static void checkForMisencodedQuals(final GATKSAMRecord read) { - // sample reads randomly for checking - if ( ++currentReadCounter >= samplingFrequency ) { - currentReadCounter = 0; - - final byte[] quals = read.getBaseQualities(); - for ( final byte qual : quals ) { - if ( qual > QualityUtils.MAX_REASONABLE_Q_SCORE ) - throw new UserException.MisencodedBAM(read, "we encountered an extremely high quality score of " + (int)qual); - } - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/NWaySAMFileWriter.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/NWaySAMFileWriter.java deleted file mode 100644 index abf70d5a4..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/NWaySAMFileWriter.java +++ /dev/null @@ -1,185 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.sam; - -import htsjdk.samtools.*; -import htsjdk.samtools.util.ProgressLoggerInterface; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.exceptions.GATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: May 31, 2011 - * Time: 3:52:49 PM - * To change this template use File | Settings | File Templates. - */ -public class NWaySAMFileWriter implements SAMFileWriter { - - private Map writerMap = null; - private boolean presorted ; - GenomeAnalysisEngine toolkit; - boolean KEEP_ALL_PG_RECORDS = false; - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { - this.presorted = presorted; - this.toolkit = toolkit; - this.KEEP_ALL_PG_RECORDS = keep_records; - writerMap = new HashMap(); - setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { - this.presorted = presorted; - this.toolkit = toolkit; - this.KEEP_ALL_PG_RECORDS = keep_records; - writerMap = new HashMap(); - setupByReader(toolkit,ext,order, 
presorted, indexOnTheFly, generateMD5, pRecord); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5) { - this(toolkit, in2out, order, presorted, indexOnTheFly, generateMD5, null,false); - } - - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly , boolean generateMD5) { - this(toolkit, ext, order, presorted, indexOnTheFly, generateMD5, null,false); - } - - /** - * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved - * from toolkit). The in2out map must contain an entry for each input filename and map it - * onto a unique output file name. - * @param toolkit - * @param in2out - */ - public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { - if ( in2out==null ) throw new GATKException("input-output bam filename map for n-way-out writing is NULL"); - for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { - - String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); - - String outName; - if ( ! in2out.containsKey(fName) ) - throw new UserException.BadInput("Input-output bam filename map does not contain an entry for the input file "+fName); - outName = in2out.get(fName); - - if ( writerMap.containsKey( rid ) ) - throw new GATKException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+ - "map file likely contains multiple entries for this input file"); - - addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - } - - /** - * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved - * from toolkit). 
The output file names will be generated automatically by stripping ".sam" or ".bam" off the - * input file name and adding ext instead (e.g. ".cleaned.bam"). - * onto a unique output file name. - * @param toolkit - * @param ext - */ - public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, - boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { - for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { - - String fName = toolkit.getReadsDataSource().getSAMFile(rid).getName(); - - String outName; - int pos ; - if ( fName.toUpperCase().endsWith(".BAM") ) pos = fName.toUpperCase().lastIndexOf(".BAM"); - else { - if ( fName.toUpperCase().endsWith(".SAM") ) pos = fName.toUpperCase().lastIndexOf(".SAM"); - else throw new UserException.BadInput("Input file name "+fName+" does not end with .sam or .bam"); - } - String prefix = fName.substring(0,pos); - outName = prefix+ext; - - if ( writerMap.containsKey( rid ) ) - throw new GATKException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); - addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); - } - - } - - private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, - boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { - File f = new File(outName); - SAMFileHeader header = Utils.setupWriter(toolkit.getSAMFileHeader(id), programRecord); - SAMFileWriterFactory factory = new SAMFileWriterFactory(); - factory.setCreateIndex(indexOnTheFly); - factory.setCreateMd5File(generateMD5); - SAMFileWriter sw = factory.makeSAMOrBAMWriter(header, presorted, f); - writerMap.put(id,sw); - } - - public Collection getWriters() { - return writerMap.values(); - } - - public void addAlignment(SAMRecord samRecord) { - final SAMReaderID id = toolkit.getReaderIDForRead(samRecord); - String rg = samRecord.getStringAttribute("RG"); - 
if ( rg != null ) { - String rg_orig = toolkit.getReadsDataSource().getOriginalReadGroupId(rg); - samRecord.setAttribute("RG",rg_orig); - } - addAlignment(samRecord, id); - } - - public void addAlignment(SAMRecord samRecord, SAMReaderID readerID) { - writerMap.get(readerID).addAlignment(samRecord); - } - - public SAMFileHeader getFileHeader() { - return toolkit.getSAMFileHeader(); - } - - public void close() { - for ( SAMFileWriter w : writerMap.values() ) w.close(); - } - - @Override - public void setProgressLogger(final ProgressLoggerInterface logger) { - for (final SAMFileWriter writer: writerMap.values()) { - writer.setProgressLogger(logger); - } - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java deleted file mode 100644 index 7fc1b40f7..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java +++ /dev/null @@ -1,964 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.sam; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.samtools.*; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.io.stubs.SAMFileWriterStub; -import org.broadinstitute.gatk.utils.*; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.io.File; -import java.util.*; - -/** - * A miscellaneous collection of utilities for working with SAM files, headers, etc. - * Static methods only, please. - * - * @author mhanna - * @version 0.1 - */ -public class ReadUtils { - private final static Logger logger = Logger.getLogger(ReadUtils.class); - - private static final String OFFSET_OUT_OF_BOUNDS_EXCEPTION = "Offset cannot be greater than read length %d : %d"; - private static final String OFFSET_NOT_ZERO_EXCEPTION = "We ran past the end of the read and never found the offset, something went wrong!"; - - private ReadUtils() { - } - - private static final int DEFAULT_ADAPTOR_SIZE = 100; - public static final int CLIPPING_GOAL_NOT_REACHED = -1; - - /** - * A marker to tell which end of the read has been clipped - */ - public enum ClippingTail { - LEFT_TAIL, - RIGHT_TAIL - } - - /** - * A HashMap of the SAM spec read flag names - * - * Note: This is not being used right now, but can be useful in the future - */ - private static final Map readFlagNames = new HashMap(); - - static { - readFlagNames.put(0x1, "Paired"); - readFlagNames.put(0x2, "Proper"); - readFlagNames.put(0x4, "Unmapped"); - readFlagNames.put(0x8, "MateUnmapped"); - 
readFlagNames.put(0x10, "Forward"); - //readFlagNames.put(0x20, "MateForward"); - readFlagNames.put(0x40, "FirstOfPair"); - readFlagNames.put(0x80, "SecondOfPair"); - readFlagNames.put(0x100, "NotPrimary"); - readFlagNames.put(0x200, "NON-PF"); - readFlagNames.put(0x400, "Duplicate"); - } - - /** - * This enum represents all the different ways in which a read can overlap an interval. - * - * NO_OVERLAP_CONTIG: - * read and interval are in different contigs. - * - * NO_OVERLAP_LEFT: - * the read does not overlap the interval. - * - * |----------------| (interval) - * <----------------> (read) - * - * NO_OVERLAP_RIGHT: - * the read does not overlap the interval. - * - * |----------------| (interval) - * <----------------> (read) - * - * OVERLAP_LEFT: - * the read starts before the beginning of the interval but ends inside of it - * - * |----------------| (interval) - * <----------------> (read) - * - * OVERLAP_RIGHT: - * the read starts inside the interval but ends outside of it - * - * |----------------| (interval) - * <----------------> (read) - * - * OVERLAP_LEFT_AND_RIGHT: - * the read starts before the interval and ends after the interval - * - * |-----------| (interval) - * <-------------------> (read) - * - * OVERLAP_CONTAINED: - * the read starts and ends inside the interval - * - * |----------------| (interval) - * <--------> (read) - */ - public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} - - /** - * Creates a SAMFileWriter using all of the features currently set in the engine (command line arguments, ReadTransformers, etc) - * @param file the filename to write to - * @param engine the engine - * @return a SAMFileWriter with the correct options set - */ - public static SAMFileWriter createSAMFileWriter(final String file, final GenomeAnalysisEngine engine) { - final SAMFileWriterStub output 
= new SAMFileWriterStub(engine, new File(file)); - output.processArguments(engine.getArguments()); - return output; - } - - /** - * As {@link #createSAMFileWriter(String, org.broadinstitute.gatk.engine.GenomeAnalysisEngine)}, but also sets the header - */ - public static SAMFileWriter createSAMFileWriter(final String file, final GenomeAnalysisEngine engine, final SAMFileHeader header) { - final SAMFileWriterStub output = (SAMFileWriterStub) createSAMFileWriter(file, engine); - output.writeHeader(header); - return output; - } - - /** - * is this base inside the adaptor of the read? - * - * There are two cases to treat here: - * - * 1) Read is in the negative strand => Adaptor boundary is on the left tail - * 2) Read is in the positive strand => Adaptor boundary is on the right tail - * - * Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event) - * - * @param read the read to test - * @param basePos base position in REFERENCE coordinates (not read coordinates) - * @return whether or not the base is in the adaptor - */ - public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) { - final int adaptorBoundary = read.getAdaptorBoundary(); - if (adaptorBoundary == CANNOT_COMPUTE_ADAPTOR_BOUNDARY || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) - return false; - - return read.getReadNegativeStrandFlag() ? basePos <= adaptorBoundary : basePos >= adaptorBoundary; - } - - /** - * Finds the adaptor boundary around the read and returns the first base inside the adaptor that is closest to - * the read boundary. If the read is in the positive strand, this is the first base after the end of the - * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the - * beginning of the fragment. 
- * - * There are two cases we need to treat here: - * - * 1) Our read is in the reverse strand : - * - * <----------------------| * - * |---------------------> - * - * in these cases, the adaptor boundary is at the mate start (minus one) - * - * 2) Our read is in the forward strand : - * - * |----------------------> * - * <----------------------| - * - * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) - * - * @param read the read being tested for the adaptor boundary - * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. - * CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig. - */ - public static int getAdaptorBoundary(final SAMRecord read) { - if ( ! hasWellDefinedFragmentSize(read) ) { - return CANNOT_COMPUTE_ADAPTOR_BOUNDARY; - } else if ( read.getReadNegativeStrandFlag() ) { - return read.getMateAlignmentStart() - 1; // case 1 (see header) - } else { - final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) - return read.getAlignmentStart() + insertSize + 1; // case 2 (see header) - } - } - - public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE; - - /** - * Can the adaptor sequence of read be reliably removed from the read based on the alignment of - * read and its mate? - * - * @param read the read to check - * @return true if it can, false otherwise - */ - public static boolean hasWellDefinedFragmentSize(final SAMRecord read) { - if ( read.getInferredInsertSize() == 0 ) - // no adaptors in reads with mates in another chromosome or unmapped pairs - return false; - if ( ! 
read.getReadPairedFlag() ) - // only reads that are paired can be adaptor trimmed - return false; - if ( read.getReadUnmappedFlag() || read.getMateUnmappedFlag() ) - // only reads when both reads are mapped can be trimmed - return false; -// if ( ! read.getProperPairFlag() ) -// // note this flag isn't always set properly in BAMs, can will stop us from eliminating some proper pairs -// // reads that aren't part of a proper pair (i.e., have strange alignments) can't be trimmed -// return false; - if ( read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag() ) - // sanity check on getProperPairFlag to ensure that read1 and read2 aren't on the same strand - return false; - - if ( read.getReadNegativeStrandFlag() ) { - // we're on the negative strand, so our read runs right to left - return read.getAlignmentEnd() > read.getMateAlignmentStart(); - } else { - // we're on the positive strand, so our mate should be to our right (his start + insert size should be past our start) - return read.getAlignmentStart() <= read.getMateAlignmentStart() + read.getInferredInsertSize(); - } - } - - /** - * is the read a 454 read? - * - * @param read the read to test - * @return checks the read group tag PL for the default 454 tag - */ - public static boolean is454Read(GATKSAMRecord read) { - return NGSPlatform.fromRead(read) == NGSPlatform.LS454; - } - - /** - * is the read an IonTorrent read? - * - * @param read the read to test - * @return checks the read group tag PL for the default ion tag - */ - public static boolean isIonRead(GATKSAMRecord read) { - return NGSPlatform.fromRead(read) == NGSPlatform.ION_TORRENT; - } - - /** - * is the read a SOLiD read? - * - * @param read the read to test - * @return checks the read group tag PL for the default SOLiD tag - */ - public static boolean isSOLiDRead(GATKSAMRecord read) { - return NGSPlatform.fromRead(read) == NGSPlatform.SOLID; - } - - /** - * is the read a SLX read? 
- * - * @param read the read to test - * @return checks the read group tag PL for the default SLX tag - */ - public static boolean isIlluminaRead(GATKSAMRecord read) { - return NGSPlatform.fromRead(read) == NGSPlatform.ILLUMINA; - } - - /** - * checks if the read has a platform tag in the readgroup equal to 'name'. - * Assumes that 'name' is upper-cased. - * - * @param read the read to test - * @param name the upper-cased platform name to test - * @return whether or not name == PL tag in the read group of read - */ - public static boolean isPlatformRead(GATKSAMRecord read, String name) { - - SAMReadGroupRecord readGroup = read.getReadGroup(); - if (readGroup != null) { - Object readPlatformAttr = readGroup.getAttribute("PL"); - if (readPlatformAttr != null) - return readPlatformAttr.toString().toUpperCase().contains(name); - } - return false; - } - - - /** - * Returns the collections of reads sorted in coordinate order, according to the order defined - * in the reads themselves - * - * @param reads - * @return - */ - public final static List sortReadsByCoordinate(List reads) { - final SAMRecordComparator comparer = new SAMRecordCoordinateComparator(); - Collections.sort(reads, comparer); - return reads; - } - - /** - * If a read starts in INSERTION, returns the first element length. - * - * Warning: If the read has Hard or Soft clips before the insertion this function will return 0. - * - * @param read - * @return the length of the first insertion, or 0 if there is none (see warning). - */ - public final static int getFirstInsertionOffset(SAMRecord read) { - CigarElement e = read.getCigar().getCigarElement(0); - if ( e.getOperator() == CigarOperator.I ) - return e.getLength(); - else - return 0; - } - - /** - * If a read ends in INSERTION, returns the last element length. - * - * Warning: If the read has Hard or Soft clips after the insertion this function will return 0. 
- * - * @param read - * @return the length of the last insertion, or 0 if there is none (see warning). - */ - public final static int getLastInsertionOffset(SAMRecord read) { - CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); - if ( e.getOperator() == CigarOperator.I ) - return e.getLength(); - else - return 0; - } - - /** - * Determines what is the position of the read in relation to the interval. - * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. - * @param read the read - * @param interval the interval - * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) - */ - public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(GATKSAMRecord read, GenomeLoc interval) { - - int sStart = read.getSoftStart(); - int sStop = read.getSoftEnd(); - int uStart = read.getUnclippedStart(); - int uStop = read.getUnclippedEnd(); - - if ( !read.getReferenceName().equals(interval.getContig()) ) - return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; - - else if ( uStop < interval.getStart() ) - return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; - - else if ( uStart > interval.getStop() ) - return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; - - else if ( sStop < interval.getStart() ) - return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT; - - else if ( sStart > interval.getStop() ) - return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT; - - else if ( (sStart >= interval.getStart()) && - (sStop <= interval.getStop()) ) - return ReadAndIntervalOverlap.OVERLAP_CONTAINED; - - else if ( (sStart < interval.getStart()) && - (sStop > interval.getStop()) ) - return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; - - else if ( (sStart < interval.getStart()) ) - return ReadAndIntervalOverlap.OVERLAP_LEFT; - - else - return ReadAndIntervalOverlap.OVERLAP_RIGHT; - } - - /** - * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) to take care of - * two corner cases: - 
* - * 1. If clipping the right tail (end of the read) getReadCoordinateForReferenceCoordinate and fall inside - * a deletion return the base after the deletion. If clipping the left tail (beginning of the read) it - * doesn't matter because it already returns the previous base by default. - * - * 2. If clipping the left tail (beginning of the read) getReadCoordinateForReferenceCoordinate and the - * read starts with an insertion, and you're requesting the first read based coordinate, it will skip - * the leading insertion (because it has the same reference coordinate as the following base). - * - * @param read - * @param refCoord - * @param tail - * @return the read coordinate corresponding to the requested reference coordinate for clipping. - */ - @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"}) - @Ensures({"result >= 0", "result < read.getReadLength()"}) - public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) { - return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, tail, false); - } - - public static int getReadCoordinateForReferenceCoordinateUpToEndOfRead(GATKSAMRecord read, int refCoord, ClippingTail tail) { - final int leftmostSafeVariantPosition = Math.max(read.getSoftStart(), refCoord); - return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), leftmostSafeVariantPosition, tail, false); - } - - public static int getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) { - Pair result = getReadCoordinateForReferenceCoordinate(alignmentStart, cigar, refCoord, allowGoalNotReached); - int readCoord = result.getFirst(); - - // Corner case one: clipping the right tail and falls on deletion, move to the next - // read coordinate. 
It is not a problem for the left tail because the default answer - // from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate. - if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL) - readCoord++; - - // clipping the left tail and first base is insertion, go to the next read coordinate - // with the same reference coordinate. Advance to the next cigar element, or to the - // end of the read if there is no next element. - final CigarElement firstElementIsInsertion = readStartsWithInsertion(cigar); - if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion != null) - readCoord = Math.min(firstElementIsInsertion.getLength(), cigar.getReadLength() - 1); - - return readCoord; - } - - /** - * Returns the read coordinate corresponding to the requested reference coordinate. - * - * WARNING: if the requested reference coordinate happens to fall inside or just before a deletion (or skipped region) in the read, this function - * will return the last read base before the deletion (or skipped region). This function returns a - * Pair(int readCoord, boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion) so you can choose which readCoordinate to use when faced with - * a deletion (or skipped region). - * - * SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a - * pre-processed result according to normal clipping needs. Or you can use this function and tailor the - * behavior to your needs. - * - * @param read - * @param refCoord the requested reference coordinate - * @return the read coordinate corresponding to the requested reference coordinate. (see warning!) - */ - @Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"}) - @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) - //TODO since we do not have contracts any more, should we check for the requirements in the method code? 
- public static Pair getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) { - return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, false); - } - - public static Pair getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final boolean allowGoalNotReached) { - int readBases = 0; - int refBases = 0; - boolean fallsInsideDeletionOrSkippedRegion = false; - boolean endJustBeforeDeletionOrSkippedRegion = false; - boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = false; - - final int goal = refCoord - alignmentStart; // The goal is to move this many reference bases - if (goal < 0) { - if (allowGoalNotReached) { - return new Pair(CLIPPING_GOAL_NOT_REACHED, false); - } else { - throw new ReviewedGATKException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); - } - } - boolean goalReached = refBases == goal; - - Iterator cigarElementIterator = cigar.getCigarElements().iterator(); - while (!goalReached && cigarElementIterator.hasNext()) { - final CigarElement cigarElement = cigarElementIterator.next(); - int shift = 0; - - if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { - if (refBases + cigarElement.getLength() < goal) - shift = cigarElement.getLength(); - else - shift = goal - refBases; - - refBases += shift; - } - goalReached = refBases == goal; - - if (!goalReached && cigarElement.getOperator().consumesReadBases()) - readBases += cigarElement.getLength(); - - if (goalReached) { - // Is this base's reference position within this cigar element? Or did we use it all? - final boolean endsWithinCigar = shift < cigarElement.getLength(); - - // If it isn't, we need to check the next one. There should *ALWAYS* be a next one - // since we checked if the goal coordinate is within the read length, so this is just a sanity check. 
- if (!endsWithinCigar && !cigarElementIterator.hasNext()) { - if (allowGoalNotReached) { - return new Pair(CLIPPING_GOAL_NOT_REACHED, false); - } else { - throw new ReviewedGATKException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); - } - } - - CigarElement nextCigarElement = null; - - // if we end inside the current cigar element, we just have to check if it is a deletion (or skipped region) - if (endsWithinCigar) - fallsInsideDeletionOrSkippedRegion = (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) ; - - // if we end outside the current cigar element, we need to check if the next element is an insertion, deletion or skipped region. - else { - nextCigarElement = cigarElementIterator.next(); - - // if it's an insertion, we need to clip the whole insertion before looking at the next element - if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { - readBases += nextCigarElement.getLength(); - if (!cigarElementIterator.hasNext()) { - if (allowGoalNotReached) { - return new Pair(CLIPPING_GOAL_NOT_REACHED, false); - } else { - throw new ReviewedGATKException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); - } - } - - nextCigarElement = cigarElementIterator.next(); - } - - // if it's a deletion (or skipped region), we will pass the information on to be handled downstream. 
- endJustBeforeDeletionOrSkippedRegion = (nextCigarElement.getOperator() == CigarOperator.DELETION || nextCigarElement.getOperator() == CigarOperator.SKIPPED_REGION); - } - - fallsInsideOrJustBeforeDeletionOrSkippedRegion = endJustBeforeDeletionOrSkippedRegion || fallsInsideDeletionOrSkippedRegion; - - // If we reached our goal outside a deletion (or skipped region), add the shift - if (!fallsInsideOrJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases()) - readBases += shift; - - // If we reached our goal just before a deletion (or skipped region) we need - // to add the shift of the current cigar element but go back to it's last element to return the last - // base before the deletion (or skipped region) (see warning in function contracts) - else if (endJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases()) - readBases += shift - 1; - - // If we reached our goal inside a deletion (or skipped region), or just between a deletion and a skipped region, - // then we must backtrack to the last base before the deletion (or skipped region) - else if (fallsInsideDeletionOrSkippedRegion || - (endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.N)) || - (endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.D))) - readBases--; - } - } - - if (!goalReached) { - if (allowGoalNotReached) { - return new Pair(CLIPPING_GOAL_NOT_REACHED, false); - } else { - throw new ReviewedGATKException("Somehow the requested coordinate is not covered by the read. Alignment " + alignmentStart + " | " + cigar); - } - } - - return new Pair(readBases, fallsInsideOrJustBeforeDeletionOrSkippedRegion); - } - - /** - * Compares two SAMRecords only the basis on alignment start. Note that - * comparisons are performed ONLY on the basis of alignment start; any - * two SAM records with the same alignment start will be considered equal. 
- * - * Unmapped alignments will all be considered equal. - */ - - @Requires({"read1 != null", "read2 != null"}) - public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) { - AlignmentStartComparator comp = new AlignmentStartComparator(); - return comp.compare(read1, read2); - } - - /** - * Is a base inside a read? - * - * @param read the read to evaluate - * @param referenceCoordinate the reference coordinate of the base to test - * @return true if it is inside the read, false otherwise. - */ - public static boolean isInsideRead(final GATKSAMRecord read, final int referenceCoordinate) { - return referenceCoordinate >= read.getAlignmentStart() && referenceCoordinate <= read.getAlignmentEnd(); - } - - /** - * Is this read all insertion? - * - * @param read - * @return whether or not the only element in the cigar string is an Insertion - */ - public static boolean readIsEntirelyInsertion(GATKSAMRecord read) { - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - if (cigarElement.getOperator() != CigarOperator.INSERTION) - return false; - } - return true; - } - - /** - * @see #readStartsWithInsertion(htsjdk.samtools.Cigar, boolean) with ignoreClipOps set to true - */ - public static CigarElement readStartsWithInsertion(final Cigar cigarForRead) { - return readStartsWithInsertion(cigarForRead, true); - } - - /** - * Checks if a read starts with an insertion. - * - * @param cigarForRead the CIGAR to evaluate - * @param ignoreSoftClipOps should we ignore S operators when evaluating whether an I operator is at the beginning? Note that H operators are always ignored. 
- * @return the element if it's a leading insertion or null otherwise - */ - public static CigarElement readStartsWithInsertion(final Cigar cigarForRead, final boolean ignoreSoftClipOps) { - for ( final CigarElement cigarElement : cigarForRead.getCigarElements() ) { - if ( cigarElement.getOperator() == CigarOperator.INSERTION ) - return cigarElement; - - else if ( cigarElement.getOperator() != CigarOperator.HARD_CLIP && ( !ignoreSoftClipOps || cigarElement.getOperator() != CigarOperator.SOFT_CLIP) ) - break; - } - return null; - } - - /** - * Returns the coverage distribution of a list of reads within the desired region. - * - * See getCoverageDistributionOfRead for information on how the coverage is calculated. - * - * @param list the list of reads covering the region - * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) - * @return an array with the coverage of each position from startLocation to stopLocation - */ - public static int [] getCoverageDistributionOfReads(List list, int startLocation, int stopLocation) { - int [] totalCoverage = new int[stopLocation - startLocation + 1]; - - for (GATKSAMRecord read : list) { - int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); - totalCoverage = MathUtils.addArrays(totalCoverage, readCoverage); - } - - return totalCoverage; - } - - /** - * Returns the coverage distribution of a single read within the desired region. 
- * - * Note: This function counts DELETIONS as coverage (since the main purpose is to downsample - * reads for variant regions, and deletions count as variants) - * - * @param read the read to get the coverage distribution of - * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) - * @return an array with the coverage of each position from startLocation to stopLocation - */ - public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) { - int [] coverage = new int[stopLocation - startLocation + 1]; - int refLocation = read.getSoftStart(); - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - switch (cigarElement.getOperator()) { - case S: - case M: - case EQ: - case N: - case X: - case D: - for (int i = 0; i < cigarElement.getLength(); i++) { - if (refLocation >= startLocation && refLocation <= stopLocation) { - coverage[refLocation - startLocation]++; - } - refLocation++; - } - break; - - case P: - case I: - case H: - break; - } - - if (refLocation > stopLocation) - break; - } - return coverage; - } - - /** - * Makes association maps for the reads and loci coverage as described below : - * - * - First: locusToReadMap -- a HashMap that describes for each locus, which reads contribute to its coverage. - * Note: Locus is in reference coordinates. - * Example: Locus => {read1, read2, ..., readN} - * - * - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage. - * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage. - * Example: Read => {true, true, false, ... 
false} - * - * @param readList the list of reads to generate the association mappings - * @param startLocation the first reference coordinate of the region (inclusive) - * @param stopLocation the last reference coordinate of the region (inclusive) - * @return the two hashmaps described above - */ - public static Pair> , HashMap> getBothReadToLociMappings (List readList, int startLocation, int stopLocation) { - int arraySize = stopLocation - startLocation + 1; - - HashMap> locusToReadMap = new HashMap>(2*(stopLocation - startLocation + 1), 0.5f); - HashMap readToLocusMap = new HashMap(2*readList.size(), 0.5f); - - for (int i = startLocation; i <= stopLocation; i++) - locusToReadMap.put(i, new HashSet()); // Initialize the locusToRead map with empty lists - - for (GATKSAMRecord read : readList) { - readToLocusMap.put(read, new Boolean[arraySize]); // Initialize the readToLocus map with empty arrays - - int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); - - for (int i = 0; i < readCoverage.length; i++) { - int refLocation = i + startLocation; - if (readCoverage[i] > 0) { - // Update the hash for this locus - HashSet readSet = locusToReadMap.get(refLocation); - readSet.add(read); - - // Add this locus to the read hash - readToLocusMap.get(read)[refLocation - startLocation] = true; - } - else - // Update the boolean array with a 'no coverage' from this read to this locus - readToLocusMap.get(read)[refLocation-startLocation] = false; - } - } - return new Pair>, HashMap>(locusToReadMap, readToLocusMap); - } - - /** - * Create random read qualities - * - * @param length the length of the read - * @return an array with randomized base qualities between 0 and 50 - */ - public static byte[] createRandomReadQuals(int length) { - Random random = GenomeAnalysisEngine.getRandomGenerator(); - byte[] quals = new byte[length]; - for (int i = 0; i < length; i++) - quals[i] = (byte) random.nextInt(50); - return quals; - } - - /** - * Create random 
read qualities - * - * @param length the length of the read - * @param allowNs whether or not to allow N's in the read - * @return an array with randomized bases (A-N) with equal probability - */ - public static byte[] createRandomReadBases(int length, boolean allowNs) { - Random random = GenomeAnalysisEngine.getRandomGenerator(); - int numberOfBases = allowNs ? 5 : 4; - byte[] bases = new byte[length]; - for (int i = 0; i < length; i++) { - switch (random.nextInt(numberOfBases)) { - case 0: - bases[i] = 'A'; - break; - case 1: - bases[i] = 'C'; - break; - case 2: - bases[i] = 'G'; - break; - case 3: - bases[i] = 'T'; - break; - case 4: - bases[i] = 'N'; - break; - default: - throw new ReviewedGATKException("Something went wrong, this is just impossible"); - } - } - return bases; - } - - public static GATKSAMRecord createRandomRead(int length) { - return createRandomRead(length, true); - } - - public static GATKSAMRecord createRandomRead(int length, boolean allowNs) { - byte[] quals = ReadUtils.createRandomReadQuals(length); - byte[] bbases = ReadUtils.createRandomReadBases(length, allowNs); - return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); - } - - - public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) { - String[] sequenceRecordNames = new String[sequenceDictionary.size()]; - int sequenceRecordIndex = 0; - for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences()) - sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); - return Arrays.deepToString(sequenceRecordNames); - } - - /** - * Calculates the reference coordinate for a read coordinate - * - * @param read the read - * @param offset the base in the read (coordinate in the read) - * @return the reference coordinate correspondent to this base - */ - public static long getReferenceCoordinateForReadCoordinate(GATKSAMRecord read, int offset) { - if (offset > read.getReadLength()) - throw new 
ReviewedGATKException(String.format(OFFSET_OUT_OF_BOUNDS_EXCEPTION, offset, read.getReadLength())); - - long location = read.getAlignmentStart(); - Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); - while (offset > 0 && cigarElementIterator.hasNext()) { - CigarElement cigarElement = cigarElementIterator.next(); - long move = 0; - if (cigarElement.getOperator().consumesReferenceBases()) - move = (long) Math.min(cigarElement.getLength(), offset); - location += move; - offset -= move; - } - if (offset > 0 && !cigarElementIterator.hasNext()) - throw new ReviewedGATKException(OFFSET_NOT_ZERO_EXCEPTION); - - return location; - } - - /** - * Creates a map with each event in the read (cigar operator) and the read coordinate where it happened. - * - * Example: - * D -> 2, 34, 75 - * I -> 55 - * S -> 0, 101 - * H -> 101 - * - * @param read the read - * @return a map with the properties described above. See example - */ - public static Map> getCigarOperatorForAllBases (GATKSAMRecord read) { - Map> events = new HashMap>(); - - int position = 0; - for (CigarElement cigarElement : read.getCigar().getCigarElements()) { - CigarOperator op = cigarElement.getOperator(); - if (op.consumesReadBases()) { - ArrayList list = events.get(op); - if (list == null) { - list = new ArrayList(); - events.put(op, list); - } - for (int i = position; i < cigarElement.getLength(); i++) - list.add(position++); - } - else { - ArrayList list = events.get(op); - if (list == null) { - list = new ArrayList(); - events.put(op, list); - } - list.add(position); - } - } - return events; - } - - /** - * Given a read, outputs the read bases in a string format - * - * @param read the read - * @return a string representation of the read bases - */ - public static String convertReadBasesToString(GATKSAMRecord read) { - String bases = ""; - for (byte b : read.getReadBases()) { - bases += (char) b; - } - return bases.toUpperCase(); - } - - /** - * Given a read, outputs the base 
qualities in a string format - * - * @param quals the read qualities - * @return a string representation of the base qualities - */ - public static String convertReadQualToString(byte[] quals) { - String result = ""; - for (byte b : quals) { - result += (char) (33 + b); - } - return result; - } - - /** - * Given a read, outputs the base qualities in a string format - * - * @param read the read - * @return a string representation of the base qualities - */ - public static String convertReadQualToString(GATKSAMRecord read) { - return convertReadQualToString(read.getBaseQualities()); - } - - /** - * Returns the reverse complement of the read bases - * - * @param bases the read bases - * @return the reverse complement of the read bases - */ - public static String getBasesReverseComplement(byte[] bases) { - String reverse = ""; - for (int i = bases.length-1; i >=0; i--) { - reverse += (char) BaseUtils.getComplement(bases[i]); - } - return reverse; - } - - /** - * Returns the reverse complement of the read bases - * - * @param read the read - * @return the reverse complement of the read bases - */ - public static String getBasesReverseComplement(GATKSAMRecord read) { - return getBasesReverseComplement(read.getReadBases()); - } - - /** - * Calculate the maximum read length from the given list of reads. 
- * @param reads list of reads - * @return non-negative integer - */ - @Ensures({"result >= 0"}) - public static int getMaxReadLength( final List reads ) { - if( reads == null ) { throw new IllegalArgumentException("Attempting to check a null list of reads."); } - - int maxReadLength = 0; - for( final GATKSAMRecord read : reads ) { - maxReadLength = Math.max(maxReadLength, read.getReadLength()); - } - return maxReadLength; - } -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java deleted file mode 100644 index d6e1bcbea..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java +++ /dev/null @@ -1,344 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.text; - -import org.broadinstitute.gatk.utils.commandline.ParsingEngine; -import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.refdata.tracks.FeatureManager; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.*; -import java.util.regex.Pattern; - -/** - * A collection of convenience methods for working with list files. - */ -public class ListFileUtils { - /** - * Lines starting with this String in .list files are considered comments. - */ - public static final String LIST_FILE_COMMENT_START = "#"; - - /** - * Unpack the bam files to be processed, given a list of files. That list of files can - * itself contain entries which are lists of other files to be read (note: you cannot have lists - * of lists of lists). Lines in .list files containing only whitespace or which begin with - * LIST_FILE_COMMENT_START are ignored. - * - * @param samFiles The sam files, in string format. 
- * @param parser Parser - * @return a flattened list of the bam files provided - */ - public static List unpackBAMFileList(final List samFiles, final ParsingEngine parser) { - List unpackedReads = new ArrayList(); - for( String inputFileName: samFiles ) { - Tags inputFileNameTags = parser.getTags(inputFileName); - inputFileName = expandFileName(inputFileName); - if (inputFileName.toLowerCase().endsWith(".list") ) { - try { - for ( String fileName : new XReadLines(new File(inputFileName), true, LIST_FILE_COMMENT_START) ) { - unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName))); - } - } - catch( FileNotFoundException ex ) { - throw new UserException.CouldNotReadInputFile(new File(inputFileName), "Unable to find file while unpacking reads", ex); - } - } - else if(inputFileName.toLowerCase().endsWith(".bam")) { - unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags)); - } - else if(inputFileName.endsWith("stdin")) { - unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags)); - } - else { - throw new UserException.CommandLineException(String.format("The GATK reads argument (-I, --input_file) supports only BAM files with the .bam extension and lists of BAM files " + - "with the .list extension, but the file %s has neither extension. Please ensure that your BAM file or list " + - "of BAM files is in the correct format, update the extension, and try again.",inputFileName)); - } - } - return unpackedReads; - } - - /** - * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. - * @param RODBindings a text equivale - * @param parser Parser - * @return a list of expanded, bound RODs. - */ - @Deprecated - @SuppressWarnings("unused") // TODO: Who is still using this? External walkers? - public static Collection unpackRODBindingsOldStyle(final Collection RODBindings, final ParsingEngine parser) { - // todo -- this is a strange home for this code. 
Move into ROD system - Collection rodBindings = new ArrayList(); - - for (String fileName: RODBindings) { - final Tags tags = parser.getTags(fileName); - fileName = expandFileName(fileName); - - List positionalTags = tags.getPositionalTags(); - if(positionalTags.size() != 2) - throw new UserException("Invalid syntax for -B (reference-ordered data) input flag. " + - "Please use the following syntax when providing reference-ordered " + - "data: -B:, ."); - // Assume that if tags are present, those tags are name and type. - // Name is always first, followed by type. - String name = positionalTags.get(0); - String type = positionalTags.get(1); - - RMDTriplet.RMDStorageType storageType; - if(tags.getValue("storage") != null) - storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,tags.getValue("storage")); - else if(fileName.toLowerCase().endsWith("stdin")) - storageType = RMDTriplet.RMDStorageType.STREAM; - else - storageType = RMDTriplet.RMDStorageType.FILE; - - rodBindings.add(new RMDTriplet(name,type,fileName,storageType,tags)); - } - - return rodBindings; - } - - /** - * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. - * @param RODBindings a text equivale - * @param parser Parser - * @return a list of expanded, bound RODs. - */ - @SuppressWarnings("unchecked") - public static Collection unpackRODBindings(final Collection RODBindings, @SuppressWarnings("unused") final ParsingEngine parser) { - // todo -- this is a strange home for this code. 
Move into ROD system - Collection rodBindings = new ArrayList(); - FeatureManager builderForValidation = new FeatureManager(); - - for (RodBinding rodBinding: RODBindings) { - String argValue = rodBinding.getSource(); - String fileName = expandFileName(argValue); - String name = rodBinding.getName(); - String type = rodBinding.getTribbleType(); - - RMDTriplet.RMDStorageType storageType; - if(rodBinding.getTags().getValue("storage") != null) - storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,rodBinding.getTags().getValue("storage")); - else if(fileName.toLowerCase().endsWith("stdin")) - storageType = RMDTriplet.RMDStorageType.STREAM; - else - storageType = RMDTriplet.RMDStorageType.FILE; - - RMDTriplet triplet = new RMDTriplet(name,type,fileName,storageType,rodBinding.getTags()); - - // validate triplet type - FeatureManager.FeatureDescriptor descriptor = builderForValidation.getByTriplet(triplet); - if ( descriptor == null ) - throw new UserException.UnknownTribbleType(rodBinding.getTribbleType(), - String.format("Field %s had provided type %s but there's no such Tribble type. The compatible types are: %n%s", - rodBinding.getName(), rodBinding.getTribbleType(), builderForValidation.userFriendlyListOfAvailableFeatures(rodBinding.getType()))); - if ( ! rodBinding.getType().isAssignableFrom(descriptor.getFeatureClass()) ) - throw new UserException.BadArgumentValue(rodBinding.getName(), - String.format("Field %s expects Features of type %s, but the input file produces Features of type %s. The compatible types are: %n%s", - rodBinding.getName(), rodBinding.getType().getSimpleName(), descriptor.getSimpleFeatureName(), - builderForValidation.userFriendlyListOfAvailableFeatures(rodBinding.getType()))); - - - rodBindings.add(triplet); - } - - return rodBindings; - } - - /** - * Expand any special characters that appear in the filename. 
Right now, '-' is expanded to - * '/dev/stdin' only, but in the future, special characters like '~' and '*' that are passed - * directly to the command line in some circumstances could be expanded as well. Be careful - * when adding UNIX-isms. - * @param argument the text appearing on the command-line. - * @return An expanded string suitable for opening by Java/UNIX file handling utilities. - */ - private static String expandFileName(String argument) { - if(argument.trim().equals("-")) - return "/dev/stdin"; - return argument; - } - - /** - * Returns a new set of values, containing a final set of values expanded from values - *

- * Each element E of values can either be a literal string or a file ending in .list. - * For each E ending in .list we try to read a file named E from disk, and if possible - * all lines from that file are expanded into unique values. - * - * @param values Original values - * @return entries from values or the files listed in values - */ - public static Set unpackSet(Collection values) { - if (values == null) - throw new NullPointerException("values cannot be null"); - Set unpackedValues = new LinkedHashSet(); - // Let's first go through the list and see if we were given any files. - // We'll add every entry in the file to our set, and treat the entries as - // if they had been specified on the command line. - for (String value : values) { - File file = new File(value); - if (value.toLowerCase().endsWith(".list") && file.exists()) { - try { - unpackedValues.addAll(new XReadLines(file, true, LIST_FILE_COMMENT_START).readLines()); - } catch (IOException e) { - throw new UserException.CouldNotReadInputFile(file, e); - } - } else { - unpackedValues.add(value); - } - } - return unpackedValues; - } - - /** - * Returns a new set of values including only values listed by filters - *

- * Each element E of values can either be a literal string or a file. For each E, - * we try to read a file named E from disk, and if possible all lines from that file are expanded - * into unique names. - *

- * Filters may also be a file of filters. - * - * @param values Values or files with values - * @param filters Filters or files with filters - * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions - * @return entries from values or the files listed in values, filtered by filters - */ - public static Set includeMatching(Collection values, Collection filters, boolean exactMatch) { - return includeMatching(values, IDENTITY_STRING_CONVERTER, filters, exactMatch); - } - - /** - * Converts a type T to a String representation. - * - * @param Type to convert to a String. - */ - public static interface StringConverter { - String convert(T value); - } - - /** - * Returns a new set of values including only values matching filters - *

- * Filters may also be a file of filters. - *

- * The converter should convert T to a unique String for each value in the set. - * - * @param values Values or files with values - * @param converter Converts values to strings - * @param filters Filters or files with filters - * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions - * @return entries from values including only values matching filters - */ - public static Set includeMatching(Collection values, StringConverter converter, Collection filters, boolean exactMatch) { - if (values == null) - throw new NullPointerException("values cannot be null"); - if (converter == null) - throw new NullPointerException("converter cannot be null"); - if (filters == null) - throw new NullPointerException("filters cannot be null"); - - Set unpackedFilters = unpackSet(filters); - Set filteredValues = new LinkedHashSet(); - Collection patterns = null; - if (!exactMatch) - patterns = compilePatterns(unpackedFilters); - for (T value : values) { - String converted = converter.convert(value); - if (unpackedFilters.contains(converted)) { - filteredValues.add(value); - } else if (!exactMatch) { - for (Pattern pattern : patterns) - if (pattern.matcher(converted).find()) - filteredValues.add(value); - } - } - return filteredValues; - } - - /** - * Returns a new set of values excluding any values matching filters. - *

- * Filters may also be a file of filters. - *

- * The converter should convert T to a unique String for each value in the set. - * - * @param values Values or files with values - * @param converter Converts values to strings - * @param filters Filters or files with filters - * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions - * @return entries from values exluding any values matching filters - */ - public static Set excludeMatching(Collection values, StringConverter converter, Collection filters, boolean exactMatch) { - if (values == null) - throw new NullPointerException("values cannot be null"); - if (converter == null) - throw new NullPointerException("converter cannot be null"); - if (filters == null) - throw new NullPointerException("filters cannot be null"); - - Set unpackedFilters = unpackSet(filters); - Set filteredValues = new LinkedHashSet(); - filteredValues.addAll(values); - Collection patterns = null; - if (!exactMatch) - patterns = compilePatterns(unpackedFilters); - for (T value : values) { - String converted = converter.convert(value); - if (unpackedFilters.contains(converted)) { - filteredValues.remove(value); - } else if (!exactMatch) { - for (Pattern pattern : patterns) - if (pattern.matcher(converted).find()) - filteredValues.remove(value); - } - } - return filteredValues; - } - - private static Collection compilePatterns(Collection filters) { - Collection patterns = new ArrayList(); - for (String filter: filters) { - patterns.add(Pattern.compile(filter)); - } - return patterns; - } - - protected static final StringConverter IDENTITY_STRING_CONVERTER = new StringConverter() { - @Override - public String convert(String value) { - return value; - } - }; -} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFUtils.java deleted file mode 100644 index 6baa7b654..000000000 --- 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFUtils.java +++ /dev/null @@ -1,316 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.variant; - -import htsjdk.samtools.SAMSequenceDictionary; -import org.apache.log4j.Logger; -import htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import htsjdk.tribble.FeatureCodecHeader; -import htsjdk.tribble.index.DynamicIndexCreator; -import htsjdk.tribble.index.IndexCreator; -import htsjdk.tribble.index.IndexFactory; -import htsjdk.tribble.index.interval.IntervalIndexCreator; -import htsjdk.tribble.index.linear.LinearIndexCreator; -import htsjdk.tribble.index.tabix.TabixFormat; -import htsjdk.tribble.index.tabix.TabixIndexCreator; -import htsjdk.tribble.readers.LineIterator; -import htsjdk.tribble.readers.PositionalBufferedStream; -import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.io.stubs.VCFWriterArgumentTypeDescriptor; -import org.broadinstitute.gatk.utils.collections.Pair; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.*; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.*; - - -/** - * A set of GATK-specific static utility methods for common operations on VCF files/records. - */ -public class GATKVCFUtils { - - /** - * Constructor access disallowed...static utility methods only! - */ - private GATKVCFUtils() { } - - public static final Logger logger = Logger.getLogger(GATKVCFUtils.class); - public final static String GATK_COMMAND_LINE_KEY = "GATKCommandLine"; - - public final static GATKVCFIndexType DEFAULT_INDEX_TYPE = GATKVCFIndexType.DYNAMIC_SEEK; // by default, optimize for seek time. All indices prior to Nov 2013 used this type. 
- public final static Integer DEFAULT_INDEX_PARAMETER = -1; // the default DYNAMIC_SEEK does not use a parameter - - /** - * Gets the appropriately formatted header for a VCF file describing this GATK run - * - * @param engine the GATK engine that holds the walker name, GATK version, and other information - * @param argumentSources contains information on the argument values provided to the GATK for converting to a - * command line string. Should be provided from the data in the parsing engine. Can be - * empty in which case the command line will be the empty string. - * @return VCF header line describing this run of the GATK. - */ - public static VCFHeaderLine getCommandLineArgumentHeaderLine(final GenomeAnalysisEngine engine, final Collection argumentSources) { - if ( engine == null ) throw new IllegalArgumentException("engine cannot be null"); - if ( argumentSources == null ) throw new IllegalArgumentException("argumentSources cannot be null"); - - final Map attributes = new LinkedHashMap<>(); - attributes.put("ID", engine.getWalkerName()); - attributes.put("Version", CommandLineGATK.getVersionNumber()); - final Date date = new Date(); - attributes.put("Date", date.toString()); - attributes.put("Epoch", Long.toString(date.getTime())); - attributes.put("CommandLineOptions", engine.createApproximateCommandLineArgumentString(argumentSources.toArray())); - return new VCFSimpleHeaderLine(GATK_COMMAND_LINE_KEY, attributes); - } - - public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, List> rodBindings) { - // Collect the eval rod names - final Set names = new TreeSet(); - for ( final RodBinding evalRod : rodBindings ) - names.add(evalRod.getName()); - return getVCFHeadersFromRods(toolkit, names); - } - - public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit) { - return getVCFHeadersFromRods(toolkit, (Collection)null); - } - - public static Map getVCFHeadersFromRods(GenomeAnalysisEngine toolkit, Collection rodNames) { - Map data = new 
HashMap(); - - // iterate to get all of the sample names - List dataSources = toolkit.getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - // ignore the rod if it's not in our list - if ( rodNames != null && !rodNames.contains(source.getName()) ) - continue; - - if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) - data.put(source.getName(), (VCFHeader)source.getHeader()); - } - - return data; - } - - public static Map getVCFHeadersFromRodPrefix(GenomeAnalysisEngine toolkit,String prefix) { - Map data = new HashMap(); - - // iterate to get all of the sample names - List dataSources = toolkit.getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - // ignore the rod if lacks the prefix - if ( ! source.getName().startsWith(prefix) ) - continue; - - if ( source.getHeader() != null && source.getHeader() instanceof VCFHeader ) - data.put(source.getName(), (VCFHeader)source.getHeader()); - } - - return data; - } - - /** - * Gets the header fields from all VCF rods input by the user - * - * @param toolkit GATK engine - * - * @return a set of all fields - */ - public static Set getHeaderFields(GenomeAnalysisEngine toolkit) { - return getHeaderFields(toolkit, null); - } - - /** - * Gets the header fields from all VCF rods input by the user - * - * @param toolkit GATK engine - * @param rodNames names of rods to use, or null if we should use all possible ones - * - * @return a set of all fields - */ - public static Set getHeaderFields(GenomeAnalysisEngine toolkit, Collection rodNames) { - - // keep a map of sample name to occurrences encountered - TreeSet fields = new TreeSet(); - - // iterate to get all of the sample names - List dataSources = toolkit.getRodDataSources(); - for ( ReferenceOrderedDataSource source : dataSources ) { - // ignore the rod if it's not in our list - if ( rodNames != null && !rodNames.contains(source.getName()) ) - continue; - - if ( 
source.getRecordType().equals(VariantContext.class)) { - VCFHeader header = (VCFHeader)source.getHeader(); - if ( header != null ) - fields.addAll(header.getMetaDataInSortedOrder()); - } - } - - return fields; - } - - /** - * Add / replace the contig header lines in the VCFHeader with the information in the GATK engine - * - * @param header the header to update - * @param engine the GATK engine containing command line arguments and the master sequence dictionary - */ - public static VCFHeader withUpdatedContigs(final VCFHeader header, final GenomeAnalysisEngine engine) { - return VCFUtils.withUpdatedContigs(header, engine.getArguments().referenceFile, engine.getMasterSequenceDictionary()); - } - - /** - * Create and return an IndexCreator - * @param type - * @param parameter - * @param outFile - * @return - */ - public static IndexCreator getIndexCreator(GATKVCFIndexType type, int parameter, File outFile) { - return getIndexCreator(type, parameter, outFile, null); - } - - /** - * Create and return an IndexCreator - * @param type - * @param parameter - * @param outFile - * @param sequenceDictionary - * @return - */ - public static IndexCreator getIndexCreator(GATKVCFIndexType type, int parameter, File outFile, SAMSequenceDictionary sequenceDictionary) { - if (VCFWriterArgumentTypeDescriptor.isCompressed(outFile.toString())) { - if (type != GATKVCFUtils.DEFAULT_INDEX_TYPE || parameter != GATKVCFUtils.DEFAULT_INDEX_PARAMETER) - logger.warn("Creating Tabix index for " + outFile + ", ignoring user-specified index type and parameter"); - - if (sequenceDictionary == null) - return new TabixIndexCreator(TabixFormat.VCF); - else - return new TabixIndexCreator(sequenceDictionary, TabixFormat.VCF); - } - - IndexCreator idxCreator; - switch (type) { - case DYNAMIC_SEEK: idxCreator = new DynamicIndexCreator(outFile, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); break; - case DYNAMIC_SIZE: idxCreator = new DynamicIndexCreator(outFile, 
IndexFactory.IndexBalanceApproach.FOR_SIZE); break; - case LINEAR: idxCreator = new LinearIndexCreator(outFile, parameter); break; - case INTERVAL: idxCreator = new IntervalIndexCreator(outFile, parameter); break; - default: throw new IllegalArgumentException("Unknown IndexCreator type: " + type); - } - - return idxCreator; - } - - /** - * Utility class to read all of the VC records from a file - * - * @param file - * @param codec - * @return - * @throws IOException - */ - public final static Pair> readAllVCs( final File file, final FeatureCodec codec) throws IOException { - // read in the features - SOURCE source = codec.makeSourceFromStream(new FileInputStream(file)); - FeatureCodecHeader header = codec.readHeader(source); - final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); - return new Pair<>(vcfHeader, new VCIterable<>(source, codec, vcfHeader)); - } - - public static class VCIterable implements Iterable, Iterator { - final SOURCE source; - final FeatureCodec codec; - final VCFHeader header; - - private VCIterable(final SOURCE source, final FeatureCodec codec, final VCFHeader header) { - this.source = source; - this.codec = codec; - this.header = header; - } - - @Override - public Iterator iterator() { - return this; - } - - @Override - public boolean hasNext() { - return ! codec.isDone(source); - } - - @Override - public VariantContext next() { - try { - final VariantContext vc = codec.decode(source); - return vc == null ? null : vc.fullyDecode(header, false); - } catch ( IOException e ) { - throw new RuntimeException(e); - } - } - - @Override - public void remove() { - } - } - - /** - * Read all of the VCF records from source into memory, returning the header and the VariantContexts - * - * SHOULD ONLY BE USED FOR UNIT/INTEGRATION TESTING PURPOSES! 
- * - * @param source the file to read, must be in VCF4 format - * @return - * @throws java.io.IOException - */ - public static Pair> readVCF(final File source) throws IOException { - // read in the features - final List vcs = new ArrayList(); - final VCFCodec codec = new VCFCodec(); - PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); - final LineIterator vcfSource = codec.makeSourceFromStream(pbs); - try { - final VCFHeader vcfHeader = (VCFHeader) codec.readActualHeader(vcfSource); - - while (vcfSource.hasNext()) { - final VariantContext vc = codec.decode(vcfSource); - if ( vc != null ) - vcs.add(vc); - } - - return new Pair>(vcfHeader, vcs); - } finally { - codec.close(vcfSource); - } - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java deleted file mode 100644 index a099d8c9c..000000000 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java +++ /dev/null @@ -1,1960 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.variant; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import htsjdk.tribble.TribbleException; -import htsjdk.tribble.util.popgen.HardyWeinbergCalculation; -import htsjdk.variant.variantcontext.*; -import htsjdk.variant.vcf.VCFConstants; -import org.apache.commons.lang.ArrayUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.*; -import org.broadinstitute.gatk.utils.collections.Pair; - -import java.io.Serializable; -import java.util.*; - -public class GATKVariantContextUtils { - - private static Logger logger = Logger.getLogger(GATKVariantContextUtils.class); - - public static final int DEFAULT_PLOIDY = HomoSapiensConstants.DEFAULT_PLOIDY; - - public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. - - /** - * Diploid NO_CALL allele list... - * - * @deprecated you should use {@link #noCallAlleles(int)} instead. It indicates the presence of a hardcoded diploid assumption which is bad. 
- */ - @Deprecated - public final static List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - - public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; - public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site - - public final static String MERGE_FILTER_PREFIX = "filterIn"; - public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; - public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; - public final static String MERGE_INTERSECTION = "Intersection"; - - /** - * Checks whether a variant-context overlaps with a region. - * - *

- * No event overlaps an unmapped region. - *

- * - * @param variantContext variant-context to test the overlap with. - * @param region region to test the overlap with. - * - * @throws IllegalArgumentException if either region or event is {@code null}. - * - * @return {@code true} if there is an overlap between the event described and the active region provided. - */ - public static boolean overlapsRegion(final VariantContext variantContext, final GenomeLoc region) { - if (region == null) throw new IllegalArgumentException("the active region provided cannot be null"); - if (variantContext == null) throw new IllegalArgumentException("the variant context provided cannot be null"); - if (region.isUnmapped()) - return false; - if (variantContext.getEnd() < region.getStart()) - return false; - if (variantContext.getStart() > region.getStop()) - return false; - if (!variantContext.getChr().equals(region.getContig())) - return false; - return true; - } - - /** - * Returns a homozygous call allele list given the only allele and the ploidy. - * - * @param allele the only allele in the allele list. - * @param ploidy the ploidy of the resulting allele list. - * - * @throws IllegalArgumentException if {@code allele} is {@code null} or ploidy is negative. - * - * @return never {@code null}. - */ - public static List homozygousAlleleList(final Allele allele, final int ploidy) { - if (allele == null || ploidy < 0) - throw new IllegalArgumentException(); - - // Use a tailored inner class to implement the list: - return Collections.nCopies(ploidy,allele); - } - - private static boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { - final Iterator it1 = alleleSet1.iterator(); - final Iterator it2 = alleleSet2.iterator(); - - while ( it1.hasNext() && it2.hasNext() ) { - final Allele a1 = it1.next(); - final Allele a2 = it2.next(); - if ( ! a1.equals(a2) ) - return true; - } - - // by this point, at least one of the iterators is empty. 
All of the elements - // we've compared are equal up until this point. But it's possible that the - // sets aren't the same size, which is indicated by the test below. If they - // are of the same size, though, the sets are compatible - return it1.hasNext() || it2.hasNext(); - } - - /** - * Determines the common reference allele - * - * @param VCs the list of VariantContexts - * @param loc if not null, ignore records that do not begin at this start location - * @return possibly null Allele - */ - protected static Allele determineReferenceAllele(final List VCs, final GenomeLoc loc) { - Allele ref = null; - - for ( final VariantContext vc : VCs ) { - if ( contextMatchesLoc(vc, loc) ) { - final Allele myRef = vc.getReference(); - if ( ref == null || ref.length() < myRef.length() ) - ref = myRef; - else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) - throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); - } - } - - return ref; - } - - /** - * Calculates the total ploidy of a variant context as the sum of all plodies across genotypes. - * @param vc the target variant context. - * @param defaultPloidy the default ploidy to be assume when there is no ploidy information for a genotype. - * @return never {@code null}. - */ - public static int totalPloidy(final VariantContext vc, final int defaultPloidy) { - if (vc == null) - throw new IllegalArgumentException("the vc provided cannot be null"); - if (defaultPloidy < 0) - throw new IllegalArgumentException("the default ploidy must 0 or greater"); - int result = 0; - for (final Genotype genotype : vc.getGenotypes()) { - final int declaredPloidy = genotype.getPloidy(); - result += declaredPloidy <= 0 ? defaultPloidy : declaredPloidy; - } - - return result; - } - - public enum GenotypeMergeType { - /** - * Make all sample genotypes unique by file. 
Each sample shared across RODs gets named sample.ROD. - */ - UNIQUIFY, - /** - * Take genotypes in priority order (see the priority argument). - */ - PRIORITIZE, - /** - * Take the genotypes in any order. - */ - UNSORTED, - /** - * Require that all samples/genotypes be unique between all inputs. - */ - REQUIRE_UNIQUE - } - - public enum FilteredRecordMergeType { - /** - * Union - leaves the record if any record is unfiltered. - */ - KEEP_IF_ANY_UNFILTERED, - /** - * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. - */ - KEEP_IF_ALL_UNFILTERED, - /** - * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. - */ - KEEP_UNCONDITIONAL - } - - public enum MultipleAllelesMergeType { - /** - * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. - */ - BY_TYPE, - /** - * Merge all allele types at the same start position into the same VCF record. - */ - MIX_TYPES - } - - /** - * Refactored out of the AverageAltAlleleLength annotation class - * @param vc the variant context - * @return the average length of the alt allele (a double) - */ - public static double getMeanAltAlleleLength(VariantContext vc) { - double averageLength = 1.0; - if ( ! vc.isSNP() && ! 
vc.isSymbolic() ) { - // adjust for the event length - int averageLengthNum = 0; - int averageLengthDenom = 0; - int refLength = vc.getReference().length(); - for ( final Allele a : vc.getAlternateAlleles() ) { - int numAllele = vc.getCalledChrCount(a); - int alleleSize; - if ( a.length() == refLength ) { - // SNP or MNP - byte[] a_bases = a.getBases(); - byte[] ref_bases = vc.getReference().getBases(); - int n_mismatch = 0; - for ( int idx = 0; idx < a_bases.length; idx++ ) { - if ( a_bases[idx] != ref_bases[idx] ) - n_mismatch++; - } - alleleSize = n_mismatch; - } - else if ( a.isSymbolic() ) { - alleleSize = 1; - } else { - alleleSize = Math.abs(refLength-a.length()); - } - averageLengthNum += alleleSize*numAllele; - averageLengthDenom += numAllele; - } - averageLength = ( (double) averageLengthNum )/averageLengthDenom; - } - - return averageLength; - } - - /** - * create a genome location, given a variant context - * @param genomeLocParser parser - * @param vc the variant context - * @return the genomeLoc - */ - public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) { - return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); - } - - public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { - if (!context.isSNP() || !context.isBiallelic()) - throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); - return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); - } - - /** - * If this is a BiAllelic SNP, is it a transition? - */ - public static boolean isTransition(VariantContext context) { - return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; - } - - /** - * If this is a BiAllelic SNP, is it a transversion? 
- */ - public static boolean isTransversion(VariantContext context) { - return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; - } - - public static boolean isTransition(Allele ref, Allele alt) { - return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; - } - - public static boolean isTransversion(Allele ref, Allele alt) { - return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; - } - - /** - * Returns a context identical to this with the REF and ALT alleles reverse complemented. - * - * @param vc variant context - * @return new vc - */ - public static VariantContext reverseComplement(VariantContext vc) { - // create a mapping from original allele to reverse complemented allele - HashMap alleleMap = new HashMap<>(vc.getAlleles().size()); - for ( final Allele originalAllele : vc.getAlleles() ) { - Allele newAllele; - if ( originalAllele.isNoCall() ) - newAllele = originalAllele; - else - newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); - alleleMap.put(originalAllele, newAllele); - } - - // create new Genotype objects - GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype genotype : vc.getGenotypes() ) { - List newAlleles = new ArrayList<>(); - for ( final Allele allele : genotype.getAlleles() ) { - Allele newAllele = alleleMap.get(allele); - if ( newAllele == null ) - newAllele = Allele.NO_CALL; - newAlleles.add(newAllele); - } - newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); - } - - return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); - } - - /** - * Returns true iff VC is an non-complex indel where every allele represents an expansion or - * contraction of a series of identical bases in the reference. 
- * - * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT - * - * If VC = -/CT, then this function returns true because the CT insertion matches exactly the - * upcoming reference. - * If VC = -/CTA then this function returns false because the CTA isn't a perfect match - * - * Now consider deletions: - * - * If VC = CT/- then again the same logic applies and this returns true - * The case of CTA/- makes no sense because it doesn't actually match the reference bases. - * - * The logic of this function is pretty simple. Take all of the non-null alleles in VC. For - * each insertion allele of n bases, check if that allele matches the next n reference bases. - * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n, - * as it must necessarily match the first n bases. If this test returns true for all - * alleles you are a tandem repeat, otherwise you are not. - * - * @param vc - * @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference - * @return - */ - @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) - public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { - final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); - if ( ! vc.isIndel() ) // only indels are tandem repeats - return false; - - final Allele ref = vc.getReference(); - - for ( final Allele allele : vc.getAlternateAlleles() ) { - if ( ! 
isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) ) - return false; - } - - // we've passed all of the tests, so we are a repeat - return true; - } - - /** - * - * @param vc - * @param refBasesStartingAtVCWithPad - * @return - */ - @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) - public static Pair,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { - final boolean VERBOSE = false; - final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); - if ( ! vc.isIndel() ) // only indels are tandem repeats - return null; - - final Allele refAllele = vc.getReference(); - final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); - - byte[] repeatUnit = null; - final ArrayList lengths = new ArrayList<>(); - - for ( final Allele allele : vc.getAlternateAlleles() ) { - Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); - - final int[] repetitionCount = result.first; - // repetition count = 0 means allele is not a tandem expansion of context - if (repetitionCount[0] == 0 || repetitionCount[1] == 0) - return null; - - if (lengths.size() == 0) { - lengths.add(repetitionCount[0]); // add ref allele length only once - } - lengths.add(repetitionCount[1]); // add this alt allele's length - - repeatUnit = result.second; - if (VERBOSE) { - System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad); - System.out.println("Ref:"+refAllele.toString()+" Count:" + String.valueOf(repetitionCount[0])); - System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1])); - System.out.println("RU:"+new String(repeatUnit)); - } - } - - return new Pair, byte[]>(lengths,repeatUnit); - } - - public static Pair getNumTandemRepeatUnits(final byte[] refBases, final byte[] 
altBases, final byte[] remainingRefContext) { - /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units. - Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2. - */ - - byte[] longB; - // find first repeat unit based on either ref or alt, whichever is longer - if (altBases.length > refBases.length) - longB = altBases; - else - longB = refBases; - - // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units - // for example, -*,CACA needs to first be decomposed into (CA)2 - final int repeatUnitLength = findRepeatedSubstring(longB); - final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength); - - final int[] repetitionCount = new int[2]; - // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases) - int repetitionsInRef = findNumberOfRepetitions(repeatUnit, refBases, true); - repetitionCount[0] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; - repetitionCount[1] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; - - return new Pair<>(repetitionCount, repeatUnit); - - } - - /** - * Find out if a string can be represented as a tandem number of substrings. - * For example ACTACT is a 2-tandem of ACT, - * but ACTACA is not. 
- * - * @param bases String to be tested - * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't - * be represented as one, it will be just the length of the input string) - */ - public static int findRepeatedSubstring(byte[] bases) { - - int repLength; - for (repLength=1; repLength <=bases.length; repLength++) { - final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength); - boolean allBasesMatch = true; - for (int start = repLength; start < bases.length; start += repLength ) { - // check that remaining of string is exactly equal to repeat unit - final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length); - if (!Arrays.equals(candidateRepeatUnit, basePiece)) { - allBasesMatch = false; - break; - } - } - if (allBasesMatch) - return repLength; - } - - return repLength; - } - - /** - * Helper routine that finds number of repetitions a string consists of. - * For example, for string ATAT and repeat unit AT, number of repetitions = 2 - * @param repeatUnit Substring - * @param testString String to test - * @oaram lookForward Look for repetitions forward (at beginning of string) or backward (at end of string) - * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's - */ - public static int findNumberOfRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { - int numRepeats = 0; - if (lookForward) { - // look forward on the test string - for (int start = 0; start < testString.length; start += repeatUnit.length) { - int end = start + repeatUnit.length; - byte[] unit = Arrays.copyOfRange(testString,start, end); - if(Arrays.equals(unit,repeatUnit)) - numRepeats++; - else - break; - } - return numRepeats; - } - - // look backward. 
For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2 - // look forward on the test string - for (int start = testString.length - repeatUnit.length; start >= 0; start -= repeatUnit.length) { - int end = start + repeatUnit.length; - byte[] unit = Arrays.copyOfRange(testString,start, end); - if(Arrays.equals(unit,repeatUnit)) - numRepeats++; - else - break; - } - return numRepeats; - } - - /** - * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference - * @param ref - * @param alt - * @param refBasesStartingAtVCWithoutPad - * @return - */ - protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) { - if ( ! Allele.oneIsPrefixOfOther(ref, alt) ) - return false; // we require one allele be a prefix of another - - if ( ref.length() > alt.length() ) { // we are a deletion - return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2); - } else { // we are an insertion - return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); - } - } - - protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) { - final String potentialRepeat = l.substring(s.length()); // skip s bases - - for ( int i = 0; i < minNumberOfMatches; i++) { - final int start = i * potentialRepeat.length(); - final int end = (i+1) * potentialRepeat.length(); - if ( ref.length() < end ) - return false; // we ran out of bases to test - final String refSub = ref.substring(start, end); - if ( ! 
refSub.equals(potentialRepeat) ) - return false; // repeat didn't match, fail - } - - return true; // we passed all tests, we matched - } - - public enum GenotypeAssignmentMethod { - /** - * set all of the genotype GT values to NO_CALL - */ - SET_TO_NO_CALL, - - /** - * Use the subsetted PLs to greedily assigned genotypes - */ - USE_PLS_TO_ASSIGN, - - /** - * Try to match the original GT calls, if at all possible - * - * Suppose I have 3 alleles: A/B/C and the following samples: - * - * original_GT best_match to A/B best_match to A/C - * S1 => A/A A/A A/A - * S2 => A/B A/B A/A - * S3 => B/B B/B A/A - * S4 => B/C A/B A/C - * S5 => C/C A/A C/C - * - * Basically, all alleles not in the subset map to ref. It means that het-alt genotypes - * when split into 2 bi-allelic variants will be het in each, which is good in some cases, - * rather than the undetermined behavior when using the PLs to assign, which could result - * in hom-var or hom-ref for each, depending on the exact PL values. - */ - BEST_MATCH_TO_ORIGINAL, - - /** - * do not even bother changing the GTs - */ - DO_NOT_ASSIGN_GENOTYPES - } - - /** - * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) - * - * @param vc variant context with genotype likelihoods - * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** - * @param assignGenotypes assignment strategy for the (subsetted) PLs - * @return a new non-null GenotypesContext - */ - public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, - final List allelesToUse, - final GenotypeAssignmentMethod assignGenotypes) { - if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele"); - if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele"); - - // optimization: if no input genotypes, just exit - if 
(vc.getGenotypes().isEmpty()) return GenotypesContext.create(); - - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(vc, allelesToUse); - - // create the new genotypes - return createGenotypesWithSubsettedLikelihoods(vc.getGenotypes(), vc, allelesToUse, likelihoodIndexesToUse, assignGenotypes); - } - - /** - * Figure out which likelihood indexes to use for a selected down set of alleles - * - * @param originalVC the original VariantContext - * @param allelesToUse the subset of alleles to use - * @return a list of PL indexes to use or null if none - */ - private static List determineLikelihoodIndexesToUse(final VariantContext originalVC, final List allelesToUse) { - - // the bitset representing the allele indexes we want to keep - final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); - - // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, - // then we can keep the PLs as is; otherwise, we determine which ones to keep - if ( MathUtils.countOccurrences(true, alleleIndexesToUse) == alleleIndexesToUse.length ) - return null; - - return getLikelihoodIndexes(originalVC, alleleIndexesToUse); - } - - /** - * Get the actual likelihoods indexes to use given the corresponding allele indexes - * - * @param originalVC the original VariantContext - * @param alleleIndexesToUse the bitset representing the alleles to use (@see #getAlleleIndexBitset) - * @return a non-null List - */ - private static List getLikelihoodIndexes(final VariantContext originalVC, final boolean[] alleleIndexesToUse) { - - final List result = new ArrayList<>(30); - - // numLikelihoods takes total # of alleles. 
Use default # of chromosomes (ploidy) = 2 - final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), DEFAULT_PLOIDY); - - for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - // consider this entry only if both of the alleles are good - if ( alleleIndexesToUse[alleles.alleleIndex1] && alleleIndexesToUse[alleles.alleleIndex2] ) - result.add(PLindex); - } - - return result; - } - - /** - * Given an original VariantContext and a list of alleles from that VC to keep, - * returns a bitset representing which allele indexes should be kept - * - * @param originalVC the original VC - * @param allelesToKeep the list of alleles to keep - * @return non-null bitset - */ - private static boolean[] getAlleleIndexBitset(final VariantContext originalVC, final List allelesToKeep) { - final int numOriginalAltAlleles = originalVC.getNAlleles() - 1; - final boolean[] alleleIndexesToKeep = new boolean[numOriginalAltAlleles + 1]; - - // the reference Allele is definitely still used - alleleIndexesToKeep[0] = true; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) { - if ( allelesToKeep.contains(originalVC.getAlternateAllele(i)) ) - alleleIndexesToKeep[i+1] = true; - } - - return alleleIndexesToKeep; - } - - /** - * Create the new GenotypesContext with the subsetted PLs and ADs - * - * @param originalGs the original GenotypesContext - * @param vc the original VariantContext - * @param allelesToUse the actual alleles to use with the new Genotypes - * @param likelihoodIndexesToUse the indexes in the PL to use given the allelesToUse (@see #determineLikelihoodIndexesToUse()) - * @param assignGenotypes assignment strategy for the (subsetted) PLs - * @return a new non-null GenotypesContext - */ - private static GenotypesContext createGenotypesWithSubsettedLikelihoods(final GenotypesContext originalGs, - final VariantContext vc, - final List 
allelesToUse, - final List likelihoodIndexesToUse, - final GenotypeAssignmentMethod assignGenotypes) { - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); - - // make sure we are seeing the expected number of likelihoods per sample - final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); - - // the samples - final List sampleIndices = originalGs.getSampleNamesOrderedByName(); - - // create the new genotypes - for ( int k = 0; k < originalGs.size(); k++ ) { - final Genotype g = originalGs.get(sampleIndices.get(k)); - final GenotypeBuilder gb = new GenotypeBuilder(g); - - // create the new likelihoods array from the alleles we are allowed to use - double[] newLikelihoods; - if ( !g.hasLikelihoods() ) { - // we don't have any likelihoods, so we null out PLs and make G ./. - newLikelihoods = null; - gb.noPL(); - } else { - final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); - if ( likelihoodIndexesToUse == null ) { - newLikelihoods = originalLikelihoods; - } else if ( originalLikelihoods.length != expectedNumLikelihoods ) { - logger.debug("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); - newLikelihoods = null; - } else { - newLikelihoods = new double[likelihoodIndexesToUse.size()]; - int newIndex = 0; - for ( final int oldIndex : likelihoodIndexesToUse ) - newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; - - // might need to re-normalize - newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); - } - - if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) - gb.noPL(); - else - gb.PL(newLikelihoods); - } - - updateGenotypeAfterSubsetting(g.getAlleles(), gb, assignGenotypes, newLikelihoods, allelesToUse); - newGTs.add(gb.make()); - } - - return fixADFromSubsettedAlleles(newGTs, vc, allelesToUse); 
- } - - private static boolean likelihoodsAreUninformative(final double[] likelihoods) { - return MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL; - } - - /** - * Add the genotype call (GT) field to GenotypeBuilder using the requested algorithm assignmentMethod - * - * @param originalGT the original genotype calls, cannot be null - * @param gb the builder where we should put our newly called alleles, cannot be null - * @param assignmentMethod the method to use to do the assignment, cannot be null - * @param newLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null - * @param allelesToUse the alleles we are using for our subsetting - */ - public static void updateGenotypeAfterSubsetting(final List originalGT, - final GenotypeBuilder gb, - final GenotypeAssignmentMethod assignmentMethod, - final double[] newLikelihoods, - final List allelesToUse) { - switch ( assignmentMethod ) { - case DO_NOT_ASSIGN_GENOTYPES: - break; - case SET_TO_NO_CALL: - gb.alleles(NO_CALL_ALLELES); - gb.noGQ(); - break; - case USE_PLS_TO_ASSIGN: - if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) { - // if there is no mass on the (new) likelihoods, then just no-call the sample - gb.alleles(NO_CALL_ALLELES); - gb.noGQ(); - } else { - // find the genotype with maximum likelihoods - final int PLindex = MathUtils.maxElementIndex(newLikelihoods); - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); - gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); - } - break; - case BEST_MATCH_TO_ORIGINAL: - final List best = new LinkedList<>(); - final Allele ref = allelesToUse.get(0); // WARNING -- should be checked in input argument - for ( final Allele originalAllele : originalGT ) { - 
best.add(allelesToUse.contains(originalAllele) ? originalAllele : ref); - } - gb.noGQ(); - gb.noPL(); - gb.alleles(best); - break; - } - } - - /** - * Subset the samples in VC to reference only information with ref call alleles - * - * Preserves DP if present - * - * @param vc the variant context to subset down to - * @param ploidy ploidy to use if a genotype doesn't have any alleles - * @return a GenotypesContext - */ - public static GenotypesContext subsetToRefOnly(final VariantContext vc, final int ploidy) { - if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); - if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be >= 1 but got " + ploidy); - - // the genotypes with PLs - final GenotypesContext oldGTs = vc.getGenotypes(); - - // optimization: if no input genotypes, just exit - if (oldGTs.isEmpty()) return oldGTs; - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(oldGTs.size()); - - final Allele ref = vc.getReference(); - final List diploidRefAlleles = Arrays.asList(ref, ref); - - // create the new genotypes - for ( final Genotype g : vc.getGenotypes() ) { - final int gPloidy = g.getPloidy() == 0 ? ploidy : g.getPloidy(); - final List refAlleles = gPloidy == 2 ? 
diploidRefAlleles : Collections.nCopies(gPloidy, ref); - final GenotypeBuilder gb = new GenotypeBuilder(g.getSampleName(), refAlleles); - if ( g.hasDP() ) gb.DP(g.getDP()); - if ( g.hasGQ() ) gb.GQ(g.getGQ()); - newGTs.add(gb.make()); - } - - return newGTs; - } - - /** - * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs - * - * @param vc variant context with genotype likelihoods - * @return genotypes context - */ - public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { - return subsetDiploidAlleles(vc, vc.getAlleles(), GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); - } - - /** - * Split variant context into its biallelic components if there are more than 2 alleles - * - * For VC has A/B/C alleles, returns A/B and A/C contexts. - * Genotypes are all no-calls now (it's not possible to fix them easily) - * Alleles are right trimmed to satisfy VCF conventions - * - * If vc is biallelic or non-variant it is just returned - * - * Chromosome counts are updated (but they are by definition 0) - * - * @param vc a potentially multi-allelic variant context - * @return a list of bi-allelic (or monomorphic) variant context - */ - public static List splitVariantContextToBiallelics(final VariantContext vc) { - return splitVariantContextToBiallelics(vc, false, GenotypeAssignmentMethod.SET_TO_NO_CALL); - } - - /** - * Split variant context into its biallelic components if there are more than 2 alleles - * - * For VC has A/B/C alleles, returns A/B and A/C contexts. 
- * Genotypes are all no-calls now (it's not possible to fix them easily) - * Alleles are right trimmed to satisfy VCF conventions - * - * If vc is biallelic or non-variant it is just returned - * - * Chromosome counts are updated (but they are by definition 0) - * - * @param vc a potentially multi-allelic variant context - * @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome - * @return a list of bi-allelic (or monomorphic) variant context - */ - public static List splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod) { - if ( ! vc.isVariant() || vc.isBiallelic() ) - // non variant or biallelics already satisfy the contract - return Collections.singletonList(vc); - else { - final List biallelics = new LinkedList<>(); - - for ( final Allele alt : vc.getAlternateAlleles() ) { - VariantContextBuilder builder = new VariantContextBuilder(vc); - final List alleles = Arrays.asList(vc.getReference(), alt); - builder.alleles(alleles); - builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethod)); - VariantContextUtils.calculateChromosomeCounts(builder, true); - final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); - biallelics.add(trimmed); - } - - return biallelics; - } - } - - public static Genotype removePLsAndAD(final Genotype g) { - return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; - } - - //TODO consider refactor variant-context merging code so that we share as much as possible between - //TODO simpleMerge and referenceConfidenceMerge - //TODO likely using a separate helper class or hierarchy. - /** - * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. 
- * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with - * the sample name - * - * @param unsortedVCs collection of unsorted VCs - * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs - * @param filteredRecordMergeType merge type for filtered records - * @param genotypeMergeOptions merge option for genotypes - * @param annotateOrigin should we annotate the set it came from? - * @param printMessages should we print messages? - * @param setKey the key name of the set - * @param filteredAreUncalled are filtered records uncalled? - * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @return new VariantContext representing the merge of unsortedVCs - */ - public static VariantContext simpleMerge(final Collection unsortedVCs, - final List priorityListOfVCs, - final FilteredRecordMergeType filteredRecordMergeType, - final GenotypeMergeType genotypeMergeOptions, - final boolean annotateOrigin, - final boolean printMessages, - final String setKey, - final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { - int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); - return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC); - } - - /** - * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. - * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with - * the sample name. - * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use - * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. 
- * - * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ - * - * @param unsortedVCs collection of unsorted VCs - * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs - * @param filteredRecordMergeType merge type for filtered records - * @param genotypeMergeOptions merge option for genotypes - * @param annotateOrigin should we annotate the set it came from? - * @param printMessages should we print messages? - * @param setKey the key name of the set - * @param filteredAreUncalled are filtered records uncalled? - * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @return new VariantContext representing the merge of unsortedVCs - */ - public static VariantContext simpleMerge(final Collection unsortedVCs, - final List priorityListOfVCs, - final int originalNumOfVCs, - final FilteredRecordMergeType filteredRecordMergeType, - final GenotypeMergeType genotypeMergeOptions, - final boolean annotateOrigin, - final boolean printMessages, - final String setKey, - final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC ) { - if ( unsortedVCs == null || unsortedVCs.size() == 0 ) - return null; - - if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) - throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); - - if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) - throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); - - final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); - // Make sure all variant contexts are padded with reference base in case of indels if necessary - List VCs = new ArrayList<>(); - - for 
(final VariantContext vc : preFilteredVCs) { - if ( ! filteredAreUncalled || vc.isNotFiltered() ) - VCs.add(vc); - } - - if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled - return null; - - // establish the baseline info from the first VC - final VariantContext first = VCs.get(0); - final String name = first.getSource(); - final Allele refAllele = determineReferenceAllele(VCs); - - final LinkedHashSet alleles = new LinkedHashSet<>(); - final Set filters = new HashSet<>(); - final Map attributes = new LinkedHashMap<>(); - final Set inconsistentAttributes = new HashSet<>(); - final Set variantSources = new HashSet<>(); // contains the set of sources we found in our set of VCs that are variant - final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id - - VariantContext longestVC = first; - int depth = 0; - int maxAC = -1; - final Map attributesWithMaxAC = new LinkedHashMap<>(); - double log10PError = CommonInfo.NO_LOG10_PERROR; - boolean anyVCHadFiltersApplied = false; - VariantContext vcWithMaxAC = null; - GenotypesContext genotypes = GenotypesContext.create(); - - // counting the number of filtered and variant VCs - int nFiltered = 0; - - boolean remapped = false; - - // cycle through and add info from the other VCs, making sure the loc/reference matches - for ( final VariantContext vc : VCs ) { - if ( longestVC.getStart() != vc.getStart() ) - throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); - - if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) - longestVC = vc; // get the longest location - - nFiltered += vc.isFiltered() ? 
1 : 0; - if ( vc.isVariant() ) variantSources.add(vc.getSource()); - - AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); - remapped = remapped || alleleMapping.needsRemapping(); - - alleles.addAll(alleleMapping.values()); - - mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); - - // We always take the QUAL of the first VC with a non-MISSING qual for the combined value - if ( log10PError == CommonInfo.NO_LOG10_PERROR ) - log10PError = vc.getLog10PError(); - - filters.addAll(vc.getFilters()); - anyVCHadFiltersApplied |= vc.filtersWereApplied(); - - // - // add attributes - // - // special case DP (add it up) and ID (just preserve it) - // - if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) - depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); - if ( vc.hasID() ) rsIDs.add(vc.getID()); - if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { - String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); - // lets see if the string contains a "," separator - if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { - final List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); - for (final String alleleCount : alleleCountArray) { - final int ac = Integer.valueOf(alleleCount.trim()); - if (ac > maxAC) { - maxAC = ac; - vcWithMaxAC = vc; - } - } - } else { - final int ac = Integer.valueOf(rawAlleleCounts); - if (ac > maxAC) { - maxAC = ac; - vcWithMaxAC = vc; - } - } - } - - for (final Map.Entry p : vc.getAttributes().entrySet()) { - final String key = p.getKey(); - final Object value = p.getValue(); - // only output annotations that have the same value in every input VC - // if we don't like the key already, don't go anywhere - if ( ! 
inconsistentAttributes.contains(key) ) { - final boolean alreadyFound = attributes.containsKey(key); - final Object boundValue = attributes.get(key); - final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); - - if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { - // we found the value but we're inconsistent, put it in the exclude list - inconsistentAttributes.add(key); - attributes.remove(key); - } else if ( ! alreadyFound || boundIsMissingValue ) { // no value - attributes.put(key, value); - } - } - } - } - - // if we have more alternate alleles in the merged VC than in one or more of the - // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD - for ( final VariantContext vc : VCs ) { - if (vc.getAlleles().size() == 1) - continue; - if ( hasPLIncompatibleAlleles(alleles, vc.getAlleles())) { - if ( ! genotypes.isEmpty() ) { - logger.debug(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. 
single=%s", - vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles())); - } - genotypes = stripPLsAndAD(genotypes); - // this will remove stale AC,AF attributed from vc - VariantContextUtils.calculateChromosomeCounts(vc, attributes, true); - break; - } - } - - // take the VC with the maxAC and pull the attributes into a modifiable map - if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { - attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); - } - - // if at least one record was unfiltered and we want a union, clear all of the filters - if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) - filters.clear(); - - - if ( annotateOrigin ) { // we care about where the call came from - String setValue; - if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // nothing was unfiltered - setValue = MERGE_INTERSECTION; - else if ( nFiltered == VCs.size() ) // everything was filtered out - setValue = MERGE_FILTER_IN_ALL; - else if ( variantSources.isEmpty() ) // everyone was reference - setValue = MERGE_REF_IN_ALL; - else { - final LinkedHashSet s = new LinkedHashSet<>(); - for ( final VariantContext vc : VCs ) - if ( vc.isVariant() ) - s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); - setValue = Utils.join("-", s); - } - - if ( setKey != null ) { - attributes.put(setKey, setValue); - if( mergeInfoWithMaxAC && vcWithMaxAC != null ) { - attributesWithMaxAC.put(setKey, setValue); - } - } - } - - if ( depth > 0 ) - attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - - final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); - - final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); - builder.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()); - builder.alleles(alleles); - builder.genotypes(genotypes); - builder.log10PError(log10PError); - if ( anyVCHadFiltersApplied ) { - builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); - } - builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); - - // Trim the padded bases of all alleles if necessary - final VariantContext merged = builder.make(); - if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); - return merged; - } - - //TODO as part of a larger refactoring effort remapAlleles can be merged with createAlleleMapping. - - public static GenotypesContext stripPLsAndAD(final GenotypesContext genotypes) { - final GenotypesContext newGs = GenotypesContext.create(genotypes.size()); - - for ( final Genotype g : genotypes ) { - newGs.add(removePLsAndAD(g)); - } - - return newGs; - } - - /** - * Updates the PLs and AD of the Genotypes in the newly selected VariantContext to reflect the fact that some alleles - * from the original VariantContext are no longer present. 
- * - * @param selectedVC the selected (new) VariantContext - * @param originalVC the original VariantContext - * @return a new non-null GenotypesContext - */ - public static GenotypesContext updatePLsAndAD(final VariantContext selectedVC, final VariantContext originalVC) { - final int numNewAlleles = selectedVC.getAlleles().size(); - final int numOriginalAlleles = originalVC.getAlleles().size(); - - // if we have more alternate alleles in the selected VC than in the original VC, then something is wrong - if ( numNewAlleles > numOriginalAlleles ) - throw new IllegalArgumentException("Attempting to fix PLs and AD from what appears to be a *combined* VCF and not a selected one"); - - final GenotypesContext oldGs = selectedVC.getGenotypes(); - - // if we have the same number of alternate alleles in the selected VC as in the original VC, then we don't need to fix anything - if ( numNewAlleles == numOriginalAlleles ) - return oldGs; - - return fixGenotypesFromSubsettedAlleles(oldGs, originalVC, selectedVC.getAlleles()); - } - - /** - * Fix the PLs and ADs for the GenotypesContext of a VariantContext that has been subset - * - * @param originalGs the original GenotypesContext - * @param originalVC the original VariantContext - * @param allelesToUse the new (sub)set of alleles to use - * @return a new non-null GenotypesContext - */ - static private GenotypesContext fixGenotypesFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { - - // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward - final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(originalVC, allelesToUse); - - // create the new genotypes - return createGenotypesWithSubsettedLikelihoods(originalGs, originalVC, allelesToUse, likelihoodIndexesToUse, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES); - } - - /** - * Fix the AD for the GenotypesContext of a VariantContext that has been 
subset - * - * @param originalGs the original GenotypesContext - * @param originalVC the original VariantContext - * @param allelesToUse the new (sub)set of alleles to use - * @return a new non-null GenotypesContext - */ - static private GenotypesContext fixADFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { - - // the bitset representing the allele indexes we want to keep - final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); - - // the new genotypes to create - final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); - - // the samples - final List sampleIndices = originalGs.getSampleNamesOrderedByName(); - - // create the new genotypes - for ( int k = 0; k < originalGs.size(); k++ ) { - final Genotype g = originalGs.get(sampleIndices.get(k)); - newGTs.add(fixAD(g, alleleIndexesToUse, allelesToUse.size())); - } - - return newGTs; - } - - /** - * Fix the AD for the given Genotype - * - * @param genotype the original Genotype - * @param alleleIndexesToUse a bitset describing whether or not to keep a given index - * @param nAllelesToUse how many alleles we are keeping - * @return a non-null Genotype - */ - private static Genotype fixAD(final Genotype genotype, final boolean[] alleleIndexesToUse, final int nAllelesToUse) { - // if it ain't broke don't fix it - if ( !genotype.hasAD() ) - return genotype; - - final GenotypeBuilder builder = new GenotypeBuilder(genotype); - - final int[] oldAD = genotype.getAD(); - if ( oldAD.length != alleleIndexesToUse.length ) { - builder.noAD(); - } else { - final int[] newAD = new int[nAllelesToUse]; - int currentIndex = 0; - for ( int i = 0; i < oldAD.length; i++ ) { - if ( alleleIndexesToUse[i] ) - newAD[currentIndex++] = oldAD[i]; - } - builder.AD(newAD); - } - return builder.make(); - } - - private static Allele determineReferenceAllele(final List VCs) { - return determineReferenceAllele(VCs, null); - } - - 
public static boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) { - return loc == null || loc.getStart() == vc.getStart(); - } - - static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final LinkedHashSet allAlleles) { - if ( refAllele.equals(vc.getReference()) ) - return new AlleleMapper(vc); - else { - final Map map = createAlleleMapping(refAllele, vc, allAlleles); - map.put(vc.getReference(), refAllele); - return new AlleleMapper(map); - } - } - - //TODO as part of a larger refactoring effort {@link #createAlleleMapping} can be merged with {@link ReferenceConfidenceVariantContextMerger#remapAlleles}. - /** - * Create an allele mapping for the given context where its reference allele must (potentially) be extended to the given allele - * - * The refAllele is the longest reference allele seen at this start site. - * So imagine it is: - * refAllele: ACGTGA - * myRef: ACGT - * myAlt: A - * - * We need to remap all of the alleles in vc to include the extra GA so that - * myRef => refAllele and myAlt => AGA - * - * @param refAllele the new (extended) reference allele - * @param oneVC the Variant Context to extend - * @param currentAlleles the list of alleles already created - * @return a non-null mapping of original alleles to new (extended) ones - */ - private static Map createAlleleMapping(final Allele refAllele, - final VariantContext oneVC, - final Collection currentAlleles) { - final Allele myRef = oneVC.getReference(); - if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); - - final byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); - - final Map map = new HashMap<>(); - for ( final Allele a : oneVC.getAlternateAlleles() ) { - if ( isUsableAlternateAllele(a) ) { - Allele extended = Allele.extend(a, extraBases); - for ( final Allele b : currentAlleles ) - if ( 
extended.equals(b) ) - extended = b; - map.put(a, extended); - } - } - - return map; - } - - static private boolean isUsableAlternateAllele(final Allele allele) { - return ! (allele.isReference() || allele.isSymbolic() ); - } - - public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { - if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) - throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); - - if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) - return new ArrayList<>(unsortedVCs); - else { - ArrayList sorted = new ArrayList<>(unsortedVCs); - Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); - return sorted; - } - } - - private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples) { - //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE - for ( final Genotype g : oneVC.getGenotypes() ) { - final String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniquifySamples); - if ( ! mergedGenotypes.containsSample(name) ) { - // only add if the name is new - Genotype newG = g; - - if ( uniquifySamples || alleleMapping.needsRemapping() ) { - final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); - newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); - } - - mergedGenotypes.add(newG); - } - } - } - - /** - * Cached NO_CALL immutable lists where the position ith contains the list with i elements. 
- */ - private static List[] NOCALL_LISTS = new List[] { - Collections.emptyList(), - Collections.singletonList(Allele.NO_CALL), - Collections.nCopies(2,Allele.NO_CALL) - }; - - /** - * Synchronized code to ensure that {@link #NOCALL_LISTS} has enough entries beyod the requested ploidy - * @param capacity the requested ploidy. - */ - private static synchronized void ensureNoCallListsCapacity(final int capacity) { - final int currentCapacity = NOCALL_LISTS.length - 1; - if (currentCapacity >= capacity) - return; - NOCALL_LISTS = Arrays.copyOf(NOCALL_LISTS,Math.max(capacity,currentCapacity << 1) + 1); - for (int i = currentCapacity + 1; i < NOCALL_LISTS.length; i++) - NOCALL_LISTS[i] = Collections.nCopies(i,Allele.NO_CALL); - } - - /** - * Returns a {@link Allele#NO_CALL NO_CALL} allele list provided the ploidy. - * - * @param ploidy the required ploidy. - * - * @return never {@code null}, but an empty list if {@code ploidy} is equal or less than 0. The returned list - * might or might not be mutable. - */ - public static List noCallAlleles(final int ploidy) { - if (NOCALL_LISTS.length <= ploidy) - ensureNoCallListsCapacity(ploidy); - return NOCALL_LISTS[ploidy]; - } - - - /** - * This is just a safe wrapper around GenotypeLikelihoods.calculatePLindex() - * - * @param originalIndex1 the index of the first allele - * @param originalIndex2 the index of the second allele - * @return the PL index - */ - protected static int calculatePLindexFromUnorderedIndexes(final int originalIndex1, final int originalIndex2) { - // we need to make sure they are ordered correctly - return ( originalIndex2 < originalIndex1 ) ? GenotypeLikelihoods.calculatePLindex(originalIndex2, originalIndex1) : GenotypeLikelihoods.calculatePLindex(originalIndex1, originalIndex2); - } - - public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { - return uniquify ? sampleName + "." 
+ trackName : sampleName; - } - - /** - * Trim the alleles in inputVC from the reverse direction - * - * @param inputVC a non-null input VC whose alleles might need a haircut - * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up - */ - public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { - return trimAlleles(inputVC, false, true); - } - - /** - * Trim the alleles in inputVC from the forward direction - * - * @param inputVC a non-null input VC whose alleles might need a haircut - * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up - */ - public static VariantContext forwardTrimAlleles( final VariantContext inputVC ) { - return trimAlleles(inputVC, true, false); - } - - /** - * Trim the alleles in inputVC forward and reverse, as requested - * - * @param inputVC a non-null input VC whose alleles might need a haircut - * @param trimForward should we trim up the alleles from the forward direction? - * @param trimReverse should we trim up the alleles from the reverse direction? - * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles - */ - @Ensures("result != null") - public static VariantContext trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse) { - if ( inputVC == null ) throw new IllegalArgumentException("inputVC cannot be null"); - - if ( inputVC.getNAlleles() <= 1 || inputVC.isSNP() ) - return inputVC; - - // see whether we need to trim common reference base from all alleles - final int revTrim = trimReverse ? computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes()) : 0; - final VariantContext revTrimVC = trimAlleles(inputVC, -1, revTrim); - final int fwdTrim = trimForward ? 
computeForwardClipping(revTrimVC.getAlleles()) : -1; - final VariantContext vc= trimAlleles(revTrimVC, fwdTrim, 0); - return vc; - } - - /** - * Trim up alleles in inputVC, cutting out all bases up to fwdTrimEnd inclusive and - * the last revTrim bases from the end - * - * @param inputVC a non-null input VC - * @param fwdTrimEnd bases up to this index (can be -1) will be removed from the start of all alleles - * @param revTrim the last revTrim bases of each allele will be clipped off as well - * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles - */ - @Requires({"inputVC != null"}) - @Ensures("result != null") - protected static VariantContext trimAlleles(final VariantContext inputVC, - final int fwdTrimEnd, - final int revTrim) { - if( fwdTrimEnd == -1 && revTrim == 0 ) // nothing to do, so just return inputVC unmodified - return inputVC; - - final List alleles = new LinkedList<>(); - final Map originalToTrimmedAlleleMap = new HashMap<>(); - - for (final Allele a : inputVC.getAlleles()) { - if (a.isSymbolic()) { - alleles.add(a); - originalToTrimmedAlleleMap.put(a, a); - } else { - // get bases for current allele and create a new one with trimmed bases - final byte[] newBases = Arrays.copyOfRange(a.getBases(), fwdTrimEnd+1, a.length()-revTrim); - final Allele trimmedAllele = Allele.create(newBases, a.isReference()); - alleles.add(trimmedAllele); - originalToTrimmedAlleleMap.put(a, trimmedAllele); - } - } - - // now we can recreate new genotypes with trimmed alleles - final AlleleMapper alleleMapper = new AlleleMapper(originalToTrimmedAlleleMap); - final GenotypesContext genotypes = updateGenotypesWithMappedAlleles(inputVC.getGenotypes(), alleleMapper); - - final int start = inputVC.getStart() + (fwdTrimEnd + 1); - final VariantContextBuilder builder = new VariantContextBuilder(inputVC); - builder.start(start); - builder.stop(start + alleles.get(0).length() - 1); - builder.alleles(alleles); - builder.genotypes(genotypes); - return 
builder.make(); - } - - @Requires("originalGenotypes != null && alleleMapper != null") - protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) { - final GenotypesContext updatedGenotypes = GenotypesContext.create(originalGenotypes.size()); - - for ( final Genotype genotype : originalGenotypes ) { - final List updatedAlleles = alleleMapper.remap(genotype.getAlleles()); - updatedGenotypes.add(new GenotypeBuilder(genotype).alleles(updatedAlleles).make()); - } - - return updatedGenotypes; - } - - public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref) { - int clipping = 0; - boolean stillClipping = true; - - while ( stillClipping ) { - for ( final Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) - continue; - - // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong - // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). - if ( a.length() - clipping == 0 ) - return clipping - 1; - - if ( a.length() - clipping <= 0 || a.length() == 0 ) { - stillClipping = false; - } - else if ( ref.length == clipping ) { - return -1; - } - else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { - stillClipping = false; - } - } - if ( stillClipping ) - clipping++; - } - - return clipping; - } - - /** - * Clip out any unnecessary bases off the front of the alleles - * - * The VCF spec represents alleles as block substitutions, replacing AC with A for a - * 1 bp deletion of the C. However, it's possible that we'd end up with alleles that - * contain extra bases on the left, such as GAC/GA to represent the same 1 bp deletion. - * This routine finds an offset among all alleles that can be safely trimmed - * off the left of each allele and still represent the same block substitution. 
- * - * A/C => A/C - * AC/A => AC/A - * ACC/AC => CC/C - * AGT/CAT => AGT/CAT - * /C => /C - * - * @param unclippedAlleles a non-null list of alleles that we want to clip - * @return the offset into the alleles where we can safely clip, inclusive, or - * -1 if no clipping is tolerated. So, if the result is 0, then we can remove - * the first base of every allele. If the result is 1, we can remove the - * second base. - */ - public static int computeForwardClipping(final List unclippedAlleles) { - // cannot clip unless there's at least 1 alt allele - if ( unclippedAlleles.size() <= 1 ) - return -1; - - // we cannot forward clip any set of alleles containing a symbolic allele - int minAlleleLength = Integer.MAX_VALUE; - for ( final Allele a : unclippedAlleles ) { - if ( a.isSymbolic() ) - return -1; - minAlleleLength = Math.min(minAlleleLength, a.length()); - } - - final byte[] firstAlleleBases = unclippedAlleles.get(0).getBases(); - int indexOflastSharedBase = -1; - - // the -1 to the stop is that we can never clip off the right most base - for ( int i = 0; i < minAlleleLength - 1; i++) { - final byte base = firstAlleleBases[i]; - - for ( final Allele allele : unclippedAlleles ) { - if ( allele.getBases()[i] != base ) - return indexOflastSharedBase; - } - - indexOflastSharedBase = i; - } - - return indexOflastSharedBase; - } - - public static double computeHardyWeinbergPvalue(VariantContext vc) { - if ( vc.getCalledChrCount() == 0 ) - return 0.0; - return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); - } - - public static boolean requiresPaddingBase(final List alleles) { - - // see whether one of the alleles would be null if trimmed through - - for ( final String allele : alleles ) { - if ( allele.isEmpty() ) - return true; - } - - int clipping = 0; - Character currentBase = null; - - while ( true ) { - for ( final String allele : alleles ) { - if ( allele.length() - clipping == 0 ) - return true; - - char myBase = 
allele.charAt(clipping); - if ( currentBase == null ) - currentBase = myBase; - else if ( currentBase != myBase ) - return false; - } - - clipping++; - currentBase = null; - } - } - - private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { - Map attributes = new HashMap<>(keysToPreserve.size()); - for ( final String key : keysToPreserve ) { - if ( igc.hasAttribute(key) ) - attributes.put(key, igc.getAttribute(key)); - } - return attributes; - } - - /** - * @deprecated use variant context builder version instead - * @param vc the variant context - * @param keysToPreserve the keys to preserve - * @return a pruned version of the original variant context - */ - @Deprecated - public static VariantContext pruneVariantContext(final VariantContext vc, Collection keysToPreserve ) { - return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make(); - } - - public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection keysToPreserve ) { - final VariantContext vc = builder.make(); - if ( keysToPreserve == null ) keysToPreserve = Collections.emptyList(); - - // VC info - final Map attributes = subsetAttributes(vc.getCommonInfo(), keysToPreserve); - - // Genotypes - final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype g : vc.getGenotypes() ) { - final GenotypeBuilder gb = new GenotypeBuilder(g); - // remove AD, DP, PL, and all extended attributes, keeping just GT and GQ - gb.noAD().noDP().noPL().noAttributes(); - genotypes.add(gb.make()); - } - - return builder.genotypes(genotypes).attributes(attributes); - } - - public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { - // if all alleles of vc1 are a contained in alleles of vc2, return true - if (!vc1.getReference().equals(vc2.getReference())) - return false; - - for (final Allele a :vc1.getAlternateAlleles()) { - if (!vc2.getAlternateAlleles().contains(a)) 
- return false; - } - - return true; - } - - public static Map> separateVariantContextsByType( final Collection VCs ) { - if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null."); } - - final HashMap> mappedVCs = new HashMap<>(); - for ( final VariantContext vc : VCs ) { - VariantContext.Type vcType = vc.getType(); - - // look at previous variant contexts of different type. If: - // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list - // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) - // c) neither: do nothing, just add vc to its own list - boolean addtoOwnList = true; - for (final VariantContext.Type type : VariantContext.Type.values()) { - if (type.equals(vcType)) - continue; - - if (!mappedVCs.containsKey(type)) - continue; - - List vcList = mappedVCs.get(type); - for (int k=0; k < vcList.size(); k++) { - VariantContext otherVC = vcList.get(k); - if (allelesAreSubset(otherVC,vc)) { - // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list - vcList.remove(k); - // avoid having empty lists - if (vcList.size() == 0) - mappedVCs.remove(type); - if ( !mappedVCs.containsKey(vcType) ) - mappedVCs.put(vcType, new ArrayList()); - mappedVCs.get(vcType).add(otherVC); - break; - } - else if (allelesAreSubset(vc,otherVC)) { - // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own - mappedVCs.get(type).add(vc); - addtoOwnList = false; - break; - } - } - } - if (addtoOwnList) { - if ( !mappedVCs.containsKey(vcType) ) - mappedVCs.put(vcType, new ArrayList()); - mappedVCs.get(vcType).add(vc); - } - } - - return mappedVCs; - } - - public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { - if ( 
allowedAttributes == null ) - return vc; - - final GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype genotype : vc.getGenotypes() ) { - final Map attrs = new HashMap<>(); - for ( final Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { - if ( allowedAttributes.contains(attr.getKey()) ) - attrs.put(attr.getKey(), attr.getValue()); - } - newGenotypes.add(new GenotypeBuilder(genotype).attributes(attrs).make()); - } - - return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); - } - - protected static class AlleleMapper { - private VariantContext vc = null; - private Map map = null; - public AlleleMapper(VariantContext vc) { this.vc = vc; } - public AlleleMapper(Map map) { this.map = map; } - public boolean needsRemapping() { return this.map != null; } - public Collection values() { return map != null ? map.values() : vc.getAlleles(); } - public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } - - public List remap(List as) { - List newAs = new ArrayList<>(); - for ( final Allele a : as ) { - //System.out.printf(" Remapping %s => %s%n", a, remap(a)); - newAs.add(remap(a)); - } - return newAs; - } - - /** - * @return the list of unique values - */ - public List getUniqueMappedAlleles() { - if ( map == null ) - return Collections.emptyList(); - return new ArrayList<>(new HashSet<>(map.values())); - } - } - - private static class CompareByPriority implements Comparator, Serializable { - List priorityListOfVCs; - public CompareByPriority(List priorityListOfVCs) { - this.priorityListOfVCs = priorityListOfVCs; - } - - private int getIndex(VariantContext vc) { - int i = priorityListOfVCs.indexOf(vc.getSource()); - if ( i == -1 ) throw new IllegalArgumentException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); - return i; - } - - public int compare(VariantContext vc1, VariantContext vc2) { - return 
Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); - } - } - - /** - * For testing purposes only. Create a site-only VariantContext at contig:start containing alleles - * - * @param name the name of the VC - * @param contig the contig for the VC - * @param start the start of the VC - * @param alleleStrings a non-null, non-empty list of strings for the alleles. The first will be the ref allele, and others the - * alt. Will compute the stop of the VC from the length of the reference allele - * @return a non-null VariantContext - */ - public static VariantContext makeFromAlleles(final String name, final String contig, final int start, final List alleleStrings) { - if ( alleleStrings == null || alleleStrings.isEmpty() ) - throw new IllegalArgumentException("alleleStrings must be non-empty, non-null list"); - - final List alleles = new LinkedList<>(); - final int length = alleleStrings.get(0).length(); - - boolean first = true; - for ( final String alleleString : alleleStrings ) { - alleles.add(Allele.create(alleleString, first)); - first = false; - } - return new VariantContextBuilder(name, contig, start, start+length-1, alleles).make(); - } - - /** - * Splits the alleles for the provided variant context into its primitive parts. - * Requires that the input VC be bi-allelic, so calling methods should first call splitVariantContextToBiallelics() if needed. - * Currently works only for MNPs. 
- * - * @param vc the non-null VC to split - * @return a non-empty list of VCs split into primitive parts or the original VC otherwise - */ - public static List splitIntoPrimitiveAlleles(final VariantContext vc) { - if ( vc == null ) - throw new IllegalArgumentException("Trying to break a null Variant Context into primitive parts"); - - if ( !vc.isBiallelic() ) - throw new IllegalArgumentException("Trying to break a multi-allelic Variant Context into primitive parts"); - - // currently only works for MNPs - if ( !vc.isMNP() ) - return Arrays.asList(vc); - - final byte[] ref = vc.getReference().getBases(); - final byte[] alt = vc.getAlternateAllele(0).getBases(); - - if ( ref.length != alt.length ) - throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); - - final List result = new ArrayList<>(ref.length); - - for ( int i = 0; i < ref.length; i++ ) { - - // if the ref and alt bases are different at a given position, create a new SNP record (otherwise do nothing) - if ( ref[i] != alt[i] ) { - - // create the ref and alt SNP alleles - final Allele newRefAllele = Allele.create(ref[i], true); - final Allele newAltAllele = Allele.create(alt[i], false); - - // create a new VariantContext with the new SNP alleles - final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); - - // create new genotypes with updated alleles - final Map alleleMap = new HashMap<>(); - alleleMap.put(vc.getReference(), newRefAllele); - alleleMap.put(vc.getAlternateAllele(0), newAltAllele); - final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); - - result.add(newVC.genotypes(newGenotypes).make()); - } - } - - if ( result.isEmpty() ) - result.add(vc); - - return result; - } - - /** - * Are vc1 and 2 equal including their position and alleles? 
- * @param vc1 non-null VariantContext - * @param vc2 non-null VariantContext - * @return true if vc1 and vc2 are equal, false otherwise - */ - public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) { - if ( vc1 == null ) throw new IllegalArgumentException("vc1 cannot be null"); - if ( vc2 == null ) throw new IllegalArgumentException("vc2 cannot be null"); - - if ( vc1.getStart() != vc2.getStart() ) return false; - if ( vc1.getEnd() != vc2.getEnd() ) return false; - if ( ! vc1.getChr().equals(vc2.getChr())) return false; - if ( ! vc1.getAlleles().equals(vc2.getAlleles()) ) return false; - return true; - } - - /** - * Returns the absolute 0-based index of an allele. - * - *

- * If the allele is equal to the reference, the result is 0, if it equal to the first alternative the result is 1 - * and so forth. - *

- * Therefore if you want the 0-based index within the alternative alleles you need to do the following: - * - *

- * You can indicate whether the Java object reference comparator {@code ==} can be safelly used by setting {@code useEquals} to {@code false}. - * - * @param vc the target variant context. - * @param allele the target allele. - * @param ignoreRefState whether the reference states of the allele is important at all. Has no effect if {@code useEquals} is {@code false}. - * @param considerRefAllele whether the reference allele should be considered. You should set it to {@code false} if you are only interested in alternative alleles. - * @param useEquals whether equal method should be used in the search: {@link Allele#equals(Allele,boolean)}. - * - * @throws IllegalArgumentException if {@code allele} is {@code null}. - * @return {@code -1} if there is no such allele that satify those criteria, a value between 0 and {@link VariantContext#getNAlleles()} {@code -1} otherwise. - */ - public static int indexOfAllele(final VariantContext vc, final Allele allele, final boolean ignoreRefState, final boolean considerRefAllele, final boolean useEquals) { - if (allele == null) throw new IllegalArgumentException(); - return useEquals ? indexOfEqualAllele(vc,allele,ignoreRefState,considerRefAllele) : indexOfSameAllele(vc,allele,considerRefAllele); - } - - /** - * Returns the relative 0-based index of an alternative allele. - *

- * The the query allele is the same as the first alternative allele, the result is 0, - * if it is equal to the second 1 and so forth. - * - * - *

- * Notice that the ref-status of the query {@code allele} is ignored. - * - * @param vc the target variant context. - * @param allele the query allele. - * @param useEquals whether equal method should be used in the search: {@link Allele#equals(Allele,boolean)}. - * - * @throws IllegalArgumentException if {@code allele} is {@code null}. - * - * @return {@code -1} if there is no such allele that satify those criteria, a value between 0 and the number - * of alternative alleles - 1. - */ - public static int indexOfAltAllele(final VariantContext vc, final Allele allele, final boolean useEquals) { - final int absoluteIndex = indexOfAllele(vc,allele,true,false,useEquals); - return absoluteIndex == -1 ? -1 : absoluteIndex - 1; - } - - // Impements index search using equals. - private static int indexOfEqualAllele(final VariantContext vc, final Allele allele, final boolean ignoreRefState, - final boolean considerRefAllele) { - int i = 0; - for (final Allele a : vc.getAlleles()) - if (a.equals(allele,ignoreRefState)) - return i == 0 ? (considerRefAllele ? 0 : -1) : i; - else - i++; - return -1; - } - - // Implements index search using ==. - private static int indexOfSameAllele(final VariantContext vc, final Allele allele, final boolean considerRefAllele) { - int i = 0; - - for (final Allele a : vc.getAlleles()) - if (a == allele) - return i == 0 ? (considerRefAllele ? 
0 : -1) : i; - else - i++; - - return -1; - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/EngineFeaturesIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/EngineFeaturesIntegrationTest.java deleted file mode 100644 index 6596cf324..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/EngineFeaturesIntegrationTest.java +++ /dev/null @@ -1,736 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine; - -import htsjdk.samtools.*; -import htsjdk.samtools.util.CloseableIterator; -import htsjdk.tribble.readers.LineIterator; -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.commandline.*; -import org.broadinstitute.gatk.engine.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.reference.ReferenceDataSource; -import org.broadinstitute.gatk.engine.filters.MappingQualityUnavailableFilter; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrack; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.walkers.ReadFilters; -import org.broadinstitute.gatk.engine.walkers.ReadWalker; -import org.broadinstitute.gatk.engine.walkers.RodWalker; -import org.broadinstitute.gatk.tools.walkers.qc.ErrorThrowing; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.broadinstitute.gatk.utils.sam.GATKSamRecordFactory; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.vcf.VCFCodec; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.*; -import java.util.*; - 
-/** - * - */ -public class EngineFeaturesIntegrationTest extends WalkerTest { - private void testBadRODBindingInput(String type, String name, Class c) { - WalkerTestSpec spec = new WalkerTestSpec("-T SelectVariants -L 1:1 --variant:variant," + type + " " - + b37dbSNP132 + " -R " + b37KGReference + " -o %s", - 1, c); - executeTest(name, spec); - } - - @Test() private void testBadRODBindingInputType1() { - testBadRODBindingInput("beagle", "BEAGLE input to VCF expecting walker", UserException.BadArgumentValue.class); - } - - @Test() private void testBadRODBindingInputType3() { - testBadRODBindingInput("bed", "Bed input to VCF expecting walker", UserException.BadArgumentValue.class); - } - - @Test() private void testBadRODBindingInputTypeUnknownType() { - testBadRODBindingInput("bedXXX", "Unknown input to VCF expecting walker", UserException.UnknownTribbleType.class); - } - - private void testMissingFile(String name, String missingBinding) { - WalkerTestSpec spec = new WalkerTestSpec(missingBinding + " -R " + b37KGReference + " -o %s", - 1, UserException.CouldNotReadInputFile.class); - executeTest(name, spec); - } - - @Test() private void testMissingBAMnt1() { - testMissingFile("missing BAM", "-T PrintReads -I missing.bam -nt 1"); - } - @Test() private void testMissingBAMnt4() { - testMissingFile("missing BAM", "-T PrintReads -I missing.bam -nt 4"); - } - @Test() private void testMissingVCF() { - testMissingFile("missing VCF", "-T SelectVariants -V missing.vcf"); - } - @Test() private void testMissingInterval() { - testMissingFile("missing interval", "-T PrintReads -L missing.interval_list -I " + b37GoodBAM); - } - - - // -------------------------------------------------------------------------------- - // - // Test that our exceptions are coming back as we expect - // - // -------------------------------------------------------------------------------- - - private class EngineErrorHandlingTestProvider extends TestDataProvider { - final Class expectedException; - 
final String args; - final int iterationsToTest; - - public EngineErrorHandlingTestProvider(Class exceptedException, final String args) { - super(EngineErrorHandlingTestProvider.class); - this.expectedException = exceptedException; - this.args = args; - this.iterationsToTest = args.equals("") ? 1 : 10; - setName(String.format("Engine error handling: expected %s with args %s", exceptedException, args)); - } - } - - @DataProvider(name = "EngineErrorHandlingTestProvider") - public Object[][] makeEngineErrorHandlingTestProvider() { - for ( final ErrorThrowing.FailMethod failMethod : ErrorThrowing.FailMethod.values() ) { - if ( failMethod == ErrorThrowing.FailMethod.TREE_REDUCE ) - continue; // cannot reliably throw errors in TREE_REDUCE - - final String failArg = " -fail " + failMethod.name(); - for ( final String args : Arrays.asList("", " -nt 2", " -nct 2") ) { - new EngineErrorHandlingTestProvider(NullPointerException.class, failArg + args); - new EngineErrorHandlingTestProvider(UserException.class, failArg + args); - new EngineErrorHandlingTestProvider(ReviewedGATKException.class, failArg + args); - } - } - - return EngineErrorHandlingTestProvider.getTests(EngineErrorHandlingTestProvider.class); - } - - // - // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type - // - @Test(enabled = true, dataProvider = "EngineErrorHandlingTestProvider", timeOut = 60 * 1000 ) - public void testEngineErrorHandlingTestProvider(final EngineErrorHandlingTestProvider cfg) { - for ( int i = 0; i < cfg.iterationsToTest; i++ ) { - final String root = "-T ErrorThrowing -R " + exampleFASTA; - final String args = root + cfg.args + " -E " + cfg.expectedException.getSimpleName(); - WalkerTestSpec spec = new WalkerTestSpec(args, 0, cfg.expectedException); - - executeTest(cfg.toString(), spec); - } - } - - // -------------------------------------------------------------------------------- - // - // Test that read filters are being applied 
in the order we expect - // - // -------------------------------------------------------------------------------- - - @ReadFilters({MappingQualityUnavailableFilter.class}) - public static class DummyReadWalkerWithMapqUnavailableFilter extends ReadWalker { - @Output - PrintStream out; - - @Override - public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - return 1; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - @Override - public void onTraversalDone(Integer result) { - out.println(result); - } - } - - @Test(enabled = true) - public void testUserReadFilterAppliedBeforeWalker() { - WalkerTestSpec spec = new WalkerTestSpec("-R " + b37KGReference + " -I " + privateTestDir + "allMAPQ255.bam" - + " -T DummyReadWalkerWithMapqUnavailableFilter -o %s -L MT -rf ReassignMappingQuality", - 1, Arrays.asList("ecf27a776cdfc771defab1c5d19de9ab")); - executeTest("testUserReadFilterAppliedBeforeWalker", spec); - } - - @Test - public void testNegativeCompress() { - testBadCompressArgument(-1); - } - - @Test - public void testTooBigCompress() { - testBadCompressArgument(100); - } - - private void testBadCompressArgument(final int compress) { - WalkerTestSpec spec = new WalkerTestSpec("-T PrintReads -R " + b37KGReference + " -I " + privateTestDir + "NA12878.1_10mb_2_10mb.bam -o %s -compress " + compress, - 1, UserException.class); - executeTest("badCompress " + compress, spec); - } - - // -------------------------------------------------------------------------------- - // - // Test that the VCF version key is what we expect - // - // -------------------------------------------------------------------------------- - @Test(enabled = true) - public void testGATKVersionInVCF() throws Exception { - WalkerTestSpec spec = new WalkerTestSpec("-T SelectVariants -R " + b37KGReference + - " -V " + privateTestDir + 
"NA12878.WGS.b37.chr20.firstMB.vcf" - + " -o %s -L 20:61098", - 1, Arrays.asList("")); - spec.disableShadowBCF(); - final File vcf = executeTest("testGATKVersionInVCF", spec).first.get(0); - final VCFCodec codec = new VCFCodec(); - final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf))); - final VCFHeaderLine versionLine = header.getMetaDataLine(GATKVCFUtils.GATK_COMMAND_LINE_KEY); - Assert.assertNotNull(versionLine); - Assert.assertTrue(versionLine.toString().contains("SelectVariants")); - } - - @Test(enabled = true) - public void testMultipleGATKVersionsInVCF() throws Exception { - WalkerTestSpec spec = new WalkerTestSpec("-T SelectVariants -R " + b37KGReference + - " -V " + privateTestDir + "gatkCommandLineInHeader.vcf" - + " -o %s", - 1, Arrays.asList("")); - spec.disableShadowBCF(); - final File vcf = executeTest("testMultipleGATKVersionsInVCF", spec).first.get(0); - final VCFCodec codec = new VCFCodec(); - final VCFHeader header = (VCFHeader) codec.readActualHeader(codec.makeSourceFromStream(new FileInputStream(vcf))); - - boolean foundHC = false; - boolean foundSV = false; - for ( final VCFHeaderLine line : header.getMetaDataInInputOrder() ) { - if ( line.getKey().equals(GATKVCFUtils.GATK_COMMAND_LINE_KEY) ) { - if ( line.toString().contains("HaplotypeCaller") ) { - Assert.assertFalse(foundHC); - foundHC = true; - } - if ( line.toString().contains("SelectVariants") ) { - Assert.assertFalse(foundSV); - foundSV = true; - } - } - } - - Assert.assertTrue(foundHC, "Didn't find HaplotypeCaller command line header field"); - Assert.assertTrue(foundSV, "Didn't find SelectVariants command line header field"); - } - - // -------------------------------------------------------------------------------- - // - // Test that defaultBaseQualities actually works - // - // -------------------------------------------------------------------------------- - - public WalkerTestSpec testDefaultBaseQualities(final Integer 
value, final String md5) { - return new WalkerTestSpec("-T PrintReads -R " + b37KGReference + " -I " + privateTestDir + "/baseQualitiesToFix.bam -o %s" - + (value != null ? " --defaultBaseQualities " + value : ""), - 1, Arrays.asList(md5)); - } - - @Test() - public void testDefaultBaseQualities20() { - executeTest("testDefaultBaseQualities20", testDefaultBaseQualities(20, "7d254a9d0ec59c66ee3e137f56f4c78f")); - } - - @Test() - public void testDefaultBaseQualities30() { - executeTest("testDefaultBaseQualities30", testDefaultBaseQualities(30, "0f50def6cbbbd8ccd4739e2b3998e503")); - } - - @Test(expectedExceptions = Exception.class) - public void testDefaultBaseQualitiesNoneProvided() { - executeTest("testDefaultBaseQualitiesNoneProvided", testDefaultBaseQualities(null, "")); - } - - // -------------------------------------------------------------------------------- - // - // Test engine-level cigar consolidation - // - // -------------------------------------------------------------------------------- - - @Test - public void testGATKEngineConsolidatesCigars() { - final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + - " -R " + b37KGReference + - " -I " + privateTestDir + "zero_length_cigar_elements.bam" + - " -o %s", - 1, Arrays.asList("")); // No MD5s; we only want to check the cigar - - final File outputBam = executeTest("testGATKEngineConsolidatesCigars", spec).first.get(0); - final SAMFileReader reader = new SAMFileReader(outputBam); - reader.setValidationStringency(ValidationStringency.SILENT); - reader.setSAMRecordFactory(new GATKSamRecordFactory()); - - final SAMRecord read = reader.iterator().next(); - reader.close(); - - // Original cigar was 0M3M0M8M. 
Check that it's been consolidated after running through the GATK engine: - Assert.assertEquals(read.getCigarString(), "11M", "Cigar 0M3M0M8M not consolidated correctly by the engine"); - } - - // -------------------------------------------------------------------------------- - // - // Test on-the-fly sample renaming - // - // -------------------------------------------------------------------------------- - - // On-the-fly sample renaming test case: one single-sample bam with multiple read groups - @Test - public void testOnTheFlySampleRenamingWithSingleBamFile() throws IOException { - final File sampleRenameMapFile = createTestSampleRenameMapFile( - Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam myNewSampleName")); - - final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + - " -R " + b37KGReference + - " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + - " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + - " -o %s", - 1, Arrays.asList("")); // No MD5s; we only want to check the read groups - - final File outputBam = executeTest("testOnTheFlySampleRenamingWithSingleBamFile", spec).first.get(0); - final SAMFileReader reader = new SAMFileReader(outputBam); - - for ( final SAMReadGroupRecord readGroup : reader.getFileHeader().getReadGroups() ) { - Assert.assertEquals(readGroup.getSample(), "myNewSampleName", String.format("Sample for read group %s not renamed correctly", readGroup.getId())); - } - - reader.close(); - } - - // On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam - @Test - public void testOnTheFlySampleRenamingWithMultipleBamFiles() throws IOException { - final File sampleRenameMapFile = createTestSampleRenameMapFile( - Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", - privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam newSampleFor12891", - privateTestDir + 
"CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892")); - - final Map readGroupToNewSampleMap = new HashMap<>(); - for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { - final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); - final SAMFileReader inputBamReader = new SAMFileReader(inputBam); - final String newSampleName = String.format("newSampleFor%s", inputBamID); - for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { - readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); - } - inputBamReader.close(); - } - - final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + - " -R " + b37KGReference + - " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + - " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" + - " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" + - " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + - " -o %s", - 1, Arrays.asList("")); // No MD5s; we only want to check the read groups - - final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFiles", spec).first.get(0); - final SAMFileReader outputBamReader = new SAMFileReader(outputBam); - - int totalReadGroupsSeen = 0; - for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { - Assert.assertEquals(readGroup.getSample(), readGroupToNewSampleMap.get(readGroup.getId()), - String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); - totalReadGroupsSeen++; - } - - Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); - - outputBamReader.close(); - } - - // On-the-fly sample renaming test case: three single-sample bams with multiple read groups per bam, - // performing renaming in only SOME of 
the bams - @Test - public void testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename() throws IOException { - // Rename samples for NA12878 and NA12892, but not for NA12891 - final File sampleRenameMapFile = createTestSampleRenameMapFile( - Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", - privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam newSampleFor12892")); - - final Map readGroupToNewSampleMap = new HashMap<>(); - for ( String inputBamID : Arrays.asList("12878", "12891", "12892") ) { - final File inputBam = new File(privateTestDir + String.format("CEUTrio.HiSeq.WGS.b37.NA%s.HEADERONLY.bam", inputBamID)); - final SAMFileReader inputBamReader = new SAMFileReader(inputBam); - - // Special-case NA12891, which we're not renaming: - final String newSampleName = inputBamID.equals("12891") ? "NA12891" : String.format("newSampleFor%s", inputBamID); - - for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { - readGroupToNewSampleMap.put(readGroup.getId(), newSampleName); - } - inputBamReader.close(); - } - - final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + - " -R " + b37KGReference + - " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + - " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12891.HEADERONLY.bam" + - " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12892.HEADERONLY.bam" + - " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + - " -o %s", - 1, Arrays.asList("")); // No MD5s; we only want to check the read groups - - final File outputBam = executeTest("testOnTheFlySampleRenamingWithMultipleBamFilesPartialRename", spec).first.get(0); - final SAMFileReader outputBamReader = new SAMFileReader(outputBam); - - int totalReadGroupsSeen = 0; - for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { - Assert.assertEquals(readGroup.getSample(), 
readGroupToNewSampleMap.get(readGroup.getId()), - String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); - totalReadGroupsSeen++; - } - - Assert.assertEquals(totalReadGroupsSeen, readGroupToNewSampleMap.size(), "Wrong number of read groups encountered in output bam file"); - - outputBamReader.close(); - } - - // On-the-fly sample renaming test case: two single-sample bams with read group collisions - @Test - public void testOnTheFlySampleRenamingWithReadGroupCollisions() throws IOException { - final File sampleRenameMapFile = createTestSampleRenameMapFile( - Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam newSampleFor12878", - privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam newSampleForNot12878")); - - final Set na12878ReadGroups = new HashSet<>(); - final SAMFileReader inputBamReader = new SAMFileReader(new File(privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam")); - for ( final SAMReadGroupRecord readGroup : inputBamReader.getFileHeader().getReadGroups() ) { - na12878ReadGroups.add(readGroup.getId()); - } - inputBamReader.close(); - - final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + - " -R " + b37KGReference + - " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.HEADERONLY.bam" + - " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.READ_GROUP_COLLISIONS_WITH_NA12878.HEADERONLY.bam" + - " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + - " -o %s", - 1, Arrays.asList("")); // No MD5s; we only want to check the read groups - - final File outputBam = executeTest("testOnTheFlySampleRenamingWithReadGroupCollisions", spec).first.get(0); - final SAMFileReader outputBamReader = new SAMFileReader(outputBam); - - int totalReadGroupsSeen = 0; - for ( final SAMReadGroupRecord readGroup : outputBamReader.getFileHeader().getReadGroups() ) { - String expectedSampleName = ""; - if ( 
na12878ReadGroups.contains(readGroup.getId()) ) { - expectedSampleName = "newSampleFor12878"; - } - else { - expectedSampleName = "newSampleForNot12878"; - } - - Assert.assertEquals(readGroup.getSample(), expectedSampleName, - String.format("Wrong sample for read group %s after on-the-fly renaming", readGroup.getId())); - totalReadGroupsSeen++; - } - - Assert.assertEquals(totalReadGroupsSeen, na12878ReadGroups.size() * 2, "Wrong number of read groups encountered in output bam file"); - - outputBamReader.close(); - } - - // On-the-fly sample renaming test case: a multi-sample bam (this should generate a UserException) - @Test - public void testOnTheFlySampleRenamingWithMultiSampleBam() throws IOException { - final File sampleRenameMapFile = createTestSampleRenameMapFile( - Arrays.asList(privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam myNewSampleName")); - - final WalkerTestSpec spec = new WalkerTestSpec(" -T PrintReads" + - " -R " + b37KGReference + - " -I " + privateTestDir + "CEUTrio.HiSeq.WGS.b37.MERGED.HEADERONLY.bam" + - " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + - " -o %s", - 1, - UserException.class); // expecting a UserException here - - executeTest("testOnTheFlySampleRenamingWithMultiSampleBam", spec); - } - - // On-the-fly sample renaming test case: ensure that walkers can see the remapped sample names in individual reads - @Test - public void testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads() throws IOException { - final File sampleRenameMapFile = createTestSampleRenameMapFile( - Arrays.asList(privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam myNewSampleName")); - - final WalkerTestSpec spec = new WalkerTestSpec(" -T OnTheFlySampleRenamingVerifyingTestWalker" + - " -R " + b37KGReference + - " -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam" + - " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + - " --newSampleName myNewSampleName" + - " -L 
20:10000000-10001000", - 1, Arrays.asList("")); - - // Test is a success if our custom walker doesn't throw an exception - executeTest("testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInReads", spec); - } - - @Test - public void testOnTheFlySampleRenamingSingleSampleVCF() throws IOException { - final File sampleRenameMapFile = createTestSampleRenameMapFile( - Arrays.asList(privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf newSampleForNA12878")); - - final WalkerTestSpec spec = new WalkerTestSpec(" -T CombineVariants" + - " -R " + b37KGReference + - " -V " + privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf" + - " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + - " -o %s", - 1, - Arrays.asList("")); // No MD5s -- we will inspect the output file manually - - final File outputVCF = executeTest("testOnTheFlySampleRenamingSingleSampleVCF", spec).first.get(0); - verifySampleRenaming(outputVCF, "newSampleForNA12878"); - } - - private void verifySampleRenaming( final File outputVCF, final String newSampleName ) throws IOException { - final Pair> headerAndVCIter = GATKVCFUtils.readAllVCs(outputVCF, new VCFCodec()); - final VCFHeader header = headerAndVCIter.getFirst(); - final GATKVCFUtils.VCIterable iter = headerAndVCIter.getSecond(); - - // Verify that sample renaming occurred at both the header and record levels (checking only the first 10 records): - - Assert.assertEquals(header.getGenotypeSamples().size(), 1, "Wrong number of samples in output vcf header"); - Assert.assertEquals(header.getGenotypeSamples().get(0), newSampleName, "Wrong sample name in output vcf header"); - - int recordCount = 0; - while ( iter.hasNext() && recordCount < 10 ) { - final VariantContext vcfRecord = iter.next(); - Assert.assertEquals(vcfRecord.getSampleNames().size(), 1, "Wrong number of samples in output vcf record"); - Assert.assertEquals(vcfRecord.getSampleNames().iterator().next(), newSampleName, "Wrong sample name in output vcf record"); - 
recordCount++; - } - } - - @Test - public void testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInVCFRecords() throws Exception { - final File sampleRenameMapFile = createTestSampleRenameMapFile( - Arrays.asList(privateTestDir + "samplerenametest_single_sample_gvcf.vcf FOOSAMPLE")); - - final WalkerTestSpec spec = new WalkerTestSpec(" -T OnTheFlySampleRenamingVerifyingRodWalker" + - " -R " + hg19Reference + - " -V " + privateTestDir + "samplerenametest_single_sample_gvcf.vcf" + - " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + - " --expectedSampleName FOOSAMPLE" + - " -o %s", - 1, - Arrays.asList("")); // No MD5s -- custom walker will throw an exception if there's a problem - - executeTest("testOnTheFlySampleRenamingVerifyWalkerSeesNewSamplesInVCFRecords", spec); - } - - @Test - public void testOnTheFlySampleRenamingMultiSampleVCF() throws Exception { - final File sampleRenameMapFile = createTestSampleRenameMapFile( - Arrays.asList(privateTestDir + "vcf/vcfWithGenotypes.vcf badSample")); - - final WalkerTestSpec spec = new WalkerTestSpec(" -T CombineVariants" + - " -R " + b37KGReference + - " -V " + privateTestDir + "vcf/vcfWithGenotypes.vcf" + - " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + - " -o %s", - 1, - UserException.class); // expecting a UserException here - - executeTest("testOnTheFlySampleRenamingMultiSampleVCF", spec); - } - - @Test - public void testOnTheFlySampleRenamingSitesOnlyVCF() throws Exception { - final File sampleRenameMapFile = createTestSampleRenameMapFile( - Arrays.asList(privateTestDir + "vcf/vcfWithoutGenotypes.vcf badSample")); - - final WalkerTestSpec spec = new WalkerTestSpec(" -T CombineVariants" + - " -R " + b37KGReference + - " -V " + privateTestDir + "vcf/vcfWithoutGenotypes.vcf" + - " --sample_rename_mapping_file " + sampleRenameMapFile.getAbsolutePath() + - " -o %s", - 1, - UserException.class); // expecting a UserException here - - 
executeTest("testOnTheFlySampleRenamingSitesOnlyVCF", spec); - } - - private File createTestSampleRenameMapFile( final List contents ) throws IOException { - final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); - final PrintWriter writer = new PrintWriter(mapFile); - - for ( final String line : contents ) { - writer.println(line); - } - writer.close(); - - return mapFile; - } - - public static class OnTheFlySampleRenamingVerifyingTestWalker extends ReadWalker { - @Argument(fullName = "newSampleName", shortName = "newSampleName", doc = "", required = true) - String newSampleName = null; - - public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - if ( ! newSampleName.equals(read.getReadGroup().getSample()) ) { - throw new IllegalStateException(String.format("Encountered read with the wrong sample name. Expected %s found %s", - newSampleName, read.getReadGroup().getSample())); - } - - return 1; - } - - public Integer reduceInit() { return 0; } - public Integer reduce(Integer value, Integer sum) { return value + sum; } - } - - public static class OnTheFlySampleRenamingVerifyingRodWalker extends RodWalker { - @Argument(fullName = "expectedSampleName", shortName = "expectedSampleName", doc = "", required = true) - String expectedSampleName = null; - - @Output - PrintStream out; - - @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) - public RodBinding variants; - - public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - if ( tracker == null ) { - return 0; - } - - for ( final VariantContext vc : tracker.getValues(variants, context.getLocation()) ) { - if ( vc.getSampleNames().size() != 1 ) { - throw new IllegalStateException("Encountered a vcf record with num samples != 1"); - } - - final String actualSampleName = vc.getSampleNames().iterator().next(); - if ( ! 
expectedSampleName.equals(actualSampleName)) { - throw new IllegalStateException(String.format("Encountered vcf record with wrong sample name. Expected %s found %s", - expectedSampleName, actualSampleName)); - } - } - - return 1; - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - } - - // -------------------------------------------------------------------------------- - // - // Test output file-specific options - // - // -------------------------------------------------------------------------------- - - //Returns the output file - private File testBAMFeatures(final String args, final String md5) { - WalkerTestSpec spec = new WalkerTestSpec("-T PrintReads -R " + b37KGReference + - " -I " + privateTestDir + "NA20313.highCoverageRegion.bam" - + " --no_pg_tag -o %s " + args, - 1, Arrays.asList(".bam"), Arrays.asList(md5)); - return executeTest("testBAMFeatures: "+args, spec).first.get(0); - } - - @Test - public void testSAMWriterFeatures() { - testBAMFeatures("-compress 0", "bb4b55b1f80423970bb9384cbf0d8793"); - testBAMFeatures("-compress 9", "b85ee1636d62e1bb8ed65a245c307167"); - testBAMFeatures("-simplifyBAM", "38f9c30a27dfbc085a2ff52a1617d579"); - - //Validate MD5 - final String expectedMD5 = "6627b9ea33293a0083983feb94948c1d"; - final File md5Target = testBAMFeatures("--generate_md5", expectedMD5); - final File md5File = new File(md5Target.getAbsoluteFile() + ".md5"); - md5File.deleteOnExit(); - Assert.assertTrue(md5File.exists(), "MD5 wasn't created"); - try { - String md5 = new BufferedReader(new FileReader(md5File)).readLine(); - Assert.assertEquals(md5, expectedMD5, "Generated MD5 doesn't match expected"); - } catch (IOException e) { - Assert.fail("Can't parse MD5 file", e); - } - - //Validate that index isn't created - final String unindexedBAM = testBAMFeatures("--disable_bam_indexing", expectedMD5).getAbsolutePath(); - Assert.assertTrue(!(new 
File(unindexedBAM+".bai").exists()) && - !(new File(unindexedBAM.replace(".bam", ".bai")).exists()), - "BAM index was created even though it was disabled"); - } - - private void testVCFFeatures(final String args, final String md5) { - WalkerTestSpec spec = new WalkerTestSpec("-T SelectVariants -R " + b37KGReference + - " -V " + privateTestDir + "CEUtrioTest.vcf" - + " --no_cmdline_in_header -o %s " + args, - 1, Arrays.asList(md5)); - executeTest("testVCFFeatures: "+args, spec); - } - - private void testVCFFormatHandling(final boolean writeFullFormat, final String md5) { - WalkerTestSpec spec = new WalkerTestSpec("-T SelectVariants -R " + b37KGReference + - " -V " + privateTestDir + "ILLUMINA.wex.broad_phase2_baseline.20111114.both.exome.genotypes.1000.vcf" - + " --no_cmdline_in_header -o %s " - + " --fullyDecode " //Without this parameter, the FORMAT fields will be emitted unchanged. Oops - + (writeFullFormat ? "-writeFullFormat" : "") , - 1, Arrays.asList(md5)); - executeTest("testVCFFormatHandling: "+(writeFullFormat ? 
"Untrimmed" : "Trimmed"), spec); - } - - @Test - public void testVCFWriterFeatures() { - testVCFFeatures("--sites_only", "94bf1f2c0946e933515e4322323a5716"); - testVCFFeatures("--bcf", "03f2d6988f54a332da48803c78f9c4b3"); - testVCFFormatHandling(true, "2b0fa660b0cef4b0f45a10febb453b6c"); - testVCFFormatHandling(false, "5960311fdd9ee6db88587efaaf4055a0"); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngineUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngineUnitTest.java deleted file mode 100644 index ff60ae30c..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/GenomeAnalysisEngineUnitTest.java +++ /dev/null @@ -1,273 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.tools.walkers.qc.CountReads; -import org.broadinstitute.gatk.tools.walkers.readutils.PrintReads; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.*; - -/** - * Tests selected functionality in the GenomeAnalysisEngine class - */ -public class GenomeAnalysisEngineUnitTest extends BaseTest { - - @Test(expectedExceptions=UserException.class) - public void testEmptySamFileListHandling() throws Exception { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - testEngine.setWalker(new CountReads()); //generalizable to any walker requiring reads - - //supply command line args so validateSuppliedReads() knows whether reads were passed in - GATKArgumentCollection testArgs = new GATKArgumentCollection(); - testArgs.samFiles.add("empty.list"); - testEngine.setArguments(testArgs); - - //represents the empty list of samFiles read in from empty.list by CommandLineExecutable - Collection samFiles = new ArrayList(); - - testEngine.setSAMFileIDs(samFiles); - testEngine.validateSuppliedReads(); - } - - @Test(expectedExceptions=UserException.class) - public void 
testDuplicateSamFileHandlingSingleDuplicate() throws Exception { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - - Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); - - testEngine.setSAMFileIDs(samFiles); - testEngine.checkForDuplicateSamFiles(); - } - - @Test(expectedExceptions=UserException.class) - public void testDuplicateSamFileHandlingMultipleDuplicates() throws Exception { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - - Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleNORG.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File(publicTestDir + "exampleNORG.bam"), new Tags())); - - testEngine.setSAMFileIDs(samFiles); - testEngine.checkForDuplicateSamFiles(); - } - - @Test(expectedExceptions=UserException.class) - public void testDuplicateSamFileHandlingAbsoluteVsRelativePath() { - GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - - final File relativePathToBAMFile = new File(publicTestDir + "exampleBAM.bam"); - final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); - Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(relativePathToBAMFile, new Tags())); - samFiles.add(new SAMReaderID(absolutePathToBAMFile, new Tags())); - - testEngine.setSAMFileIDs(samFiles); - testEngine.checkForDuplicateSamFiles(); - } - - @Test - public void testEmptyIntervalSetHandling() throws Exception { - GenomeLocParser genomeLocParser = new GenomeLocParser(ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000).getSequenceDictionary()); - - GenomeAnalysisEngine testEngine = new 
GenomeAnalysisEngine(); - - testEngine.setWalker(new PrintReads()); - testEngine.setIntervals(new GenomeLocSortedSet(genomeLocParser)); - - testEngine.validateSuppliedIntervals(); - } - - @Test - public void testLoadWellFormedSampleRenameMapFile() throws IOException { - final File mapFile = createTestSampleRenameMapFile(Arrays.asList("/foo/bar/first.bam newSample1", - "/foo/bar/second.bam newSample2", - "/foo/bar2/third.bam newSample3", - "/foo/bar2/fourth.bam new sample 4", - "/foo/bar2/fifth.bam new sample 5 ")); - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - final Map renameMap = engine.loadSampleRenameMap(mapFile); - - Assert.assertEquals(renameMap.size(), 5, "Sample rename map was wrong size after loading from file"); - - final Iterator expectedResultsIterator = Arrays.asList( - "/foo/bar/first.bam", "newSample1", - "/foo/bar/second.bam", "newSample2", - "/foo/bar2/third.bam", "newSample3", - "/foo/bar2/fourth.bam", "new sample 4", - "/foo/bar2/fifth.bam", "new sample 5" - ).iterator(); - while ( expectedResultsIterator.hasNext() ) { - final String expectedKey = expectedResultsIterator.next(); - final String expectedValue = expectedResultsIterator.next(); - - Assert.assertNotNull(renameMap.get(expectedKey), String.format("Entry for %s not found in sample rename map", expectedKey)); - Assert.assertEquals(renameMap.get(expectedKey), expectedValue, "Wrong value in sample rename map for " + expectedKey); - } - } - - @DataProvider(name = "MalformedSampleRenameMapFileDataProvider") - public Object[][] generateMalformedSampleRenameMapFiles() throws IOException { - final List tests = new ArrayList(); - - tests.add(new Object[]{"testLoadSampleRenameMapFileNonExistentFile", - new File("/foo/bar/nonexistent")}); - tests.add(new Object[]{"testLoadSampleRenameMapFileMalformedLine", - createTestSampleRenameMapFile(Arrays.asList("/path/to/foo.bam"))}); - tests.add(new Object[]{"testLoadSampleRenameMapFileNonAbsoluteBamPath", - 
createTestSampleRenameMapFile(Arrays.asList("relative/path/to/foo.bam newSample"))}); - tests.add(new Object[]{"testLoadSampleRenameMapFileDuplicateBamPath", - createTestSampleRenameMapFile(Arrays.asList("/path/to/dupe.bam newSample1", - "/path/to/dupe.bam newSample2"))}); - tests.add(new Object[]{"testLoadSampleRenameMapFileTabInSampleName", - createTestSampleRenameMapFile(Arrays.asList("/path/to/stuff.bam some wonky\tsample "))}); - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "MalformedSampleRenameMapFileDataProvider", expectedExceptions = UserException.class) - public void testLoadMalformedSampleRenameMapFile( final String testName, final File mapFile ) { - logger.info("Executing test " + testName); - - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - final Map renameMap = engine.loadSampleRenameMap(mapFile); - } - - private File createTestSampleRenameMapFile( final List contents ) throws IOException { - final File mapFile = createTempFile("TestSampleRenameMapFile", ".tmp"); - final PrintWriter writer = new PrintWriter(mapFile); - - for ( final String line : contents ) { - writer.println(line); - } - writer.close(); - - return mapFile; - } - - /////////////////////////////////////////////////// - // Test the ReadTransformer ordering enforcement // - /////////////////////////////////////////////////// - - public static class TestReadTransformer extends ReadTransformer { - - private OrderingConstraint orderingConstraint = OrderingConstraint.DO_NOT_CARE; - private boolean enabled; - - protected TestReadTransformer(final OrderingConstraint orderingConstraint) { - this.orderingConstraint = orderingConstraint; - enabled = true; - } - - // need this because PackageUtils will pick up this class as a possible ReadTransformer - protected TestReadTransformer() { - enabled = false; - } - - @Override - public OrderingConstraint getOrderingConstraint() { return orderingConstraint; } - - @Override - public ApplicationTime 
initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { return ApplicationTime.HANDLED_IN_WALKER; } - - @Override - public boolean enabled() { return enabled; } - - @Override - public GATKSAMRecord apply(final GATKSAMRecord read) { return read; } - - } - - @DataProvider(name = "ReadTransformerData") - public Object[][] makeReadTransformerData() { - List tests = new ArrayList(); - - for ( final ReadTransformer.OrderingConstraint orderingConstraint1 : ReadTransformer.OrderingConstraint.values() ) { - for ( final ReadTransformer.OrderingConstraint orderingConstraint2 : ReadTransformer.OrderingConstraint.values() ) { - for ( final ReadTransformer.OrderingConstraint orderingConstraint3 : ReadTransformer.OrderingConstraint.values() ) { - tests.add(new Object[]{orderingConstraint1, orderingConstraint2, orderingConstraint3}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ReadTransformerData") - public void testReadTransformer(final ReadTransformer.OrderingConstraint oc1, final ReadTransformer.OrderingConstraint oc2, final ReadTransformer.OrderingConstraint oc3) { - - final GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); - final List readTransformers = new ArrayList(3); - readTransformers.add(new TestReadTransformer(oc1)); - readTransformers.add(new TestReadTransformer(oc2)); - readTransformers.add(new TestReadTransformer(oc3)); - - final boolean shouldThrowException = numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_FIRST, oc1, oc2, oc3) > 1 || - numWithConstraint(ReadTransformer.OrderingConstraint.MUST_BE_LAST, oc1, oc2, oc3) > 1; - - try { - testEngine.setReadTransformers(readTransformers); - - Assert.assertFalse(shouldThrowException); - Assert.assertEquals(testEngine.getReadTransformers().size(), 3); - - Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); - 
Assert.assertTrue(testEngine.getReadTransformers().get(2).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_FIRST); - Assert.assertTrue(testEngine.getReadTransformers().get(0).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); - Assert.assertTrue(testEngine.getReadTransformers().get(1).getOrderingConstraint() != ReadTransformer.OrderingConstraint.MUST_BE_LAST); - } catch (UserException.IncompatibleReadFiltersException e) { - Assert.assertTrue(shouldThrowException); - } - } - - private int numWithConstraint(final ReadTransformer.OrderingConstraint target, final ReadTransformer.OrderingConstraint... constraints ) { - int count = 0; - for ( final ReadTransformer.OrderingConstraint constraint : constraints ) { - if ( constraint == target ) - count++; - } - return count; - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/MaxRuntimeIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/MaxRuntimeIntegrationTest.java deleted file mode 100644 index 27b6c1ca3..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/MaxRuntimeIntegrationTest.java +++ /dev/null @@ -1,151 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.commandline.Argument; -import org.broadinstitute.gatk.utils.commandline.Output; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.LocusWalker; -import org.broadinstitute.gatk.utils.SimpleTimer; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.concurrent.TimeUnit; - -/** - * - */ -public class MaxRuntimeIntegrationTest extends WalkerTest { - public static class SleepingWalker extends LocusWalker { - @Output PrintStream out; - - @Argument(fullName="sleepTime",shortName="sleepTime",doc="x", required=false) - public int sleepTime = 100; - - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - try {Thread.sleep(sleepTime);} catch (InterruptedException e) {}; - return 1; - } - - @Override public Integer reduceInit() { return 0; } - @Override public Integer reduce(Integer value, Integer sum) { return sum + 
value; } - - @Override - public void onTraversalDone(Integer result) { - out.println(result); - } - } - - private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(60, TimeUnit.SECONDS); - - private class MaxRuntimeTestProvider extends TestDataProvider { - final long maxRuntime; - final TimeUnit unit; - - public MaxRuntimeTestProvider(final long maxRuntime, final TimeUnit unit) { - super(MaxRuntimeTestProvider.class); - this.maxRuntime = maxRuntime; - this.unit = unit; - setName(String.format("Max runtime test : %d of %s", maxRuntime, unit)); - } - - public long expectedMaxRuntimeNano() { - return TimeUnit.NANOSECONDS.convert(maxRuntime, unit) + STARTUP_TIME; - } - } - - @DataProvider(name = "MaxRuntimeProvider") - public Object[][] makeMaxRuntimeProvider() { - for ( final TimeUnit requestedUnits : Arrays.asList(TimeUnit.NANOSECONDS, TimeUnit.MILLISECONDS, TimeUnit.SECONDS, TimeUnit.MINUTES) ) - new MaxRuntimeTestProvider(requestedUnits.convert(30, TimeUnit.SECONDS), requestedUnits); - - return MaxRuntimeTestProvider.getTests(MaxRuntimeTestProvider.class); - } - - // - // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type - // - @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 120 * 1000) - public void testMaxRuntime(final MaxRuntimeTestProvider cfg) { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T PrintReads -R " + hg18Reference - + " -I " + validationDataLocation + "NA12878.WEx.downsampled20x.bam -o /dev/null" - + " -maxRuntime " + cfg.maxRuntime + " -maxRuntimeUnits " + cfg.unit, 0, - Collections.emptyList()); - final SimpleTimer timer = new SimpleTimer().start(); - executeTest("Max runtime " + cfg, spec); - final long actualRuntimeNano = timer.getElapsedTimeNano(); - - Assert.assertTrue(actualRuntimeNano < cfg.expectedMaxRuntimeNano(), - "Actual runtime " + TimeUnit.SECONDS.convert(actualRuntimeNano, TimeUnit.NANOSECONDS) - + " exceeded max. 
tolerated runtime " + TimeUnit.SECONDS.convert(cfg.expectedMaxRuntimeNano(), TimeUnit.NANOSECONDS) - + " given requested runtime " + cfg.maxRuntime + " " + cfg.unit); - } - - @DataProvider(name = "SubshardProvider") - public Object[][] makeSubshardProvider() { - List tests = new ArrayList(); - - // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{10}); - tests.add(new Object[]{100}); - tests.add(new Object[]{500}); - tests.add(new Object[]{1000}); - tests.add(new Object[]{2000}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "SubshardProvider", timeOut = 120 * 1000) - public void testSubshardTimeout(final int sleepTime) throws Exception { - final int maxRuntime = 5000; - - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T SleepingWalker -R " + b37KGReference - + " -I " + privateTestDir + "NA12878.100kb.BQSRv2.example.bam -o %s" - + " -maxRuntime " + maxRuntime + " -maxRuntimeUnits MILLISECONDS -sleepTime " + sleepTime, 1, - Collections.singletonList("")); - final File result = executeTest("Subshard max runtime ", spec).getFirst().get(0); - final int cycle = Integer.valueOf(new BufferedReader(new FileReader(result)).readLine()); - - final int maxCycles = (int)Math.ceil((maxRuntime * 5) / sleepTime); - logger.warn(String.format("Max cycles %d saw %d in file %s with sleepTime %d and maxRuntime %d", maxCycles, cycle, result, sleepTime, maxRuntime)); - Assert.assertTrue(cycle < maxCycles, "Too many cycles seen -- saw " + cycle + " in file " + result + " but max should have been " + maxCycles); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java deleted file mode 100644 index 1153bccb8..000000000 --- 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/ReadMetricsUnitTest.java +++ /dev/null @@ -1,371 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.*; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.reads.*; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.executive.WindowMaker; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.engine.traversals.*; -import org.broadinstitute.gatk.engine.walkers.*; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.SampleUtils; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.sam.*; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.util.*; - -public class ReadMetricsUnitTest extends BaseTest { - - @Test - public void 
testReadsSeenDoNotOverflowInt() { - - final ReadMetrics metrics = new ReadMetrics(); - - final long moreThanMaxInt = ((long)Integer.MAX_VALUE) + 1L; - - for ( long i = 0L; i < moreThanMaxInt; i++ ) { - metrics.incrementNumReadsSeen(); - } - - Assert.assertEquals(metrics.getNumReadsSeen(), moreThanMaxInt); - Assert.assertTrue(metrics.getNumReadsSeen() > (long) Integer.MAX_VALUE); - - logger.warn(String.format("%d %d %d", Integer.MAX_VALUE, moreThanMaxInt, Long.MAX_VALUE)); - } - - - // Test the accuracy of the read metrics - - private IndexedFastaSequenceFile reference; - private SAMSequenceDictionary dictionary; - private SAMFileHeader header; - private GATKSAMReadGroupRecord readGroup; - private GenomeLocParser genomeLocParser; - private File testBAM; - - private static final int numReadsPerContig = 250000; - private static final List contigs = Arrays.asList("1", "2", "3"); - - @BeforeClass - private void init() throws IOException { - reference = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - dictionary = reference.getSequenceDictionary(); - genomeLocParser = new GenomeLocParser(dictionary); - header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); - header.setSequenceDictionary(dictionary); - header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - readGroup = new GATKSAMReadGroupRecord(header.getReadGroup("test")); - - final List reads = new ArrayList<>(); - for ( final String contig : contigs ) { - for ( int i = 1; i <= numReadsPerContig; i++ ) { - reads.add(buildSAMRecord("read" + contig + "_" + i, contig, i)); - } - } - - createBAM(reads); - } - - private void createBAM(final List reads) throws IOException { - testBAM = createTempFile("TraverseActiveRegionsUnitTest", ".bam"); - - SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, testBAM); - for (GATKSAMRecord read : reads ) { - out.addAlignment(read); - } - out.close(); - - new 
File(testBAM.getAbsolutePath().replace(".bam", ".bai")).deleteOnExit(); - new File(testBAM.getAbsolutePath() + ".bai").deleteOnExit(); - } - - // copied from LocusViewTemplate - protected GATKSAMRecord buildSAMRecord(final String readName, final String contig, final int alignmentStart) { - GATKSAMRecord record = new GATKSAMRecord(header); - - record.setReadName(readName); - record.setReferenceIndex(dictionary.getSequenceIndex(contig)); - record.setAlignmentStart(alignmentStart); - - record.setCigarString("1M"); - record.setReadString("A"); - record.setBaseQualityString("A"); - record.setReadGroup(readGroup); - - return record; - } - - @Test - public void testCountsFromReadTraversal() { - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - engine.setGenomeLocParser(genomeLocParser); - - final Collection samFiles = new ArrayList<>(); - final SAMReaderID readerID = new SAMReaderID(testBAM, new Tags()); - samFiles.add(readerID); - - final SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser, - false, - ValidationStringency.STRICT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - new ArrayList(), - false, (byte)30, false, true, null, IntervalMergingRule.ALL); - - engine.setReadsDataSource(dataSource); - - final TraverseReadsNano traverseReadsNano = new TraverseReadsNano(1); - final DummyReadWalker walker = new DummyReadWalker(); - traverseReadsNano.initialize(engine, walker, null); - - for ( final Shard shard : dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()) ) { - final ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard, engine.getGenomeLocParser(), dataSource.seek(shard), reference, new ArrayList()); - traverseReadsNano.traverse(walker, dataProvider, 0); - dataProvider.close(); - } - - Assert.assertEquals(engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig); - 
Assert.assertEquals(engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig); - } - - @Test - public void testCountsFromLocusTraversal() { - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - engine.setGenomeLocParser(genomeLocParser); - - final Collection samFiles = new ArrayList<>(); - final SAMReaderID readerID = new SAMReaderID(testBAM, new Tags()); - samFiles.add(readerID); - - final SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser, - false, - ValidationStringency.STRICT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - new ArrayList(), - false, (byte)30, false, true, null, IntervalMergingRule.ALL); - - engine.setReadsDataSource(dataSource); - final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); - - final TraverseLociNano traverseLociNano = new TraverseLociNano(1); - final DummyLocusWalker walker = new DummyLocusWalker(); - traverseLociNano.initialize(engine, walker, null); - - for ( final Shard shard : dataSource.createShardIteratorOverAllReads(new LocusShardBalancer()) ) { - final WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples); - for ( WindowMaker.WindowMakerIterator window : windowMaker ) { - final LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList()); - traverseLociNano.traverse(walker, dataProvider, 0); - dataProvider.close(); - } - windowMaker.close(); - } - - //dataSource.close(); - Assert.assertEquals(engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig); - Assert.assertEquals(engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig); - } - - @Test - public void testCountsFromActiveRegionTraversal() { - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - 
engine.setGenomeLocParser(genomeLocParser); - - final Collection samFiles = new ArrayList<>(); - final SAMReaderID readerID = new SAMReaderID(testBAM, new Tags()); - samFiles.add(readerID); - - final SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser, - false, - ValidationStringency.STRICT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - new ArrayList(), - false, (byte)30, false, true, null, IntervalMergingRule.ALL); - - engine.setReadsDataSource(dataSource); - final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); - - final List intervals = new ArrayList<>(contigs.size()); - for ( final String contig : contigs ) - intervals.add(genomeLocParser.createGenomeLoc(contig, 1, numReadsPerContig)); - - final TraverseActiveRegions traverseActiveRegions = new TraverseActiveRegions(); - final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - traverseActiveRegions.initialize(engine, walker, null); - - for ( final Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ActiveRegionShardBalancer()) ) { - final WindowMaker windowMaker = new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples); - for ( WindowMaker.WindowMakerIterator window : windowMaker ) { - final LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList()); - traverseActiveRegions.traverse(walker, dataProvider, 0); - dataProvider.close(); - } - windowMaker.close(); - } - - Assert.assertEquals(engine.getCumulativeMetrics().getNumReadsSeen(), contigs.size() * numReadsPerContig); - Assert.assertEquals(engine.getCumulativeMetrics().getNumIterations(), contigs.size() * numReadsPerContig); - } - - @Test - public void testFilteredCounts() { - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - 
engine.setGenomeLocParser(genomeLocParser); - - final Collection samFiles = new ArrayList<>(); - final SAMReaderID readerID = new SAMReaderID(testBAM, new Tags()); - samFiles.add(readerID); - - final List filters = new ArrayList<>(); - filters.add(new EveryTenthReadFilter()); - - final SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser, - false, - ValidationStringency.STRICT, - null, - null, - new ValidationExclusion(), - filters, - new ArrayList(), - false, (byte)30, false, true, null, IntervalMergingRule.ALL); - - engine.setReadsDataSource(dataSource); - - final TraverseReadsNano traverseReadsNano = new TraverseReadsNano(1); - final DummyReadWalker walker = new DummyReadWalker(); - traverseReadsNano.initialize(engine, walker, null); - - for ( final Shard shard : dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()) ) { - final ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard, engine.getGenomeLocParser(), dataSource.seek(shard), reference, new ArrayList()); - traverseReadsNano.traverse(walker, dataProvider, 0); - dataProvider.close(); - } - - Assert.assertEquals((long)engine.getCumulativeMetrics().getCountsByFilter().get(EveryTenthReadFilter.class.getSimpleName()), contigs.size() * numReadsPerContig / 10); - } - - class DummyLocusWalker extends LocusWalker { - @Override - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } - - class DummyReadWalker extends ReadWalker { - @Override - public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } - - class 
DummyActiveRegionWalker extends ActiveRegionWalker { - @Override - public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return new ActivityProfileState(ref.getLocus(), 0.0); - } - - @Override - public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } - - private final class EveryTenthReadFilter extends ReadFilter { - - private int myCounter = 0; - - @Override - public boolean filterOut(final SAMRecord record) { - if ( ++myCounter == 10 ) { - myCounter = 0; - return true; - } - - return false; - } - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/WalkerManagerUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/WalkerManagerUnitTest.java deleted file mode 100644 index 62348ef19..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/WalkerManagerUnitTest.java +++ /dev/null @@ -1,71 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine; - -import org.broadinstitute.gatk.utils.commandline.Hidden; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.tools.walkers.qc.CountLoci; -import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -/** - * Tests basic functionality of the walker manager. 
- */ -public class WalkerManagerUnitTest { - private static WalkerManager walkerManager; - - @BeforeClass - public void setUp() { - walkerManager = new WalkerManager(); - } - - @Test - public void testPresentWalker() { - Walker countLociWalker = walkerManager.createByName("CountLoci"); - Assert.assertEquals(CountLoci.class,countLociWalker.getClass()); - } - - @Test(expectedExceptions=UserException.class) - public void testAbsentWalker() { - walkerManager.createByName("Missing"); - } - - @Test(expectedExceptions=DynamicClassResolutionException.class) - public void testUninstantiableWalker() { - walkerManager.createByName("UninstantiableWalker"); - } -} - -@Hidden -class UninstantiableWalker extends Walker { - // Private constructor will generate uninstantiable message - private UninstantiableWalker() {} - public Long reduceInit() { return 0L; } - public Long reduce(Integer value, Long accum) { return 0L; } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusViewUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusViewUnitTest.java deleted file mode 100644 index f9d9dfe53..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/AllLocusViewUnitTest.java +++ /dev/null @@ -1,90 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial 
portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - - -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; - -import java.util.List; -/** - * User: hanna - * Date: May 12, 2009 - * Time: 2:34:46 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * Test the view of all loci. - */ -public class AllLocusViewUnitTest extends LocusViewTemplate { - - @Override - protected LocusView createView(LocusShardDataProvider provider) { - return new AllLocusView(provider); - } - - /** - * Test the reads according to an independently derived context. - * @param view - * @param range - * @param reads - */ - @Override - protected void testReadsInContext( LocusView view, List range, List reads ) { - AllLocusView allLocusView = (AllLocusView)view; - - // TODO: Should skip over loci not in the given range. 
- GenomeLoc firstLoc = range.get(0); - GenomeLoc lastLoc = range.get(range.size()-1); - GenomeLoc bounds = genomeLocParser.createGenomeLoc(firstLoc.getContig(),firstLoc.getStart(),lastLoc.getStop()); - - for( int i = bounds.getStart(); i <= bounds.getStop(); i++ ) { - GenomeLoc site = genomeLocParser.createGenomeLoc("chr1",i); - AlignmentContext locusContext = allLocusView.next(); - Assert.assertEquals(locusContext.getLocation(), site, "Locus context location is incorrect"); - int expectedReadsAtSite = 0; - - for( GATKSAMRecord read: reads ) { - if(genomeLocParser.createGenomeLoc(read).containsP(locusContext.getLocation())) { - Assert.assertTrue(locusContext.getReads().contains(read),"Target locus context does not contain reads"); - expectedReadsAtSite++; - } - } - - Assert.assertEquals(locusContext.getReads().size(), expectedReadsAtSite, "Found wrong number of reads at site"); - } - - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusViewUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusViewUnitTest.java deleted file mode 100644 index 8914a4876..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/CoveredLocusViewUnitTest.java +++ /dev/null @@ -1,103 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions 
of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - - -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; - -import java.util.List; -/** - * User: hanna - * Date: May 12, 2009 - * Time: 2:34:46 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * Test the CoveredLocusView. - */ -public class CoveredLocusViewUnitTest extends LocusViewTemplate { - - /** - * Retrieve a covered locus view. - */ - @Override - protected LocusView createView(LocusShardDataProvider provider) { - return new CoveredLocusView(provider); - } - - /** - * Test the reads according to an independently derived context. - * @param view - * @param range - * @param reads - */ - @Override - protected void testReadsInContext( LocusView view, List range, List reads ) { - CoveredLocusView coveredLocusView = (CoveredLocusView)view; - - // TODO: Should skip over loci not in the given range. 
- GenomeLoc firstLoc = range.get(0); - GenomeLoc lastLoc = range.get(range.size()-1); - GenomeLoc bounds = genomeLocParser.createGenomeLoc(firstLoc.getContig(),firstLoc.getStart(),lastLoc.getStop()); - - for( int i = bounds.getStart(); i <= bounds.getStop(); i++ ) { - GenomeLoc site = genomeLocParser.createGenomeLoc("chr1",i); - - int expectedReadsAtSite = 0; - for( GATKSAMRecord read: reads ) { - if( genomeLocParser.createGenomeLoc(read).containsP(site) ) - expectedReadsAtSite++; - } - - if( expectedReadsAtSite < 1 ) - continue; - - Assert.assertTrue(coveredLocusView.hasNext(),"Incorrect number of loci in view"); - - AlignmentContext locusContext = coveredLocusView.next(); - Assert.assertEquals(locusContext.getLocation(), site, "Target locus context location is incorrect"); - Assert.assertEquals(locusContext.getReads().size(), expectedReadsAtSite, "Found wrong number of reads at site"); - - for( GATKSAMRecord read: reads ) { - if(genomeLocParser.createGenomeLoc(read).containsP(locusContext.getLocation())) - Assert.assertTrue(locusContext.getReads().contains(read),"Target locus context does not contain reads"); - } - } - - Assert.assertFalse(coveredLocusView.hasNext(),"Iterator is not bounded at boundaries of shard"); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedViewUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedViewUnitTest.java deleted file mode 100644 index 29ccbd644..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/IntervalReferenceOrderedViewUnitTest.java +++ /dev/null @@ -1,366 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* 
restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.util.PeekableIterator; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.tribble.BasicFeature; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.engine.refdata.RODRecordListImpl; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * @author depristo - */ -public class IntervalReferenceOrderedViewUnitTest extends BaseTest { - private static 
int startingChr = 1; - private static int endingChr = 2; - private static int readCount = 100; - private static int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; - private static String contig; - private static SAMFileHeader header; - - private GenomeLocParser genomeLocParser; - - @BeforeClass - public void beforeClass() { - header = ArtificialSAMUtils.createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - contig = header.getSequence(0).getSequenceName(); - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - - initializeTests(); - } - - private class CompareFeatures implements Comparator { - @Override - public int compare(Feature o1, Feature o2) { - return genomeLocParser.createGenomeLoc(o1).compareTo(genomeLocParser.createGenomeLoc(o2)); - } - } - - private class ReadMetaDataTrackerRODStreamTest extends TestDataProvider { - final List allFeatures; - final List intervals; - - public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final GenomeLoc interval) { - this(allFeatures, Collections.singletonList(interval)); - } - - public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final List intervals) { - super(ReadMetaDataTrackerRODStreamTest.class); - this.allFeatures = new ArrayList(allFeatures); - Collections.sort(this.allFeatures, new CompareFeatures()); - this.intervals = new ArrayList(intervals); - Collections.sort(this.intervals); - setName(String.format("%s nFeatures %d intervals %s", getClass().getSimpleName(), allFeatures.size(), - intervals.size() == 1 ? 
intervals.get(0) : "size " + intervals.size())); - } - - public PeekableIterator getIterator(final String name) { - return new PeekableIterator(new TribbleIteratorFromCollection(name, genomeLocParser, allFeatures)); - } - - public Set getExpectedOverlaps(final GenomeLoc interval) { - final Set overlapping = new HashSet(); - for ( final Feature f : allFeatures ) - if ( genomeLocParser.createGenomeLoc(f).overlapsP(interval) ) - overlapping.add(f); - return overlapping; - } - } - - public void initializeTests() { - final List handPickedFeatures = new ArrayList(); - - handPickedFeatures.add(new BasicFeature(contig, 1, 1)); - handPickedFeatures.add(new BasicFeature(contig, 2, 5)); - handPickedFeatures.add(new BasicFeature(contig, 4, 4)); - handPickedFeatures.add(new BasicFeature(contig, 6, 6)); - handPickedFeatures.add(new BasicFeature(contig, 9, 10)); - handPickedFeatures.add(new BasicFeature(contig, 10, 10)); - handPickedFeatures.add(new BasicFeature(contig, 10, 11)); - handPickedFeatures.add(new BasicFeature(contig, 13, 20)); - - createTestsForFeatures(handPickedFeatures); - - // test in the present of a large spanning element - { - List oneLargeSpan = new ArrayList(handPickedFeatures); - oneLargeSpan.add(new BasicFeature(contig, 1, 30)); - createTestsForFeatures(oneLargeSpan); - } - - // test in the presence of a partially spanning element - { - List partialSpanStart = new ArrayList(handPickedFeatures); - partialSpanStart.add(new BasicFeature(contig, 1, 6)); - createTestsForFeatures(partialSpanStart); - } - - // test in the presence of a partially spanning element at the end - { - List partialSpanEnd = new ArrayList(handPickedFeatures); - partialSpanEnd.add(new BasicFeature(contig, 10, 30)); - createTestsForFeatures(partialSpanEnd); - } - - // no data at all - final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, 5, 5); - new ReadMetaDataTrackerRODStreamTest(Collections.emptyList(), loc); - } - - // 
-------------------------------------------------------------------------------- - // - // tests for the lower level IntervalOverlappingRODsFromStream - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "ReadMetaDataTrackerRODStreamTest") - public Object[][] createReadMetaDataTrackerRODStreamTest() { - return ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); - } - - private GenomeLoc span(final List features) { - int featuresStart = 1; for ( final GenomeLoc f : features ) featuresStart = Math.min(featuresStart, f.getStart()); - int featuresStop = 1; for ( final GenomeLoc f : features ) featuresStop = Math.max(featuresStop, f.getStop()); - return genomeLocParser.createGenomeLoc(contig, featuresStart, featuresStop); - } - - private void createTestsForFeatures(final List features) { - int featuresStart = 1; for ( final Feature f : features ) featuresStart = Math.min(featuresStart, f.getStart()); - int featuresStop = 1; for ( final Feature f : features ) featuresStop = Math.max(featuresStop, f.getEnd()); - - for ( final int size : Arrays.asList(1, 5, 10, 100) ) { - final List allIntervals = new ArrayList(); - // regularly spaced - for ( int start = featuresStart; start < featuresStop; start++) { - final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, start, start + size - 1); - allIntervals.add(loc); - new ReadMetaDataTrackerRODStreamTest(features, loc); - } - - // starting and stopping at every feature - for ( final Feature f : features ) { - // just at the feature - allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart(), f.getEnd())); - new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); - - // up to end - allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd())); - new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); - - // missing by 1 - 
allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() + 1, f.getEnd() + 1)); - new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); - - // just spanning - allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd() + 1)); - new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); - } - - new ReadMetaDataTrackerRODStreamTest(features, allIntervals); - } - } - - @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest") - public void runReadMetaDataTrackerRODStreamTest_singleQuery(final ReadMetaDataTrackerRODStreamTest data) { - if ( data.intervals.size() == 1 ) { - final String name = "testName"; - final PeekableIterator iterator = data.getIterator(name); - final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); - testRODStream(data, stream, Collections.singletonList(data.intervals.get(0))); - } - } - - @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_singleQuery") - public void runReadMetaDataTrackerRODStreamTest_multipleQueries(final ReadMetaDataTrackerRODStreamTest data) { - if ( data.intervals.size() > 1 ) { - final String name = "testName"; - final PeekableIterator iterator = data.getIterator(name); - final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); - testRODStream(data, stream, data.intervals); - } - } - - private void testRODStream(final ReadMetaDataTrackerRODStreamTest test, final IntervalOverlappingRODsFromStream stream, final List intervals) { - for ( final GenomeLoc interval : intervals ) { - final RODRecordList query = stream.getOverlapping(interval); - final HashSet queryFeatures = new HashSet(); - for ( final GATKFeature f : query ) queryFeatures.add((Feature)f.getUnderlyingObject()); - final Set overlaps = test.getExpectedOverlaps(interval); - - 
Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + - " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); - - BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + - " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); - } - } - - // -------------------------------------------------------------------------------- - // - // tests for the higher level tracker itself - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "ReadMetaDataTrackerTests") - public Object[][] createTrackerTests() { - List tests = new ArrayList(); - - final Object[][] singleTests = ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); - final List multiSiteTests = new ArrayList(); - for ( final Object[] singleTest : singleTests ) { - if ( ((ReadMetaDataTrackerRODStreamTest)singleTest[0]).intervals.size() > 1 ) - multiSiteTests.add((ReadMetaDataTrackerRODStreamTest)singleTest[0]); - } - - for ( final boolean testStateless : Arrays.asList(true, false) ) { - // all pairwise tests - for ( List singleTest : Utils.makePermutations(multiSiteTests, 2, false)) { - tests.add(new Object[]{singleTest, testStateless}); - } - - // all 3 way pairwise tests - //for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { - // tests.add(new Object[]{singleTest, testStateless}); - //} - } - - logger.warn("Creating " + tests.size() + " tests for ReadMetaDataTrackerTests"); - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "ReadMetaDataTrackerTests", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_multipleQueries") - public void runReadMetaDataTrackerTest(final List RODs, final boolean testStateless) { - final List 
names = new ArrayList(); - final List> iterators = new ArrayList>(); - final List intervals = new ArrayList(); - final List> rodBindings = new ArrayList>(); - - for ( int i = 0; i < RODs.size(); i++ ) { - final RodBinding rodBinding = new RodBinding(Feature.class, "name"+i); - rodBindings.add(rodBinding); - final String name = rodBinding.getName(); - names.add(name); - iterators.add(RODs.get(i).getIterator(name)); - intervals.addAll(RODs.get(i).intervals); - } - - Collections.sort(intervals); - final GenomeLoc span = span(intervals); - final IntervalReferenceOrderedView view = new IntervalReferenceOrderedView(genomeLocParser, span, names, iterators); - - if ( testStateless ) { - // test each tracker is well formed, as each is created - for ( final GenomeLoc interval : intervals ) { - final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); - testMetaDataTrackerBindings(tracker, interval, RODs, rodBindings); - } - } else { - // tests all trackers are correct after reading them into an array - // this checks that the trackers are be safely stored away and analyzed later (critical for nano-scheduling) - final List trackers = new ArrayList(); - for ( final GenomeLoc interval : intervals ) { - final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); - trackers.add(tracker); - } - - for ( int i = 0; i < trackers.size(); i++) { - testMetaDataTrackerBindings(trackers.get(i), intervals.get(i), RODs, rodBindings); - } - } - } - - private void testMetaDataTrackerBindings(final RefMetaDataTracker tracker, - final GenomeLoc interval, - final List RODs, - final List> rodBindings) { - for ( int i = 0; i < RODs.size(); i++ ) { - final ReadMetaDataTrackerRODStreamTest test = RODs.get(i); - final List queryFeaturesList = tracker.getValues(rodBindings.get(i)); - final Set queryFeatures = new HashSet(queryFeaturesList); - final Set overlaps = test.getExpectedOverlaps(interval); - - Assert.assertEquals(queryFeatures.size(), 
overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + - " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); - - BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + - " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); - } - } - - static class TribbleIteratorFromCollection implements Iterator { - // current location - private final String name; - final Queue gatkFeatures; - - public TribbleIteratorFromCollection(final String name, final GenomeLocParser genomeLocParser, final List features) { - this.name = name; - - this.gatkFeatures = new LinkedList(); - for ( final Feature f : features ) - gatkFeatures.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, f, name)); - } - - @Override - public boolean hasNext() { - return ! gatkFeatures.isEmpty(); - } - - @Override - public RODRecordList next() { - final GATKFeature first = gatkFeatures.poll(); - final Collection myFeatures = new LinkedList(); - myFeatures.add(first); - while ( gatkFeatures.peek() != null && gatkFeatures.peek().getLocation().getStart() == first.getStart() ) - myFeatures.add(gatkFeatures.poll()); - - GenomeLoc loc = first.getLocation(); - for ( final GATKFeature feature : myFeatures ) - loc = loc.merge(feature.getLocation()); - - return new RODRecordListImpl(name, myFeatures, loc); // is this safe? 
- } - - @Override public void remove() { throw new IllegalStateException("GRRR"); } - } -} - - diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceViewUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceViewUnitTest.java deleted file mode 100644 index 5eb9c7ac7..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusReferenceViewUnitTest.java +++ /dev/null @@ -1,143 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.testng.Assert; -import org.testng.annotations.Test; - -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.engine.datasources.reads.MockLocusShard; -import org.broadinstitute.gatk.engine.iterators.GenomeLocusIterator; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; - -import htsjdk.samtools.reference.ReferenceSequence; -import htsjdk.samtools.util.StringUtil; - -import java.util.Collections; -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** Tests for viewing the reference from the perspective of a locus. */ - -public class LocusReferenceViewUnitTest extends ReferenceViewTemplate { - -// -// /** Multiple-base pair queries should generate exceptions. 
*/ -// @Test(expectedExceptions=InvalidPositionException.class) -// public void testSingleBPFailure() { -// Shard shard = new LocusShard(GenomeLocParser.createGenomeLoc(0, 1, 50)); -// -// ShardDataProvider dataProvider = new ShardDataProvider(shard, null, sequenceFile, null); -// LocusReferenceView view = new LocusReferenceView(dataProvider); -// -// view.getReferenceContext(shard.getGenomeLoc()).getBase(); -// } - - @Test - public void testOverlappingReferenceBases() { - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), - sequenceFile.getSequence("chrM").length() - 10, - sequenceFile.getSequence("chrM").length()))); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, sequenceFile, null); - LocusReferenceView view = new LocusReferenceView(dataProvider); - - byte[] results = view.getReferenceBases(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), - sequenceFile.getSequence("chrM").length() - 10, - sequenceFile.getSequence("chrM").length() + 9)); - System.out.printf("results are %s%n", new String(results)); - Assert.assertEquals(results.length, 20); - for (int x = 0; x < results.length; x++) { - if (x <= 10) Assert.assertTrue(results[x] != 'X'); - else Assert.assertTrue(results[x] == 'X'); - } - } - - - /** Queries outside the bounds of the shard should result in reference context window trimmed at the shard boundary. 
*/ - @Test - public void testBoundsFailure() { - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), 1, 50))); - - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, sequenceFile, null); - LocusReferenceView view = new LocusReferenceView(dataProvider); - - GenomeLoc locus = genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), 50, 51); - - ReferenceContext rc = view.getReferenceContext(locus); - Assert.assertTrue(rc.getLocus().equals(locus)); - Assert.assertTrue(rc.getWindow().equals(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(),50))); - Assert.assertTrue(rc.getBases().length == 1); - } - - - /** - * Compares the contents of the fasta and view at a specified location. - * - * @param loc - */ - protected void validateLocation( GenomeLoc loc ) { - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(loc)); - GenomeLocusIterator shardIterator = new GenomeLocusIterator(genomeLocParser,loc); - - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, loc, null, sequenceFile, null); - LocusReferenceView view = new LocusReferenceView(dataProvider); - - while (shardIterator.hasNext()) { - GenomeLoc locus = shardIterator.next(); - - ReferenceSequence expectedAsSeq = sequenceFile.getSubsequenceAt(locus.getContig(), locus.getStart(), locus.getStop()); - char expected = Character.toUpperCase(StringUtil.bytesToString(expectedAsSeq.getBases()).charAt(0)); - char actual = view.getReferenceContext(locus).getBaseAsChar(); - - Assert.assertEquals(actual, expected, String.format("Value of base at position %s in shard %s does not match expected", locus.toString(), shard.getGenomeLocs()) - ); - } - } - -} diff --git 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusViewTemplate.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusViewTemplate.java deleted file mode 100644 index 650b14690..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/LocusViewTemplate.java +++ /dev/null @@ -1,405 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.samtools.reference.ReferenceSequence; -import htsjdk.samtools.reference.ReferenceSequenceFile; -import htsjdk.samtools.*; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.datasources.reads.MockLocusShard; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.engine.executive.WindowMaker; -import org.broadinstitute.gatk.engine.datasources.reads.LocusShard; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.*; -/** - * User: hanna - * Date: May 13, 2009 - * Time: 4:29:08 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** Base support for testing variants of the LocusView family of classes. 
*/ - -public abstract class LocusViewTemplate extends BaseTest { - protected static ReferenceSequenceFile sequenceSourceFile = null; - protected GenomeLocParser genomeLocParser = null; - - @BeforeClass - public void setupGenomeLoc() throws FileNotFoundException { - sequenceSourceFile = fakeReferenceSequenceFile(); - genomeLocParser = new GenomeLocParser(sequenceSourceFile); - } - - @Test - public void emptyAlignmentContextTest() { - SAMRecordIterator iterator = new SAMRecordIterator(); - - GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); - Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); - - LocusView view = createView(dataProvider); - - testReadsInContext(view, shard.getGenomeLocs(), Collections.emptyList()); - } - - @Test - public void singleReadTest() { - GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); - SAMRecordIterator iterator = new SAMRecordIterator(read); - - GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(shardBounds)); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - - LocusView view = createView(dataProvider); - - testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); - } - - @Test - public 
void readCoveringFirstPartTest() { - GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); - SAMRecordIterator iterator = new SAMRecordIterator(read); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - LocusView view = createView(dataProvider); - - testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); - } - - @Test - public void readCoveringLastPartTest() { - GATKSAMRecord read = buildSAMRecord("read1","chr1", 6, 10); - SAMRecordIterator iterator = new SAMRecordIterator(read); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - LocusView view = createView(dataProvider); - - testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); - } - - @Test - public void readCoveringMiddleTest() { - GATKSAMRecord read = buildSAMRecord("read1","chr1", 3, 7); - SAMRecordIterator iterator = new SAMRecordIterator(read); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new 
LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - LocusView view = createView(dataProvider); - - testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); - } - - @Test - public void readAndLocusOverlapAtLastBase() { - GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); - SAMRecordIterator iterator = new SAMRecordIterator(read); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 5, 5))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - LocusView view = createView(dataProvider); - - testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); - } - - @Test - public void readOverlappingStartTest() { - GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 10); - SAMRecordIterator iterator = new SAMRecordIterator(read); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - LocusView view = createView(dataProvider); - - testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); - } - - @Test - public void readOverlappingEndTest() { - GATKSAMRecord read = buildSAMRecord("read1","chr1", 6, 15); - SAMRecordIterator iterator = new SAMRecordIterator(read); - - Shard shard = new 
MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - LocusView view = createView(dataProvider); - - testReadsInContext(view, shard.getGenomeLocs(), Collections.singletonList(read)); - } - - @Test - public void readsSpanningTest() { - GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); - GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 6, 10); - SAMRecordIterator iterator = new SAMRecordIterator(read1, read2); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - LocusView view = createView(dataProvider); - - List expectedReads = new ArrayList(); - Collections.addAll(expectedReads, read1, read2); - testReadsInContext(view, shard.getGenomeLocs(), expectedReads); - } - - @Test - public void duplicateReadsTest() { - GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); - GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 1, 5); - GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 6, 10); - GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 6, 10); - SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new 
WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - LocusView view = createView(dataProvider); - - List expectedReads = new ArrayList(); - Collections.addAll(expectedReads, read1, read2, read3, read4); - testReadsInContext(view, shard.getGenomeLocs(), expectedReads); - } - - @Test - public void cascadingReadsWithinBoundsTest() { - GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 2, 6); - GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 3, 7); - GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 4, 8); - GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 5, 9); - SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - LocusView view = createView(dataProvider); - - List expectedReads = new ArrayList(); - Collections.addAll(expectedReads, read1, read2, read3, read4); - testReadsInContext(view, shard.getGenomeLocs(), expectedReads); - } - - @Test - public void cascadingReadsAtBoundsTest() { - GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); - GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 2, 6); - GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 3, 7); - GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 4, 8); - GATKSAMRecord read5 = buildSAMRecord("read5","chr1", 5, 9); - GATKSAMRecord read6 = buildSAMRecord("read6","chr1", 6, 10); - 
SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4, read5, read6); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - LocusView view = createView(dataProvider); - - List expectedReads = new ArrayList(); - Collections.addAll(expectedReads, read1, read2, read3, read4, read5, read6); - testReadsInContext(view, shard.getGenomeLocs(), expectedReads); - } - - @Test - public void cascadingReadsOverlappingBoundsTest() { - GATKSAMRecord read01 = buildSAMRecord("read1","chr1", 1, 5); - GATKSAMRecord read02 = buildSAMRecord("read2","chr1", 2, 6); - GATKSAMRecord read03 = buildSAMRecord("read3","chr1", 3, 7); - GATKSAMRecord read04 = buildSAMRecord("read4","chr1", 4, 8); - GATKSAMRecord read05 = buildSAMRecord("read5","chr1", 5, 9); - GATKSAMRecord read06 = buildSAMRecord("read6","chr1", 6, 10); - GATKSAMRecord read07 = buildSAMRecord("read7","chr1", 7, 11); - GATKSAMRecord read08 = buildSAMRecord("read8","chr1", 8, 12); - GATKSAMRecord read09 = buildSAMRecord("read9","chr1", 9, 13); - GATKSAMRecord read10 = buildSAMRecord("read10","chr1", 10, 14); - GATKSAMRecord read11 = buildSAMRecord("read11","chr1", 11, 15); - GATKSAMRecord read12 = buildSAMRecord("read12","chr1", 12, 16); - SAMRecordIterator iterator = new SAMRecordIterator(read01, read02, read03, read04, read05, read06, - read07, read08, read09, read10, read11, read12); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); - 
WindowMaker.WindowMakerIterator window = windowMaker.next(); - LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); - LocusView view = createView(dataProvider); - - List expectedReads = new ArrayList(); - Collections.addAll(expectedReads, read01, read02, read03, read04, read05, read06, - read07, read08, read09, read10, read11, read12); - testReadsInContext(view, shard.getGenomeLocs(), expectedReads); - } - - /** - * Creates a view of the type required for testing. - * - * @return The correct view to test. - */ - protected abstract LocusView createView(LocusShardDataProvider provider); - - /** - * Test the reads according to an independently derived context. - * - * @param view - * @param bounds - * @param reads - */ - protected abstract void testReadsInContext(LocusView view, List bounds, List reads); - - /** - * Fake a reference sequence file. Essentially, seek a header with a bunch of dummy data. 
- * - * @return A 'fake' reference sequence file - */ - private static ReferenceSequenceFile fakeReferenceSequenceFile() { - return new ReferenceSequenceFile() { - public SAMSequenceDictionary getSequenceDictionary() { - SAMSequenceRecord sequenceRecord = new SAMSequenceRecord("chr1", 1000000); - SAMSequenceDictionary dictionary = new SAMSequenceDictionary(Collections.singletonList(sequenceRecord)); - return dictionary; - } - - public boolean isIndexed() { return false; } - - public ReferenceSequence nextSequence() { - throw new UnsupportedOperationException("Fake implementation doesn't support a getter"); - } - - public ReferenceSequence getSequence( String contig ) { - throw new UnsupportedOperationException("Fake implementation doesn't support a getter"); - } - - public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) { - throw new UnsupportedOperationException("Fake implementation doesn't support a getter"); - } - - public void reset() { - return; - } - - public void close() throws IOException { - } - }; - } - - /** - * Build a SAM record featuring the absolute minimum required dataset. - * - * @param contig Contig to populate. 
- * @param alignmentStart start of alignment - * @param alignmentEnd end of alignment - * - * @return New SAM Record - */ - protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { - SAMFileHeader header = new SAMFileHeader(); - header.setSequenceDictionary(sequenceSourceFile.getSequenceDictionary()); - - GATKSAMRecord record = new GATKSAMRecord(header); - - record.setReadName(readName); - record.setReferenceIndex(sequenceSourceFile.getSequenceDictionary().getSequenceIndex(contig)); - record.setAlignmentStart(alignmentStart); - Cigar cigar = new Cigar(); - int len = alignmentEnd - alignmentStart + 1; - cigar.add(new CigarElement(len, CigarOperator.M)); - record.setCigar(cigar); - record.setReadBases(new byte[len]); - record.setBaseQualities(new byte[len]); - return record; - } - - /** A simple iterator which iterates over a list of reads. */ - protected class SAMRecordIterator implements GATKSAMIterator { - private Iterator backingIterator = null; - - public SAMRecordIterator(SAMRecord... reads) { - List backingList = new ArrayList(); - backingList.addAll(Arrays.asList(reads)); - backingIterator = backingList.iterator(); - } - - public boolean hasNext() { - return backingIterator.hasNext(); - } - - public SAMRecord next() { - return backingIterator.next(); - } - - public Iterator iterator() { - return this; - } - - public void close() { - // NO-OP. 
- } - - public void remove() { - throw new UnsupportedOperationException("Can't remove from a read-only iterator"); - } - } - - -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java deleted file mode 100644 index fdec85870..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/providers/ReferenceOrderedViewUnitTest.java +++ /dev/null @@ -1,157 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.providers; - -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.datasources.reads.MockLocusShard; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.testng.Assert; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.datasources.reads.Shard; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.utils.codecs.table.TableFeature; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet.RMDStorageType; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.Arrays; -import java.util.Collections; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -/** - * User: hanna - * Date: May 27, 2009 - * Time: 3:07:23 PM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * Test the transparent view into the reference-ordered data. At the moment, just do some basic bindings and make - * sure the data comes through correctly. - */ -public class ReferenceOrderedViewUnitTest extends BaseTest { - /** - * Sequence file. 
- */ - private static IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - - /** - * our track builder - */ - RMDTrackBuilder builder = null; - - @BeforeClass - public void init() throws FileNotFoundException { - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); - genomeLocParser = new GenomeLocParser(seq); - // disable auto-index creation/locking in the RMDTrackBuilder for tests - builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true,null); - } - - /** - * Make sure binding to an empty list produces an empty tracker. - */ - @Test - public void testNoBindings() { - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30))); - LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.emptyList()); - ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); - - RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",10)); - Assert.assertEquals(tracker.getValues(Feature.class).size(), 0, "The tracker should not have produced any data"); - } - - /** - * Test a single ROD binding. 
- */ - @Test - public void testSingleBinding() { - String fileName = privateTestDir + "TabularDataTest.dat"; - RMDTriplet triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags()); - ReferenceOrderedDataSource dataSource = new ReferenceOrderedDataSource(triplet,builder,seq.getSequenceDictionary(),genomeLocParser,false); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30))); - - LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Collections.singletonList(dataSource)); - ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); - - RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20)); - TableFeature datum = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest")); - - Assert.assertEquals(datum.get("COL1"),"C","datum parameter for COL1 is incorrect"); - Assert.assertEquals(datum.get("COL2"),"D","datum parameter for COL2 is incorrect"); - Assert.assertEquals(datum.get("COL3"),"E","datum parameter for COL3 is incorrect"); - } - - /** - * Make sure multiple bindings are visible from the view. 
- */ - @Test - public void testMultipleBinding() { - File file = new File(privateTestDir + "TabularDataTest.dat"); - - RMDTriplet testTriplet1 = new RMDTriplet("tableTest1","Table",file.getAbsolutePath(),RMDStorageType.FILE,new Tags()); - ReferenceOrderedDataSource dataSource1 = new ReferenceOrderedDataSource(testTriplet1,builder,seq.getSequenceDictionary(),genomeLocParser,false); - - RMDTriplet testTriplet2 = new RMDTriplet("tableTest2","Table",file.getAbsolutePath(),RMDStorageType.FILE,new Tags()); - ReferenceOrderedDataSource dataSource2 = new ReferenceOrderedDataSource(testTriplet2,builder,seq.getSequenceDictionary(),genomeLocParser,false); - - Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chrM",1,30))); - - LocusShardDataProvider provider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, seq, Arrays.asList(dataSource1,dataSource2)); - ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); - - RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20)); - TableFeature datum1 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest1")); - - Assert.assertEquals(datum1.get("COL1"),"C","datum1 parameter for COL1 is incorrect"); - Assert.assertEquals(datum1.get("COL2"),"D","datum1 parameter for COL2 is incorrect"); - Assert.assertEquals(datum1.get("COL3"),"E","datum1 parameter for COL3 is incorrect"); - - TableFeature datum2 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest2")); - - Assert.assertEquals(datum2.get("COL1"),"C","datum2 parameter for COL1 is incorrect"); - Assert.assertEquals(datum2.get("COL2"),"D","datum2 parameter for COL2 is incorrect"); - Assert.assertEquals(datum2.get("COL3"),"E","datum2 parameter for COL3 is incorrect"); - } -} diff --git 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancerUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancerUnitTest.java deleted file mode 100644 index f1ee6ab78..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ActiveRegionShardBalancerUnitTest.java +++ /dev/null @@ -1,102 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileSpan; -import htsjdk.samtools.SAMSequenceRecord; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.io.FileNotFoundException; -import java.util.*; - -public class ActiveRegionShardBalancerUnitTest extends BaseTest { - // example genome loc parser for this test, can be deleted if you don't use the reference - private GenomeLocParser genomeLocParser; - protected SAMDataSource readsDataSource; - - @BeforeClass - public void setup() throws FileNotFoundException { - // sequence - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(10, 0, 10000); - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - readsDataSource = null; - } - - @Test - public void testMergingManyContigs() { - executeTest(genomeLocParser.getContigs().getSequences()); - } - - @Test - public void testMergingAllPointersOnSingleContig() { - executeTest(Arrays.asList(genomeLocParser.getContigs().getSequences().get(1))); - } - - @Test - public void testMergingMultipleDiscontinuousContigs() { - final List all = genomeLocParser.getContigs().getSequences(); - executeTest(Arrays.asList(all.get(1), all.get(3))); - } - - private void executeTest(final Collection records) { - final ActiveRegionShardBalancer balancer = new ActiveRegionShardBalancer(); - - final List> expectedLocs = new LinkedList<>(); - final List pointers = new LinkedList<>(); - - for ( final SAMSequenceRecord record : records ) { - final int size = 10; - int end = 0; - for ( int i = 0; i < record.getSequenceLength(); i += 
size) { - final int myEnd = i + size - 1; - end = myEnd; - final GenomeLoc loc = genomeLocParser.createGenomeLoc(record.getSequenceName(), i, myEnd); - final Map fileSpans = Collections.emptyMap(); - final FilePointer fp = new FilePointer(fileSpans, IntervalMergingRule.ALL, Collections.singletonList(loc)); - pointers.add(fp); - } - expectedLocs.add(Collections.singleton(genomeLocParser.createGenomeLoc(record.getSequenceName(), 0, end))); - } - - balancer.initialize(readsDataSource, pointers.iterator(), genomeLocParser); - - int i = 0; - int nShardsFound = 0; - for ( final Shard shard : balancer ) { - nShardsFound++; - Assert.assertEquals(new HashSet<>(shard.getGenomeLocs()), expectedLocs.get(i++)); - } - Assert.assertEquals(nShardsFound, records.size(), "Didn't find exactly one shard for each contig in the sequence dictionary"); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java deleted file mode 100644 index 27c287c9c..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/DownsamplerBenchmark.java +++ /dev/null @@ -1,94 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import com.google.caliper.Param; -import org.broadinstitute.gatk.engine.WalkerManager; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.walkers.LocusWalker; - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Apr 22, 2011 - * Time: 4:02:56 PM - * To change this template use File | Settings | File Templates. - */ -public class DownsamplerBenchmark extends ReadProcessingBenchmark { - @Param - private String bamFile; - - @Param - private Integer maxReads; - - @Override - public String getBAMFile() { return bamFile; } - - @Override - public Integer getMaxReads() { return maxReads; } - - @Param - private Downsampling downsampling; - -// public void timeDownsampling(int reps) { -// for(int i = 0; i < reps; i++) { -// SAMFileReader reader = new SAMFileReader(inputFile); -// ReadProperties readProperties = new ReadProperties(Collections.singletonList(new SAMReaderID(inputFile,new Tags())), -// reader.getFileHeader(), -// SAMFileHeader.SortOrder.coordinate, -// false, -// SAMFileReader.ValidationStringency.SILENT, -// downsampling.create(), -// new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), -// Collections.emptyList(), -// Collections.emptyList(), -// false, -// (byte)0, -// false); -// -// GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); -// // Filter unmapped reads. 
TODO: is this always strictly necessary? Who in the GATK normally filters these out? -// Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); -// LegacyLocusIteratorByState locusIteratorByState = new LegacyLocusIteratorByState(readIterator,readProperties,genomeLocParser, LegacyLocusIteratorByState.sampleListForSAMWithoutReadGroups()); -// while(locusIteratorByState.hasNext()) { -// locusIteratorByState.next().getLocation(); -// } -// reader.close(); -// } -// } - - private enum Downsampling { - NONE { - @Override - DownsamplingMethod create() { return DownsamplingMethod.NONE; } - }, - PER_SAMPLE { - @Override - DownsamplingMethod create() { return WalkerManager.getDownsamplingMethod(LocusWalker.class); } - }; - abstract DownsamplingMethod create(); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java deleted file mode 100644 index e35f1d592..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/FilePointerUnitTest.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.GATKBAMFileSpan; -import htsjdk.samtools.GATKChunk; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; - -/** - * - */ -public class FilePointerUnitTest extends BaseTest { - private IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - private SAMReaderID readerID = new SAMReaderID("samFile",new Tags()); - - /** - * This function does the setup of our parser, before each method call. - *

- * Called before every test case method. - */ - @BeforeMethod - public void doForEachTest() throws FileNotFoundException { - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); - genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); - } - - @Test - public void testFilePointerCombineDisjoint() { - FilePointer one = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5)); - one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); - FilePointer two = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",6,10)); - two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,2))); - - FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,10)); - result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); - - Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); - Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect"); - - //Now test that adjacent (but disjoint) intervals are properly handled with OVERLAPPING_ONLY - one = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,5)); - one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); - two = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",6,10)); - two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,2))); - - result = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, - genomeLocParser.createGenomeLoc("chr1",1,5), - genomeLocParser.createGenomeLoc("chr1",6,10)); - result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); - - Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); - Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file 
pointers is incorrect"); - } - - @Test - public void testFilePointerCombineJoint() { - FilePointer one = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5)); - one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); - FilePointer two = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",2,6)); - two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,3))); - - FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,6)); - result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,3))); - - Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); - Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect"); - - //Repeat the tests for OVERLAPPING_ONLY - one = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,5)); - one.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,2))); - two = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",2,6)); - two.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(1,3))); - - result = new FilePointer(IntervalMergingRule.OVERLAPPING_ONLY, genomeLocParser.createGenomeLoc("chr1",1,6)); - result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,3))); - - Assert.assertEquals(one.combine(genomeLocParser,two),result,"Combination of two file pointers is incorrect"); - Assert.assertEquals(two.combine(genomeLocParser,one),result,"Combination of two file pointers is incorrect"); - } - - @Test - public void testFilePointerCombineOneSided() { - FilePointer filePointer = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,5)); - filePointer.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); - FilePointer empty = new FilePointer(IntervalMergingRule.ALL, 
genomeLocParser.createGenomeLoc("chr1",6,10)); - // Do not add file spans to empty result - - FilePointer result = new FilePointer(IntervalMergingRule.ALL, genomeLocParser.createGenomeLoc("chr1",1,10)); - result.addFileSpans(readerID,new GATKBAMFileSpan(new GATKChunk(0,1))); - Assert.assertEquals(filePointer.combine(genomeLocParser,empty),result,"Combination of two file pointers is incorrect"); - Assert.assertEquals(empty.combine(genomeLocParser,filePointer),result,"Combination of two file pointers is incorrect"); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKWalkerBenchmark.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKWalkerBenchmark.java deleted file mode 100644 index 30eaeb6a1..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/GATKWalkerBenchmark.java +++ /dev/null @@ -1,141 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import com.google.caliper.Param; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.filters.UnmappedReadFilter; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.engine.walkers.ReadWalker; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.tools.walkers.qc.CountLoci; -import org.broadinstitute.gatk.tools.walkers.qc.CountReads; -import org.broadinstitute.gatk.utils.classloader.JVMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.io.File; -import java.util.Collections; - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Feb 25, 2011 - * Time: 10:16:54 AM - * To change this template use File | Settings | File Templates. 
- */ -public class GATKWalkerBenchmark extends ReadProcessingBenchmark { - @Param - private String bamFile; - - @Param - private Integer maxReads; - - @Param - private String referenceFile; - - @Param - private WalkerType walkerType; - - @Override - public String getBAMFile() { return bamFile; } - - @Override - public Integer getMaxReads() { return maxReads; } - - @Override - public void setUp() { - super.setUp(); - } - - public void timeWalkerPerformance(final int reps) { - for(int i = 0; i < reps; i++) { - GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - - // Establish the argument collection - GATKArgumentCollection argCollection = new GATKArgumentCollection(); - argCollection.referenceFile = new File(referenceFile); - argCollection.samFiles = Collections.singletonList(inputFile.getAbsolutePath()); - - engine.setArguments(argCollection); - // Bugs in the engine mean that this has to be set twice. - engine.setSAMFileIDs(Collections.singletonList(new SAMReaderID(inputFile,new Tags()))); - engine.setFilters(Collections.singletonList(new UnmappedReadFilter())); - engine.setReferenceMetaDataFiles(Collections.emptyList()); - - // Create the walker - engine.setWalker(walkerType.create()); - - engine.execute(); - } - } - - private enum WalkerType { - COUNT_READS { - @Override - Walker create() { return new CountReads(); } - }, - COUNT_BASES_IN_READ { - @Override - Walker create() { return new CountBasesInReadPerformanceWalker(); } - }, - COUNT_LOCI { - @Override - Walker create() { - CountLoci walker = new CountLoci(); - JVMUtils.setFieldValue(JVMUtils.findField(CountLoci.class,"out"),walker,System.out); - return walker; - } - }; - abstract Walker create(); - } -} - -class CountBasesInReadPerformanceWalker extends ReadWalker { - private long As; - private long Cs; - private long Gs; - private long Ts; - - public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { - for(byte base: read.getReadBases()) { - switch(base) { - case 
'A': As++; break; - case 'C': Cs++; break; - case 'G': Gs++; break; - case 'T': Ts++; break; - } - } - return 1; - } - - public Long reduceInit() { return 0L; } - public Long reduce(Integer value, Long accum) { return value + accum; } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/MockLocusShard.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/MockLocusShard.java deleted file mode 100644 index eb3c89493..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/MockLocusShard.java +++ /dev/null @@ -1,51 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import org.broadinstitute.gatk.engine.datasources.reads.LocusShard; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -import java.util.List; -import java.util.Collections; - -/** - * A mock locus shard, usable for infrastructure that requires a shard to behave properly. - * - * @author mhanna - * @version 0.1 - */ -public class MockLocusShard extends LocusShard { - public MockLocusShard(final GenomeLocParser genomeLocParser,final List intervals) { - super( genomeLocParser, - new SAMDataSource(Collections.emptyList(),new ThreadAllocation(),null,genomeLocParser), - intervals, - null); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancerUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancerUnitTest.java deleted file mode 100644 index be48194b5..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/ReadShardBalancerUnitTest.java +++ /dev/null @@ -1,195 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all 
copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.*; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStream; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; - -public class ReadShardBalancerUnitTest extends BaseTest { - - /** - * Tests to ensure that ReadShardBalancer works as expected and does not place shard boundaries - * at inappropriate places, such as within an alignment start position - */ - private static class ReadShardBalancerTest extends TestDataProvider { - private int numContigs; - private int numStacksPerContig; - private int stackSize; - private int numUnmappedReads; - private DownsamplingMethod downsamplingMethod; - private int expectedReadCount; - - private SAMFileHeader 
header; - private SAMReaderID testBAM; - - public ReadShardBalancerTest( int numContigs, - int numStacksPerContig, - int stackSize, - int numUnmappedReads, - int downsamplingTargetCoverage ) { - super(ReadShardBalancerTest.class); - - this.numContigs = numContigs; - this.numStacksPerContig = numStacksPerContig; - this.stackSize = stackSize; - this.numUnmappedReads = numUnmappedReads; - - this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null); - this.expectedReadCount = Math.min(stackSize, downsamplingTargetCoverage) * numStacksPerContig * numContigs + numUnmappedReads; - - setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", - getClass().getSimpleName(), numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage)); - } - - public void run() { - createTestBAM(); - - SAMDataSource dataSource = new SAMDataSource(Arrays.asList(testBAM), - new ThreadAllocation(), - null, - new GenomeLocParser(header.getSequenceDictionary()), - false, - ValidationStringency.SILENT, - ReadShard.DEFAULT_MAX_READS, // reset ReadShard.MAX_READS to ReadShard.DEFAULT_MAX_READS for each test - downsamplingMethod, - new ValidationExclusion(), - new ArrayList(), - false); - - Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); - - SAMRecord readAtEndOfLastShard = null; - int totalReadsSeen = 0; - - for ( Shard shard : shardIterator ) { - int numContigsThisShard = 0; - SAMRecord lastRead = null; - - for ( SAMRecord read : shard.iterator() ) { - totalReadsSeen++; - - if ( lastRead == null ) { - numContigsThisShard = 1; - } - else if ( ! read.getReadUnmappedFlag() && ! 
lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) { - numContigsThisShard++; - } - - // If the last read from the previous shard is not unmapped, we have to make sure - // that no reads in this shard start at the same position - if ( readAtEndOfLastShard != null && ! readAtEndOfLastShard.getReadUnmappedFlag() ) { - Assert.assertFalse(readAtEndOfLastShard.getReferenceIndex().equals(read.getReferenceIndex()) && - readAtEndOfLastShard.getAlignmentStart() == read.getAlignmentStart(), - String.format("Reads from alignment start position %d:%d are split across multiple shards", - read.getReferenceIndex(), read.getAlignmentStart())); - } - - lastRead = read; - } - - // There should never be reads from more than 1 contig in a shard (ignoring unmapped reads) - Assert.assertTrue(numContigsThisShard == 1, "found a shard with reads from multiple contigs"); - - readAtEndOfLastShard = lastRead; - } - - Assert.assertEquals(totalReadsSeen, expectedReadCount, "did not encounter the expected number of reads"); - } - - private void createTestBAM() { - header = ArtificialSAMUtils.createArtificialSamHeader(numContigs, 1, 100000); - SAMReadGroupRecord readGroup = new SAMReadGroupRecord("foo"); - readGroup.setSample("testSample"); - header.addReadGroup(readGroup); - ArtificialSingleSampleReadStream artificialReads = new ArtificialSingleSampleReadStream(header, - "foo", - numContigs, - numStacksPerContig, - stackSize, - stackSize, - 1, - 100, - 50, - 150, - numUnmappedReads); - - final File testBAMFile = createTempFile("SAMDataSourceFillShardBoundaryTest", ".bam"); - - SAMFileWriter bamWriter = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, true, testBAMFile); - for ( SAMRecord read : artificialReads ) { - bamWriter.addAlignment(read); - } - bamWriter.close(); - - testBAM = new SAMReaderID(testBAMFile, new Tags()); - - new File(testBAM.getSamFilePath().replace(".bam", ".bai")).deleteOnExit(); - new File(testBAM.getSamFilePath() + 
".bai").deleteOnExit(); - } - } - - @DataProvider(name = "ReadShardBalancerTestDataProvider") - public Object[][] createReadShardBalancerTests() { - for ( int numContigs = 1; numContigs <= 3; numContigs++ ) { - for ( int numStacksPerContig : Arrays.asList(1, 2, 4) ) { - // Use crucial read shard boundary values as the stack sizes - for ( int stackSize : Arrays.asList(ReadShard.DEFAULT_MAX_READS / 2, ReadShard.DEFAULT_MAX_READS / 2 + 10, ReadShard.DEFAULT_MAX_READS, ReadShard.DEFAULT_MAX_READS - 1, ReadShard.DEFAULT_MAX_READS + 1, ReadShard.DEFAULT_MAX_READS * 2) ) { - for ( int numUnmappedReads : Arrays.asList(0, ReadShard.DEFAULT_MAX_READS / 2, ReadShard.DEFAULT_MAX_READS * 2) ) { - // The first value will result in no downsampling at all, the others in some downsampling - for ( int downsamplingTargetCoverage : Arrays.asList(ReadShard.DEFAULT_MAX_READS * 10, ReadShard.DEFAULT_MAX_READS, ReadShard.DEFAULT_MAX_READS / 2) ) { - new ReadShardBalancerTest(numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage); - } - } - } - } - } - - return ReadShardBalancerTest.getTests(ReadShardBalancerTest.class); - } - - @Test(dataProvider = "ReadShardBalancerTestDataProvider") - public void runReadShardBalancerTest( ReadShardBalancerTest test ) { - logger.warn("Running test: " + test); - - test.run(); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java deleted file mode 100644 index 526b8ce02..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMDataSourceUnitTest.java +++ /dev/null @@ -1,253 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the 
"Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.*; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.testng.annotations.AfterMethod; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.io.File; 
-import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import static org.testng.Assert.*; - -/** - *

- * Class SAMDataSourceUnitTest - *

- * The test of the SAMBAM simple data source. - */ -public class SAMDataSourceUnitTest extends BaseTest { - - // TODO: These legacy tests should really be replaced with a more comprehensive suite of tests for SAMDataSource - - private List readers; - private IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - - /** - * This function does the setup of our parser, before each method call. - *

- * Called before every test case method. - */ - @BeforeMethod - public void doForEachTest() throws FileNotFoundException { - readers = new ArrayList(); - - // sequence - seq = new CachingIndexedFastaSequenceFile(new File(b36KGReference)); - genomeLocParser = new GenomeLocParser(seq.getSequenceDictionary()); - } - - /** - * Tears down the test fixture after each call. - *

- * Called after every test case method. - */ - @AfterMethod - public void undoForEachTest() { - seq = null; - readers.clear(); - } - - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testLinearBreakIterateAll() { - logger.warn("Executing testLinearBreakIterateAll"); - - // setup the data - readers.add(new SAMReaderID(new File(validationDataLocation+"/NA12878.chrom6.SLX.SRP000032.2009_06.selected.bam"),new Tags())); - - // the sharding strat. - SAMDataSource data = new SAMDataSource(readers, - new ThreadAllocation(), - null, - genomeLocParser, - false, - ValidationStringency.SILENT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - false); - - Iterable strat = data.createShardIteratorOverMappedReads(new LocusShardBalancer()); - int count = 0; - - try { - for (Shard sh : strat) { - int readCount = 0; - count++; - - GenomeLoc firstLocus = sh.getGenomeLocs().get(0), lastLocus = sh.getGenomeLocs().get(sh.getGenomeLocs().size()-1); - logger.debug("Start : " + firstLocus.getStart() + " stop : " + lastLocus.getStop() + " contig " + firstLocus.getContig()); - logger.debug("count = " + count); - GATKSAMIterator datum = data.seek(sh); - - // for the first couple of shards make sure we can see the reads - if (count < 5) { - for (SAMRecord r : datum) { - } - readCount++; - } - datum.close(); - - // if we're over 100 shards, break out - if (count > 100) { - break; - } - } - } - catch (UserException.CouldNotReadInputFile e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. 
- fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); - } - } - - /** Test that we clear program records when requested */ - @Test - public void testRemoveProgramRecords() { - logger.warn("Executing testRemoveProgramRecords"); - - // setup the data - readers.add(new SAMReaderID(new File(b37GoodBAM),new Tags())); - - // use defaults - SAMDataSource data = new SAMDataSource(readers, - new ThreadAllocation(), - null, - genomeLocParser, - false, - ValidationStringency.SILENT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - false); - - List defaultProgramRecords = data.getHeader().getProgramRecords(); - assertTrue(defaultProgramRecords.size() != 0, "testRemoveProgramRecords: No program records found when using default constructor"); - - boolean removeProgramRecords = false; - data = new SAMDataSource(readers, - new ThreadAllocation(), - null, - genomeLocParser, - false, - ValidationStringency.SILENT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - Collections.emptyList(), - false, - (byte) -1, - removeProgramRecords, - false, - null, IntervalMergingRule.ALL); - - List dontRemoveProgramRecords = data.getHeader().getProgramRecords(); - assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false"); - - removeProgramRecords = true; - data = new SAMDataSource(readers, - new ThreadAllocation(), - null, - genomeLocParser, - false, - ValidationStringency.SILENT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - Collections.emptyList(), - false, - (byte) -1, - removeProgramRecords, - false, - null, IntervalMergingRule.ALL); - - List doRemoveProgramRecords = data.getHeader().getProgramRecords(); - assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true"); - } - - @Test(expectedExceptions = UserException.class) - 
public void testFailOnReducedReads() { - readers.add(new SAMReaderID(new File(privateTestDir + "old.reduced.bam"), new Tags())); - - SAMDataSource data = new SAMDataSource(readers, - new ThreadAllocation(), - null, - genomeLocParser, - false, - ValidationStringency.SILENT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - false); - } - - @Test(expectedExceptions = UserException.class) - public void testFailOnReducedReadsRemovingProgramRecords() { - readers.add(new SAMReaderID(new File(privateTestDir + "old.reduced.bam"), new Tags())); - - SAMDataSource data = new SAMDataSource(readers, - new ThreadAllocation(), - null, - genomeLocParser, - false, - ValidationStringency.SILENT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - Collections.emptyList(), - false, - (byte) -1, - true, - false, - null, IntervalMergingRule.ALL); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderIDUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderIDUnitTest.java deleted file mode 100644 index bb1cd7521..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reads/SAMReaderIDUnitTest.java +++ /dev/null @@ -1,49 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.reads; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; - -public class SAMReaderIDUnitTest extends BaseTest { - - @Test - public void testSAMReaderIDHashingAndEquality() { - // Test to make sure that two SAMReaderIDs that point at the same file via an absolute vs. relative - // path are equal according to equals() and have the same hash code - final File relativePathToBAMFile = new File(publicTestDir + "exampleBAM.bam"); - final File absolutePathToBAMFile = new File(relativePathToBAMFile.getAbsolutePath()); - final SAMReaderID relativePathSAMReaderID = new SAMReaderID(relativePathToBAMFile, new Tags()); - final SAMReaderID absolutePathSAMReaderID = new SAMReaderID(absolutePathToBAMFile, new Tags()); - - Assert.assertEquals(relativePathSAMReaderID, absolutePathSAMReaderID, "Absolute-path and relative-path SAMReaderIDs not equal according to equals()"); - Assert.assertEquals(relativePathSAMReaderID.hashCode(), absolutePathSAMReaderID.hashCode(), "Absolute-path and relative-path SAMReaderIDs have different hash codes"); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSourceIntegrationTest.java 
b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSourceIntegrationTest.java deleted file mode 100644 index 46a4cb5f1..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/reference/ReferenceDataSourceIntegrationTest.java +++ /dev/null @@ -1,75 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.reference; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.io.File; -import java.io.IOException; - -public class ReferenceDataSourceIntegrationTest extends WalkerTest { - - @Test - public void testReferenceWithMissingFaiFile() throws IOException { - final File dummyReference = createTempFile("dummy", ".fasta"); - final File dictFile = new File(dummyReference.getAbsolutePath().replace(".fasta", ".dict")); - dictFile.deleteOnExit(); - Assert.assertTrue(dictFile.createNewFile()); - - final WalkerTestSpec spec = new WalkerTestSpec( - " -T PrintReads" + - " -R " + dummyReference.getAbsolutePath() + - " -I " + privateTestDir + "NA12878.4.snippet.bam" + - " -o %s", - 1, - UserException.MissingReferenceFaiFile.class - ); - - executeTest("testReferenceWithMissingFaiFile", spec); - } - - @Test - public void testReferenceWithMissingDictFile() throws IOException { - final File dummyReference = createTempFile("dummy", ".fasta"); - final File faiFile = new File(dummyReference.getAbsolutePath() + ".fai"); - faiFile.deleteOnExit(); - Assert.assertTrue(faiFile.createNewFile()); - - final WalkerTestSpec spec = new WalkerTestSpec( - " -T PrintReads" + - " -R " + dummyReference.getAbsolutePath() + - " -I " + privateTestDir + "NA12878.4.snippet.bam" + - " -o %s", - 1, - UserException.MissingReferenceDictFile.class - ); - - executeTest("testReferenceWithMissingDictFile", spec); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java deleted file mode 100644 index baa2af098..000000000 --- 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedDataPoolUnitTest.java +++ /dev/null @@ -1,208 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.datasources.rmd; - -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.testng.Assert; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.codecs.table.TableFeature; -import org.broadinstitute.gatk.engine.refdata.utils.LocationAwareSeekableRODIterator; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet; -import org.broadinstitute.gatk.engine.refdata.utils.RMDTriplet.RMDStorageType; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; - -import static org.testng.Assert.assertTrue; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -/** - * User: hanna - * Date: May 21, 2009 - * Time: 11:03:04 AM - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or - * functionality. - */ - -/** - * Test the contents and number of iterators in the pool. 
- */ - -public class ReferenceOrderedDataPoolUnitTest extends BaseTest { - - private RMDTriplet triplet = null; - private RMDTrackBuilder builder = null; - - private IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - - private GenomeLoc testSite1; - private GenomeLoc testSite2; - private GenomeLoc testSite3; - - private GenomeLoc testInterval1; // an interval matching testSite1 -> testSite2 for queries - private GenomeLoc testInterval2; // an interval matching testSite2 -> testSite3 for queries - - - @BeforeClass - public void init() throws FileNotFoundException { - seq = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); - genomeLocParser = new GenomeLocParser(seq); - - testSite1 = genomeLocParser.createGenomeLoc("chrM",10); - testSite2 = genomeLocParser.createGenomeLoc("chrM",20); - testSite3 = genomeLocParser.createGenomeLoc("chrM",30); - testInterval1 = genomeLocParser.createGenomeLoc("chrM",10,20); - testInterval2 = genomeLocParser.createGenomeLoc("chrM",20,30); - } - - @BeforeMethod - public void setUp() { - String fileName = privateTestDir + "TabularDataTest.dat"; - - triplet = new RMDTriplet("tableTest","Table",fileName,RMDStorageType.FILE,new Tags()); - // disable auto-index creation/locking in the RMDTrackBuilder for tests - builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true,null); - } - - @Test - public void testCreateSingleIterator() { - ResourcePool iteratorPool = new ReferenceOrderedDataPool(triplet,builder,seq.getSequenceDictionary(),genomeLocParser,false); - LocationAwareSeekableRODIterator iterator = (LocationAwareSeekableRODIterator)iteratorPool.iterator( new MappedStreamSegment(testSite1) ); - - Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); - Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); - - TableFeature datum = 
(TableFeature)iterator.next().get(0).getUnderlyingObject(); - - assertTrue(datum.getLocation().equals(testSite1)); - assertTrue(datum.get("COL1").equals("A")); - assertTrue(datum.get("COL2").equals("B")); - assertTrue(datum.get("COL3").equals("C")); - - iteratorPool.release(iterator); - - Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); - Assert.assertEquals(iteratorPool.numAvailableIterators(), 1, "Number of available iterators in the pool is incorrect"); - } - - @Test - public void testCreateMultipleIterators() { - ReferenceOrderedQueryDataPool iteratorPool = new ReferenceOrderedQueryDataPool(triplet,builder,seq.getSequenceDictionary(),genomeLocParser); - LocationAwareSeekableRODIterator iterator1 = iteratorPool.iterator( new MappedStreamSegment(testInterval1) ); - - // Create a new iterator at position 2. - LocationAwareSeekableRODIterator iterator2 = iteratorPool.iterator( new MappedStreamSegment(testInterval2) ); - - Assert.assertEquals(iteratorPool.numIterators(), 2, "Number of iterators in the pool is incorrect"); - Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); - - // Test out-of-order access: first iterator2, then iterator1. - // Ugh...first call to a region needs to be a seek. - TableFeature datum = (TableFeature)iterator2.seekForward(testSite2).get(0).getUnderlyingObject(); - assertTrue(datum.getLocation().equals(testSite2)); - assertTrue(datum.get("COL1").equals("C")); - assertTrue(datum.get("COL2").equals("D")); - assertTrue(datum.get("COL3").equals("E")); - - datum = (TableFeature)iterator1.next().get(0).getUnderlyingObject(); - assertTrue(datum.getLocation().equals(testSite1)); - assertTrue(datum.get("COL1").equals("A")); - assertTrue(datum.get("COL2").equals("B")); - assertTrue(datum.get("COL3").equals("C")); - - // Advance iterator2, and make sure both iterator's contents are still correct. 
- datum = (TableFeature)iterator2.next().get(0).getUnderlyingObject(); - assertTrue(datum.getLocation().equals(testSite3)); - assertTrue(datum.get("COL1").equals("F")); - assertTrue(datum.get("COL2").equals("G")); - assertTrue(datum.get("COL3").equals("H")); - - datum = (TableFeature)iterator1.next().get(0).getUnderlyingObject(); - assertTrue(datum.getLocation().equals(testSite2)); - assertTrue(datum.get("COL1").equals("C")); - assertTrue(datum.get("COL2").equals("D")); - assertTrue(datum.get("COL3").equals("E")); - - // Cleanup, and make sure the number of iterators dies appropriately. - iteratorPool.release(iterator1); - - Assert.assertEquals(iteratorPool.numIterators(), 2, "Number of iterators in the pool is incorrect"); - Assert.assertEquals(iteratorPool.numAvailableIterators(), 1, "Number of available iterators in the pool is incorrect"); - - iteratorPool.release(iterator2); - - Assert.assertEquals(iteratorPool.numIterators(), 2, "Number of iterators in the pool is incorrect"); - Assert.assertEquals(iteratorPool.numAvailableIterators(), 2, "Number of available iterators in the pool is incorrect"); - } - - @Test - public void testIteratorConservation() { - ReferenceOrderedDataPool iteratorPool = new ReferenceOrderedDataPool(triplet,builder,seq.getSequenceDictionary(),genomeLocParser,false); - LocationAwareSeekableRODIterator iterator = iteratorPool.iterator( new MappedStreamSegment(testSite1) ); - - Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); - Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); - - TableFeature datum = (TableFeature)iterator.next().get(0).getUnderlyingObject(); - assertTrue(datum.getLocation().equals(testSite1)); - assertTrue(datum.get("COL1").equals("A")); - assertTrue(datum.get("COL2").equals("B")); - assertTrue(datum.get("COL3").equals("C")); - - iteratorPool.release(iterator); - - // Create another iterator after the 
current iterator. - iterator = iteratorPool.iterator( new MappedStreamSegment(testSite3) ); - - // Make sure that the previously acquired iterator was reused. - Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); - Assert.assertEquals(iteratorPool.numAvailableIterators(), 0, "Number of available iterators in the pool is incorrect"); - - datum = (TableFeature)iterator.seekForward(testSite3).get(0).getUnderlyingObject(); - assertTrue(datum.getLocation().equals(testSite3)); - assertTrue(datum.get("COL1").equals("F")); - assertTrue(datum.get("COL2").equals("G")); - assertTrue(datum.get("COL3").equals("H")); - - iteratorPool.release(iterator); - - Assert.assertEquals(iteratorPool.numIterators(), 1, "Number of iterators in the pool is incorrect"); - Assert.assertEquals(iteratorPool.numAvailableIterators(), 1, "Number of available iterators in the pool is incorrect"); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java deleted file mode 100644 index 6c403cd83..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/datasources/rmd/ReferenceOrderedQueryDataPoolUnitTest.java +++ /dev/null @@ -1,89 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* 
included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.datasources.rmd; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.refdata.utils.*; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.util.List; - -public class ReferenceOrderedQueryDataPoolUnitTest extends BaseTest{ - @Test - public void testCloseFilePointers() throws IOException { - // Build up query parameters - File file = new File(BaseTest.privateTestDir + "NA12878.hg19.example1.vcf"); - RMDTriplet triplet = new RMDTriplet("test", "VCF", file.getAbsolutePath(), RMDTriplet.RMDStorageType.FILE, new Tags()); - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); - GenomeLocParser parser = new GenomeLocParser(seq); - GenomeLoc loc = parser.createGenomeLoc("20", 1, 100000); - TestRMDTrackBuilder builder = new TestRMDTrackBuilder(seq.getSequenceDictionary(), parser); - - // Create the query data pool - ReferenceOrderedQueryDataPool pool = new ReferenceOrderedQueryDataPool(triplet, builder, 
seq.getSequenceDictionary(), parser); - - for (int i = 0; i < 3; i++) { - // Ensure our tribble iterators are closed. - CheckableCloseableTribbleIterator.clearThreadIterators(); - Assert.assertTrue(CheckableCloseableTribbleIterator.getThreadIterators().isEmpty(), "Tribble iterators list was not cleared."); - - // Request the the rodIterator - LocationAwareSeekableRODIterator rodIterator = pool.iterator(new MappedStreamSegment(loc)); - - // Run normal iteration over rodIterator - Assert.assertTrue(rodIterator.hasNext(), "Rod iterator does not have a next value."); - GenomeLoc rodIteratorLocation = rodIterator.next().getLocation(); - Assert.assertEquals(rodIteratorLocation.getContig(), "20", "Instead of chr 20 rod iterator was at location " + rodIteratorLocation); - - // Check that the underlying tribbleIterators are still open. - List> tribbleIterators = CheckableCloseableTribbleIterator.getThreadIterators(); - Assert.assertFalse(tribbleIterators.isEmpty(), "Tribble iterators list is empty"); - for (CheckableCloseableTribbleIterator tribbleIterator: tribbleIterators) { - Assert.assertFalse(tribbleIterator.isClosed(), "Tribble iterator is closed but should be still open."); - } - - // Releasing the rodIterator should close the underlying tribbleIterator. - pool.release(rodIterator); - - // Check that the underlying tribbleIterators are now closed. - for (CheckableCloseableTribbleIterator tribbleIterator: tribbleIterators) { - Assert.assertTrue(tribbleIterator.isClosed(), "Tribble iterator is open but should be now closed."); - } - } - - // Extra cleanup. 
- CheckableCloseableTribbleIterator.clearThreadIterators(); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java deleted file mode 100644 index 2d86f73c4..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java +++ /dev/null @@ -1,219 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.CigarElement; -import htsjdk.samtools.CigarOperator; -import htsjdk.samtools.SAMFileHeader; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.pileup.PileupElement; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - - -/** - * Basic unit test for AlleleBiasedDownsamplingUtils - */ -public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { - - - @Test - public void testSmartDownsampling() { - - final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50}; - final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0}; - - // no contamination, no removal - testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - - // hom sample, het contaminant, different alleles - testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - - // hom sample, hom contaminant, different alleles - testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); - - // het sample, het contaminant, different alleles - testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - - // het 
sample, hom contaminant, different alleles - testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - - // hom sample, het contaminant, overlapping alleles - final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0}; - testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); - testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); - testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); - - // hom sample, hom contaminant, overlapping alleles - testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0}); - - // het sample, het contaminant, overlapping alleles - testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55}); - testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - - // het sample, hom contaminant, overlapping alleles - testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); - } - - private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction, - final int pileupSize, final int[] initialCounts, final int[] targetCounts) { - - final int[] actualCounts = initialCounts.clone(); - actualCounts[0] += addA; - actualCounts[1] += addC; - actualCounts[2] += addG; - actualCounts[3] += addT; - - final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction)); - Assert.assertTrue(countsAreEqual(results, targetCounts)); - } - - private 
static boolean countsAreEqual(final int[] counts1, final int[] counts2) { - for ( int i = 0; i < 4; i++ ) { - if ( counts1[i] != counts2[i] ) - return false; - } - return true; - } - - @DataProvider(name = "BiasedDownsamplingTest") - public Object[][] makeBiasedDownsamplingTest() { - final List tests = new LinkedList(); - - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - - for ( final int originalCount : Arrays.asList(1, 2, 10, 1000) ) { - for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) { - if ( toRemove <= originalCount ) - tests.add(new Object[]{header, originalCount, toRemove}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "BiasedDownsamplingTest") - public void testBiasedDownsampling(final SAMFileHeader header, final int originalCount, final int toRemove) { - - final LinkedList elements = new LinkedList<>(); - for ( int i = 0; i < originalCount; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); - elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); - } - - final List result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalCount, toRemove); - - Assert.assertEquals(result.size(), toRemove); - } - - @Test - public void testLoadContaminationFileDetails(){ - Logger logger=org.apache.log4j.Logger.getRootLogger(); - - final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; - final File ContamFile1=new File(ArtificalBAMLocation+"contamination.case.1.txt"); - - Map Contam1=new HashMap(); - Set Samples1=new HashSet(); - - Contam1.put("NA11918",0.15); - Samples1.addAll(Contam1.keySet()); - testLoadFile(ContamFile1,Samples1,Contam1,logger); - - Contam1.put("NA12842",0.13); - Samples1.addAll(Contam1.keySet()); - testLoadFile(ContamFile1,Samples1,Contam1,logger); - - Samples1.add("DUMMY"); - testLoadFile(ContamFile1,Samples1,Contam1,logger); - } - - 
private static void testLoadFile(final File file, final Set Samples, final Map map, Logger logger){ - Map loadedMap = AlleleBiasedDownsamplingUtils.loadContaminationFile(file,0.0,Samples,logger); - Assert.assertTrue(loadedMap.equals(map)); - } - - @DataProvider(name = "goodContaminationFiles") - public Integer[][] goodContaminationFiles() { - return new Integer[][]{ - {1, 2}, - {2, 3}, - {3, 2}, - {4, 2}, - {5, 3}, - {6, 2}, - {7, 2}, - {8, 2} - }; - } - - @Test(dataProvider = "goodContaminationFiles") - public void testLoadContaminationFile(final Integer ArtificalBAMnumber, final Integer numberOfSamples) { - final String ArtificialBAM = String.format("ArtificallyContaminatedBams/contamination.case.%d.txt", ArtificalBAMnumber); - Logger logger = org.apache.log4j.Logger.getRootLogger(); - - File ContamFile = new File(privateTestDir, ArtificialBAM); - Assert.assertTrue(AlleleBiasedDownsamplingUtils.loadContaminationFile(ContamFile, 0.0, null, logger).size() == numberOfSamples); - - } - - - @DataProvider(name = "badContaminationFiles") - public Integer[][] badContaminationFiles() { - return new Integer[][]{{1}, {2}, {3}, {4}, {5}}; - } - - @Test(dataProvider = "badContaminationFiles", expectedExceptions = UserException.MalformedFile.class) - public void testLoadBrokenContaminationFile(final int i) { - Logger logger = org.apache.log4j.Logger.getRootLogger(); - final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; - - File ContaminationFile = new File(ArtificalBAMLocation + String.format("contamination.case.broken.%d.txt", i)); - AlleleBiasedDownsamplingUtils.loadContaminationFile(ContaminationFile, 0.0, null, logger); - - } - - -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingIntegrationTest.java deleted file mode 100644 index 2f171de0b..000000000 --- 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingIntegrationTest.java +++ /dev/null @@ -1,44 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.testng.annotations.Test; - -public class DownsamplingIntegrationTest extends WalkerTest { - - @Test - public void testDetectLowDcovValueWithLocusTraversal() { - final WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci -R " + publicTestDir + "exampleFASTA.fasta -I " + publicTestDir + "exampleBAM.bam -o %s " + - "-dcov " + (DownsamplingMethod.MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS - 1), - 1, - UserException.class - ); - executeTest("testDetectLowDcovValueWithLocusTraversal", spec); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIteratorUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIteratorUnitTest.java deleted file mode 100644 index 19eec62cf..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/DownsamplingReadsIteratorUnitTest.java +++ /dev/null @@ -1,139 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStream; -import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class DownsamplingReadsIteratorUnitTest extends BaseTest { - - private static class DownsamplingReadsIteratorTest extends TestDataProvider { - private DownsamplingReadsIterator downsamplingIter; - private int targetCoverage; - private ArtificialSingleSampleReadStream stream; - private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; - - public DownsamplingReadsIteratorTest( ArtificialSingleSampleReadStream stream, int targetCoverage ) { - super(DownsamplingReadsIteratorTest.class); - - this.stream = stream; - this.targetCoverage = targetCoverage; - - setName(String.format("%s: targetCoverage=%d numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", - getClass().getSimpleName(), - targetCoverage, - stream.getNumContigs(), - stream.getNumStacksPerContig(), - stream.getMinReadsPerStack(), - 
stream.getMaxReadsPerStack(), - stream.getMinDistanceBetweenStacks(), - stream.getMaxDistanceBetweenStacks(), - stream.getMinReadLength(), - stream.getMaxReadLength(), - stream.getNumUnmappedReads())); - } - - public void run() { - streamAnalyzer = new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(stream, targetCoverage); - downsamplingIter = new DownsamplingReadsIterator(stream.getGATKSAMIterator(), new SimplePositionalDownsampler(targetCoverage)); - - streamAnalyzer.analyze(downsamplingIter); - - // Check whether the observed properties of the downsampled stream are what they should be - streamAnalyzer.validate(); - - // Allow memory used by this test to be reclaimed - stream = null; - streamAnalyzer = null; - downsamplingIter = null; - } - } - - @DataProvider(name = "DownsamplingReadsIteratorTestDataProvider") - public Object[][] createDownsamplingReadsIteratorTests() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(5, 1, 10000); - String readGroupID = "testReadGroup"; - SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); - readGroup.setSample("testSample"); - header.addReadGroup(readGroup); - - // Values that don't vary across tests - int targetCoverage = 10; - int minReadLength = 50; - int maxReadLength = 100; - int minDistanceBetweenStacks = 1; - int maxDistanceBetweenStacks = maxReadLength + 1; - - GenomeAnalysisEngine.resetRandomGenerator(); - - // brute force testing! 
- for ( int numContigs : Arrays.asList(1, 2, 5) ) { - for ( int stacksPerContig : Arrays.asList(1, 2, 10) ) { - for ( int minReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { - for ( int maxReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { - for ( int numUnmappedReads : Arrays.asList(0, 1, targetCoverage, targetCoverage * 2) ) { - // Only interested in sane read stream configurations here - if ( minReadsPerStack <= maxReadsPerStack ) { - new DownsamplingReadsIteratorTest(new ArtificialSingleSampleReadStream(header, - readGroupID, - numContigs, - stacksPerContig, - minReadsPerStack, - maxReadsPerStack, - minDistanceBetweenStacks, - maxDistanceBetweenStacks, - minReadLength, - maxReadLength, - numUnmappedReads), - targetCoverage); - } - } - } - } - } - } - - return DownsamplingReadsIteratorTest.getTests(DownsamplingReadsIteratorTest.class); - } - - @Test(dataProvider = "DownsamplingReadsIteratorTestDataProvider") - public void runDownsamplingReadsIteratorTest( DownsamplingReadsIteratorTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - test.run(); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerUnitTest.java deleted file mode 100644 index 918537439..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/FractionalDownsamplerUnitTest.java +++ /dev/null @@ -1,158 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, 
including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.List; - -public class FractionalDownsamplerUnitTest extends BaseTest { - - private static class FractionalDownsamplerTest extends TestDataProvider { - double fraction; - int totalReads; - int expectedMinNumReadsAfterDownsampling; - int expectedMaxNumReadsAfterDownsampling; - int expectedMinDiscardedItems; - int expectedMaxDiscardedItems; - - private static final double EXPECTED_ACCURACY = 0.05; // should be accurate to within +/- this percent - - public FractionalDownsamplerTest( double fraction, int totalReads ) { - super(FractionalDownsamplerTest.class); - - this.fraction = fraction; - 
this.totalReads = totalReads; - - calculateExpectations(); - - setName(String.format("%s: fraction=%.2f totalReads=%d expectedMinNumReadsAfterDownsampling=%d expectedMaxNumReadsAfterDownsampling=%d", - getClass().getSimpleName(), fraction, totalReads, expectedMinNumReadsAfterDownsampling, expectedMaxNumReadsAfterDownsampling)); - } - - private void calculateExpectations() { - // Require an exact match in the 0% and 100% cases - if ( fraction == 0.0 ) { - expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = 0; - expectedMinDiscardedItems = expectedMaxDiscardedItems = totalReads; - } - else if ( fraction == 1.0 ) { - expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = totalReads; - expectedMinDiscardedItems = expectedMaxDiscardedItems = 0; - } - else { - expectedMinNumReadsAfterDownsampling = Math.max((int)((fraction - EXPECTED_ACCURACY) * totalReads), 0); - expectedMaxNumReadsAfterDownsampling = Math.min((int) ((fraction + EXPECTED_ACCURACY) * totalReads), totalReads); - expectedMinDiscardedItems = totalReads - expectedMaxNumReadsAfterDownsampling; - expectedMaxDiscardedItems = totalReads - expectedMinNumReadsAfterDownsampling; - } - } - - public Collection createReads() { - Collection reads = new ArrayList(totalReads); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); - - return reads; - } - } - - @DataProvider(name = "FractionalDownsamplerTestDataProvider") - public Object[][] createFractionalDownsamplerTestData() { - for ( double fraction : Arrays.asList(0.0, 0.25, 0.5, 0.75, 1.0) ) { - for ( int totalReads : Arrays.asList(0, 1000, 10000) ) { - new FractionalDownsamplerTest(fraction, totalReads); - } - } - - return FractionalDownsamplerTest.getTests(FractionalDownsamplerTest.class); - } - - @Test(dataProvider = "FractionalDownsamplerTestDataProvider") - public void 
runFractionalDownsamplerTest( FractionalDownsamplerTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - - ReadsDownsampler downsampler = new FractionalDownsampler(test.fraction); - - downsampler.submit(test.createReads()); - - if ( test.totalReads > 0 ) { - if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - } - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - downsampler.signalEndOfInput(); - - if ( test.totalReads > 0 ) { - if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - } - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - List downsampledReads = downsampler.consumeFinalizedItems(); - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - - Assert.assertTrue(downsampledReads.size() >= test.expectedMinNumReadsAfterDownsampling && - downsampledReads.size() <= test.expectedMaxNumReadsAfterDownsampling); - - Assert.assertTrue(downsampler.getNumberOfDiscardedItems() >= test.expectedMinDiscardedItems && - downsampler.getNumberOfDiscardedItems() <= test.expectedMaxDiscardedItems); - - 
Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size()); - - downsampler.resetStats(); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsamplerUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsamplerUnitTest.java deleted file mode 100644 index 2544b72fd..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/LevelingDownsamplerUnitTest.java +++ /dev/null @@ -1,163 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.testng.annotations.Test; -import org.testng.annotations.DataProvider; -import org.testng.Assert; - -import java.util.*; - -public class LevelingDownsamplerUnitTest extends BaseTest { - - private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider { - public enum DataStructure { LINKED_LIST, ARRAY_LIST } - - int targetSize; - int numStacks; - int stackSize; - DataStructure dataStructure; - int expectedSize; - - public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) { - super(LevelingDownsamplerUniformStacksTest.class); - - this.targetSize = targetSize; - this.numStacks = numStacks; - this.stackSize = stackSize; - this.dataStructure = dataStructure; - expectedSize = calculateExpectedDownsampledStackSize(); - - setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d", - getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize)); - } - - public Collection> createStacks() { - Collection> stacks = new ArrayList>(); - - for ( int i = 1; i <= numStacks; i++ ) { - List stack = dataStructure == DataStructure.LINKED_LIST ? 
new LinkedList() : new ArrayList(); - - for ( int j = 1; j <= stackSize; j++ ) { - stack.add(new Object()); - } - - stacks.add(stack); - } - - return stacks; - } - - private int calculateExpectedDownsampledStackSize() { - int numItemsToRemove = numStacks * stackSize - targetSize; - - if ( numStacks == 0 ) { - return 0; - } - else if ( numItemsToRemove <= 0 ) { - return stackSize; - } - - return Math.max(1, stackSize - (numItemsToRemove / numStacks)); - } - } - - @DataProvider(name = "UniformStacksDataProvider") - public Object[][] createUniformStacksTestData() { - for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) { - for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { - for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) { - for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) { - new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure); - } - } - } - } - - return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class); - } - - @Test( dataProvider = "UniformStacksDataProvider" ) - public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - - Downsampler> downsampler = new LevelingDownsampler, Object>(test.targetSize); - - downsampler.submit(test.createStacks()); - - if ( test.numStacks > 0 ) { - Assert.assertFalse(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() == null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - downsampler.signalEndOfInput(); - - if ( 
test.numStacks > 0 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - final int sizeFromDownsampler = downsampler.size(); - List> downsampledStacks = downsampler.consumeFinalizedItems(); - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - - Assert.assertEquals(downsampledStacks.size(), test.numStacks); - - int totalRemainingItems = 0; - for ( List stack : downsampledStacks ) { - Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1); - totalRemainingItems += stack.size(); - } - - Assert.assertEquals(sizeFromDownsampler, totalRemainingItems); - int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); - int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; - - Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); - - downsampler.resetStats(); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); - - Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java deleted file mode 100644 index 2606a01d3..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java +++ /dev/null @@ -1,299 
+0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import org.broadinstitute.gatk.engine.iterators.VerifyingSamIterator; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.sam.ArtificialMultiSampleReadStream; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStream; -import org.broadinstitute.gatk.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -public class PerSampleDownsamplingReadsIteratorUnitTest extends BaseTest { - - private static class PerSampleDownsamplingReadsIteratorTest extends TestDataProvider { - - // TODO: tests should distinguish between variance across samples and variance within a sample - - private enum StreamDensity { - SPARSE (MAX_READ_LENGTH, MAX_READ_LENGTH * 2), - DENSE (1, MIN_READ_LENGTH), - MIXED (1, MAX_READ_LENGTH * 2), - UNIFORM_DENSE (1, 1), - UNIFORM_SPARSE (MAX_READ_LENGTH * 2, MAX_READ_LENGTH * 2); - - int minDistanceBetweenStacks; - int maxDistanceBetweenStacks; - - StreamDensity( int minDistanceBetweenStacks, int maxDistanceBetweenStacks ) { - this.minDistanceBetweenStacks = minDistanceBetweenStacks; - this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; - } - - public String toString() { - return String.format("StreamDensity:%d-%d", minDistanceBetweenStacks, maxDistanceBetweenStacks); - } - } - - private enum StreamStackDepth { - NON_UNIFORM_LOW (1, 5), - NON_UNIFORM_HIGH (15, 20), - NON_UNIFORM_MIXED (1, 20), - UNIFORM_SINGLE (1, 
1), - UNIFORM_LOW (2, 2), - UNIFORM_HIGH (20, 20), - UNIFORM_MEDIUM (10, 10); // should set target coverage to this value for testing - - int minReadsPerStack; - int maxReadsPerStack; - - StreamStackDepth( int minReadsPerStack, int maxReadsPerStack ) { - this.minReadsPerStack = minReadsPerStack; - this.maxReadsPerStack = maxReadsPerStack; - } - - public boolean isUniform() { - return minReadsPerStack == maxReadsPerStack; - } - - public String toString() { - return String.format("StreamStackDepth:%d-%d", minReadsPerStack, maxReadsPerStack); - } - } - - private enum StreamStacksPerContig { - UNIFORM(20, 20), - NON_UNIFORM(1, 30); - - int minStacksPerContig; - int maxStacksPerContig; - - StreamStacksPerContig( int minStacksPerContig, int maxStacksPerContig ) { - this.minStacksPerContig = minStacksPerContig; - this.maxStacksPerContig = maxStacksPerContig; - } - - public boolean isUniform() { - return minStacksPerContig == maxStacksPerContig; - } - - public String toString() { - return String.format("StreamStacksPerContig:%d-%d", minStacksPerContig, maxStacksPerContig); - } - } - - // Not interested in testing multiple ranges for the read lengths, as none of our current - // downsamplers are affected by read length - private static final int MIN_READ_LENGTH = 50; - private static final int MAX_READ_LENGTH = 150; - - private ReadsDownsamplerFactory downsamplerFactory; - private int targetCoverage; - private int numSamples; - private int minContigs; - private int maxContigs; - private StreamDensity streamDensity; - private StreamStackDepth streamStackDepth; - private StreamStacksPerContig streamStacksPerContig; - private double unmappedReadsFraction; - private int unmappedReadsCount; - private boolean verifySortedness; - - private ArtificialMultiSampleReadStream mergedReadStream; - private Map perSampleArtificialReadStreams; - private Map perSampleStreamAnalyzers; - private SAMFileHeader header; - - public PerSampleDownsamplingReadsIteratorTest( ReadsDownsamplerFactory 
downsamplerFactory, - int targetCoverage, - int numSamples, - int minContigs, - int maxContigs, - StreamDensity streamDensity, - StreamStackDepth streamStackDepth, - StreamStacksPerContig streamStacksPerContig, - double unmappedReadsFraction, - int unmappedReadsCount, - boolean verifySortedness ) { - super(PerSampleDownsamplingReadsIteratorTest.class); - - this.downsamplerFactory = downsamplerFactory; - this.targetCoverage = targetCoverage; - this.numSamples = numSamples; - this.minContigs = minContigs; - this.maxContigs = maxContigs; - this.streamDensity = streamDensity; - this.streamStackDepth = streamStackDepth; - this.streamStacksPerContig = streamStacksPerContig; - this.unmappedReadsFraction = unmappedReadsFraction; - this.unmappedReadsCount = unmappedReadsCount; - this.verifySortedness = verifySortedness; - - header = createHeader(); - createReadStreams(); - - setName(String.format("%s: targetCoverage=%d numSamples=%d minContigs=%d maxContigs=%d %s %s %s unmappedReadsFraction=%.2f unmappedReadsCount=%d verifySortedness=%b", - getClass().getSimpleName(), targetCoverage, numSamples, minContigs, maxContigs, streamDensity, streamStackDepth, streamStacksPerContig, unmappedReadsFraction, unmappedReadsCount, verifySortedness)); - } - - private SAMFileHeader createHeader() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(maxContigs, 1, (streamDensity.maxDistanceBetweenStacks + MAX_READ_LENGTH) * streamStacksPerContig.maxStacksPerContig + 100000); - List readGroups = new ArrayList(numSamples); - List sampleNames = new ArrayList(numSamples); - - for ( int i = 0; i < numSamples; i++ ) { - readGroups.add("ReadGroup" + i); - sampleNames.add("Sample" + i); - } - - return ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroups, sampleNames); - } - - private void createReadStreams() { - perSampleArtificialReadStreams = new HashMap(numSamples); - perSampleStreamAnalyzers = new HashMap(numSamples); - - for (SAMReadGroupRecord readGroup : 
header.getReadGroups() ) { - String readGroupID = readGroup.getReadGroupId(); - String sampleName = readGroup.getSample(); - - int thisSampleNumContigs = MathUtils.randomIntegerInRange(minContigs, maxContigs); - int thisSampleStacksPerContig = MathUtils.randomIntegerInRange(streamStacksPerContig.minStacksPerContig, streamStacksPerContig.maxStacksPerContig); - - int thisSampleNumUnmappedReads = GenomeAnalysisEngine.getRandomGenerator().nextDouble() < unmappedReadsFraction ? unmappedReadsCount : 0; - - ArtificialSingleSampleReadStream thisSampleStream = new ArtificialSingleSampleReadStream(header, - readGroupID, - thisSampleNumContigs, - thisSampleStacksPerContig, - streamStackDepth.minReadsPerStack, - streamStackDepth.maxReadsPerStack, - streamDensity.minDistanceBetweenStacks, - streamDensity.maxDistanceBetweenStacks, - MIN_READ_LENGTH, - MAX_READ_LENGTH, - thisSampleNumUnmappedReads); - perSampleArtificialReadStreams.put(sampleName, thisSampleStream); - perSampleStreamAnalyzers.put(sampleName, new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(thisSampleStream, targetCoverage)); - } - - mergedReadStream = new ArtificialMultiSampleReadStream(perSampleArtificialReadStreams.values()); - } - - public void run() { - GATKSAMIterator downsamplingIter = new PerSampleDownsamplingReadsIterator(mergedReadStream.getGATKSAMIterator(), downsamplerFactory); - - if ( verifySortedness ) { - downsamplingIter = new VerifyingSamIterator(downsamplingIter); - } - - while ( downsamplingIter.hasNext() ) { - SAMRecord read = downsamplingIter.next(); - String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; - - ArtificialSingleSampleReadStreamAnalyzer analyzer = perSampleStreamAnalyzers.get(sampleName); - if ( analyzer != null ) { - analyzer.update(read); - } - else { - throw new ReviewedGATKException("bug: stream analyzer for sample " + sampleName + " not found"); - } - } - - for ( Map.Entry analyzerEntry : perSampleStreamAnalyzers.entrySet() ) { - ArtificialSingleSampleReadStreamAnalyzer analyzer = analyzerEntry.getValue(); - analyzer.finalizeStats(); - - // Validate the downsampled read stream for each sample individually - analyzer.validate(); - } - - // Allow memory used by this test to be reclaimed: - mergedReadStream = null; - perSampleArtificialReadStreams = null; - perSampleStreamAnalyzers = null; - } - } - - @DataProvider(name = "PerSampleDownsamplingReadsIteratorTestDataProvider") - public Object[][] createPerSampleDownsamplingReadsIteratorTests() { - - GenomeAnalysisEngine.resetRandomGenerator(); - - // Some values don't vary across tests - int targetCoverage = PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.UNIFORM_MEDIUM.minReadsPerStack; - ReadsDownsamplerFactory downsamplerFactory = new SimplePositionalDownsamplerFactory(targetCoverage); - int maxContigs = 3; - boolean verifySortedness = true; - - for ( int numSamples : Arrays.asList(1, 2, 10) ) { - for ( int minContigs = 1; minContigs <= maxContigs; minContigs++ ) { - for ( PerSampleDownsamplingReadsIteratorTest.StreamDensity streamDensity : PerSampleDownsamplingReadsIteratorTest.StreamDensity.values() ) { - for ( PerSampleDownsamplingReadsIteratorTest.StreamStackDepth streamStackDepth : PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.values() ) { - for (PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig streamStacksPerContig : PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig.values() ) { - for ( double unmappedReadsFraction : Arrays.asList(0.0, 1.0, 0.5) ) { - for ( int unmappedReadsCount : Arrays.asList(1, 50) ) { - new 
PerSampleDownsamplingReadsIteratorTest(downsamplerFactory, - targetCoverage, - numSamples, - minContigs, - maxContigs, - streamDensity, - streamStackDepth, - streamStacksPerContig, - unmappedReadsFraction, - unmappedReadsCount, - verifySortedness); - } - } - } - } - } - } - } - - return PerSampleDownsamplingReadsIteratorTest.getTests(PerSampleDownsamplingReadsIteratorTest.class); - } - - @Test(dataProvider = "PerSampleDownsamplingReadsIteratorTestDataProvider") - public void runPerSampleDownsamplingReadsIteratorTest( PerSampleDownsamplingReadsIteratorTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - test.run(); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerUnitTest.java deleted file mode 100644 index 4e6f157f1..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/ReservoirDownsamplerUnitTest.java +++ /dev/null @@ -1,131 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -public class ReservoirDownsamplerUnitTest extends BaseTest { - - private static class ReservoirDownsamplerTest extends TestDataProvider { - int reservoirSize; - int totalReads; - int expectedNumReadsAfterDownsampling; - int expectedNumDiscardedItems; - - public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) { - super(ReservoirDownsamplerTest.class); - - this.reservoirSize = reservoirSize; - this.totalReads = totalReads; - - expectedNumReadsAfterDownsampling = Math.min(reservoirSize, totalReads); - expectedNumDiscardedItems = totalReads <= reservoirSize ? 
0 : totalReads - reservoirSize; - - setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d", - getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems)); - } - - public Collection createReads() { - Collection reads = new ArrayList(totalReads); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); - - return reads; - } - } - - @DataProvider(name = "ReservoirDownsamplerTestDataProvider") - public Object[][] createReservoirDownsamplerTestData() { - for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) { - new ReservoirDownsamplerTest(reservoirSize, 0); - for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) { - new ReservoirDownsamplerTest(reservoirSize, totalReads); - } - } - - return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class); - } - - @Test(dataProvider = "ReservoirDownsamplerTestDataProvider") - public void testReservoirDownsampler( ReservoirDownsamplerTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - - ReadsDownsampler downsampler = new ReservoirDownsampler(test.reservoirSize); - - downsampler.submit(test.createReads()); - - if ( test.totalReads > 0 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - downsampler.signalEndOfInput(); - - if ( test.totalReads > 0 ) { - 
Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - Assert.assertEquals(downsampler.size(), test.expectedNumReadsAfterDownsampling); - List downsampledReads = downsampler.consumeFinalizedItems(); - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - - Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); - Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); - - downsampler.resetStats(); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerUnitTest.java deleted file mode 100644 index e04c347b3..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/downsampling/SimplePositionalDownsamplerUnitTest.java +++ /dev/null @@ -1,331 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of 
the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.downsampling; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.*; - -public class SimplePositionalDownsamplerUnitTest extends BaseTest { - - private static class SimplePositionalDownsamplerTest extends TestDataProvider { - int targetCoverage; - int numStacks; - List stackSizes; - List expectedStackSizes; - boolean multipleContigs; - int totalInitialReads; - - public SimplePositionalDownsamplerTest( int targetCoverage, List stackSizes, boolean multipleContigs ) { - super(SimplePositionalDownsamplerTest.class); - - this.targetCoverage = targetCoverage; - this.numStacks = stackSizes.size(); - this.stackSizes = stackSizes; - this.multipleContigs = multipleContigs; - - calculateExpectedDownsampledStackSizes(); - - totalInitialReads = 0; - for ( Integer stackSize : stackSizes ) { - totalInitialReads += stackSize; - } 
- - setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b", - getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs)); - } - - public Collection createReads() { - Collection reads = new ArrayList(); - SAMFileHeader header = multipleContigs ? - ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) : - ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - int refIndex = 0; - int alignmentStart = 1; - int readLength = 100; - - for ( int i = 0; i < numStacks; i++ ) { - if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) { - refIndex++; - } - - reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo", - refIndex, alignmentStart, readLength)); - - alignmentStart += 10; - } - - return reads; - } - - private void calculateExpectedDownsampledStackSizes() { - expectedStackSizes = new ArrayList(numStacks); - - for ( Integer stackSize : stackSizes ) { - int expectedSize = targetCoverage >= stackSize ? 
stackSize : targetCoverage; - expectedStackSizes.add(expectedSize); - } - } - } - - @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider") - public Object[][] createSimplePositionalDownsamplerTestData() { - GenomeAnalysisEngine.resetRandomGenerator(); - - for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) { - for ( int contigs = 1; contigs <= 2; contigs++ ) { - for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { - List stackSizes = new ArrayList(numStacks); - for ( int stack = 1; stack <= numStacks; stack++ ) { - stackSizes.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(targetCoverage * 2) + 1); - } - new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1); - } - } - } - - return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class); - } - - @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" ) - public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - - ReadsDownsampler downsampler = new SimplePositionalDownsampler(test.targetCoverage); - - downsampler.submit(test.createReads()); - - if ( test.numStacks > 1 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - } - else if ( test.numStacks == 1 ) { - Assert.assertFalse(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() == null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - downsampler.signalEndOfInput(); - - if ( 
test.numStacks > 0 ) { - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - } - else { - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - } - - List downsampledReads = downsampler.consumeFinalizedItems(); - Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); - - if ( test.numStacks == 0 ) { - Assert.assertTrue(downsampledReads.isEmpty()); - } - else { - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads); - - Assert.assertEquals(downsampledStackSizes.size(), test.numStacks); - Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes); - - int numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size(); - int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems(); - Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); - } - - downsampler.resetStats(); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); - } - - private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { - List stackSizes = new ArrayList(); - - if ( downsampledReads.isEmpty() ) { - return stackSizes; - } - - Iterator iter = downsampledReads.iterator(); - Assert.assertTrue(iter.hasNext()); - - SAMRecord previousRead = iter.next(); - int currentStackSize = 1; - - while ( iter.hasNext() ) { - SAMRecord currentRead = iter.next(); - - if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { - stackSizes.add(currentStackSize); - currentStackSize = 1; - } 
- else if ( currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { - Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); - } - else { - currentStackSize++; - } - - previousRead = currentRead; - } - - stackSizes.add(currentStackSize); - return stackSizes; - } - - @Test - public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() { - ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - Collection readStack = new ArrayList(); - readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100)); - downsampler.submit(readStack); - - Assert.assertFalse(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() == null); - Assert.assertTrue(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() != null); - - SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100); - downsampler.signalNoMoreReadsBefore(laterRead); - - Assert.assertTrue(downsampler.hasFinalizedItems()); - Assert.assertTrue(downsampler.peekFinalized() != null); - Assert.assertFalse(downsampler.hasPendingItems()); - Assert.assertTrue(downsampler.peekPending() == null); - - List downsampledReads = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(downsampledReads.size(), readStack.size()); - } - - @Test - public void testBasicUnmappedReadsSupport() { - ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - Collection readStack = new ArrayList(); - readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, - SAMRecord.NO_ALIGNMENT_START, 100)); - for ( SAMRecord read : 
readStack ) { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - - downsampler.submit(readStack); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeFinalizedItems(); - - // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler - Assert.assertEquals(downsampledReads.size(), readStack.size()); - - for ( SAMRecord read: downsampledReads ) { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - } - - @Test - public void testMixedMappedAndUnmappedReadsSupport() { - ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - Collection mappedReadStack = new ArrayList(); - mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100)); - for ( SAMRecord read : mappedReadStack ) { - Assert.assertFalse(read.getReadUnmappedFlag()); - } - - Collection unmappedReadStack = new ArrayList(); - unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, - SAMRecord.NO_ALIGNMENT_START, 100)); - for ( SAMRecord read : unmappedReadStack ) { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - - downsampler.submit(mappedReadStack); - downsampler.submit(unmappedReadStack); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeFinalizedItems(); - - // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler - Assert.assertEquals(downsampledReads.size(), 300); - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100); - - int count = 1; - for ( SAMRecord read: downsampledReads ) { - if ( count <= 100 ) { - Assert.assertFalse(read.getReadUnmappedFlag()); - } - else { - Assert.assertTrue(read.getReadUnmappedFlag()); - } - - count++; - } - } - - @Test - public void testGATKSAMRecordSupport() { - ReadsDownsampler downsampler = new 
SimplePositionalDownsampler(1000); - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - List reads = new ArrayList(); - for ( int i = 0; i < 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - List downsampledReads = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(downsampledReads.size(), 10); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/AllowNCigarMalformedReadFilterUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/AllowNCigarMalformedReadFilterUnitTest.java deleted file mode 100644 index 7a0122085..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/AllowNCigarMalformedReadFilterUnitTest.java +++ /dev/null @@ -1,77 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - - -import htsjdk.samtools.SAMRecord; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.Collections; - - -/** - * Tests for the {@link MalformedReadFilter} when the unsafe flag - * {@link ValidationExclusion.TYPE#ALLOW_N_CIGAR_READS} is set. - * - * @author Valentin Ruano-Rubio - * @since 6/6/13 - */ -public class AllowNCigarMalformedReadFilterUnitTest extends MalformedReadFilterUnitTest { - - - @Override - protected ValidationExclusion composeValidationExclusion() { - return new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS)); - } - - - @Test(enabled = true, - dataProvider= "UnsupportedCigarOperatorDataProvider") - @CigarOperatorTest(CigarOperatorTest.Outcome.IGNORE) - public void testCigarNOperatorFilterIgnore(final String cigarString) { - - final MalformedReadFilter filter = buildMalformedReadFilter(false); - final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString); - Assert.assertFalse(filter.filterOut(nContainingCigarRead), - "filters out N containing Cigar when it should ignore the fact"); - } - - @Test(enabled = false) - @Override - public void testCigarNOperatorFilterException(final String cigarString) { - // Nothing to do here. - // Just deactivates the parents test case. 
- } - - - - - - - -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java deleted file mode 100644 index 3ff8ed4dc..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/BadReadGroupsIntegrationTest.java +++ /dev/null @@ -1,52 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.filters; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.testng.annotations.Test; - - -public class BadReadGroupsIntegrationTest extends WalkerTest { - - @Test - public void testMissingReadGroup() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T PrintReads -R " + b36KGReference + " -I " + privateTestDir + "missingReadGroup.bam -o /dev/null", - 0, - UserException.ReadMissingReadGroup.class); - executeTest("test Missing Read Group", spec); - } - - @Test - public void testUndefinedReadGroup() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T PrintReads -R " + b36KGReference + " -I " + privateTestDir + "undefinedReadGroup.bam -o /dev/null", - 0, - UserException.ReadHasUndefinedReadGroup.class); - executeTest("test Undefined Read Group", spec); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilterUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilterUnitTest.java deleted file mode 100644 index d25db50e0..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/MalformedReadFilterUnitTest.java +++ /dev/null @@ -1,246 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial 
portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - - -import htsjdk.samtools.Cigar; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.TextCigarCodec; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.exceptions.UserException.UnsupportedCigarOperatorException; - -import java.lang.annotation.*; -import java.lang.reflect.Method; -import java.util.*; - - -/** - * Tests for the MalformedReadFilter - * - * @author Eric Banks - * @since 3/14/13 - */ -public class MalformedReadFilterUnitTest extends ReadFilterTest { - - ////////////////////////////////////// - // Test the checkSeqStored() method // - ////////////////////////////////////// - - @Test(enabled = true) - public void testCheckSeqStored () { - - final GATKSAMRecord goodRead = ArtificialSAMUtils.createArtificialRead(new byte[]{(byte)'A'}, new byte[]{(byte)'A'}, "1M"); - final GATKSAMRecord badRead = ArtificialSAMUtils.createArtificialRead(new byte[]{}, new byte[]{}, "1M"); - 
badRead.setReadString("*"); - - Assert.assertTrue(MalformedReadFilter.checkSeqStored(goodRead, true)); - Assert.assertFalse(MalformedReadFilter.checkSeqStored(badRead, true)); - - try { - MalformedReadFilter.checkSeqStored(badRead, false); - Assert.assertTrue(false, "We should have exceptioned out in the previous line"); - } catch (UserException e) { } - } - - @Test(enabled = true, dataProvider= "UnsupportedCigarOperatorDataProvider") - @CigarOperatorTest(CigarOperatorTest.Outcome.FILTER) - public void testCigarNOperatorFilterTruePositive(String cigarString) { - - final MalformedReadFilter filter = buildMalformedReadFilter(true); - final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString); - Assert.assertTrue(filter.filterOut(nContainingCigarRead), - " Did not filtered out a N containing CIGAR read"); - } - - @Test(enabled = true, dataProvider= "UnsupportedCigarOperatorDataProvider") - @CigarOperatorTest(CigarOperatorTest.Outcome.ACCEPT) - public void testCigarNOperatorFilterTrueNegative(String cigarString) { - - final MalformedReadFilter filter = buildMalformedReadFilter(true); - final SAMRecord nonNContainingCigarRead = buildSAMRecord(cigarString); - Assert.assertFalse(filter.filterOut(nonNContainingCigarRead), - " Filtered out a non-N containing CIGAR read"); - } - - @Test(enabled = true, - expectedExceptions = UnsupportedCigarOperatorException.class, - dataProvider= "UnsupportedCigarOperatorDataProvider") - @CigarOperatorTest(CigarOperatorTest.Outcome.EXCEPTION) - public void testCigarNOperatorFilterException(final String cigarString) { - - final MalformedReadFilter filter = buildMalformedReadFilter(false); - final SAMRecord nContainingCigarRead = buildSAMRecord(cigarString); - - filter.filterOut(nContainingCigarRead); - } - - @Test(enabled = true, dataProvider="UnsupportedCigarOperatorDataProvider") - @CigarOperatorTest(CigarOperatorTest.Outcome.ACCEPT) - public void testCigarNOperatorFilterControl(final String cigarString) { - - final 
MalformedReadFilter filter = buildMalformedReadFilter(false); - final SAMRecord nonNContainingCigarRead = buildSAMRecord(cigarString); - - Assert.assertFalse(filter.filterOut(nonNContainingCigarRead)); - } - - protected SAMRecord buildSAMRecord(final String cigarString) { - final Cigar nContainingCigar = TextCigarCodec.getSingleton().decode(cigarString); - return this.createRead(nContainingCigar, 1, 0, 10); - } - - protected MalformedReadFilter buildMalformedReadFilter(final boolean filterRNO) { - return buildMalformedReadFiter(filterRNO,new ValidationExclusion.TYPE[] {}); - } - - protected MalformedReadFilter buildMalformedReadFiter(boolean filterRNO, final ValidationExclusion.TYPE... excl) { - final ValidationExclusion ve = new ValidationExclusion(Arrays.asList(excl)); - - final MalformedReadFilter filter = new MalformedReadFilter(); - - final SAMFileHeader h = getHeader(); - final SAMDataSource ds = getDataSource(); - - final GenomeAnalysisEngine gae = new GenomeAnalysisEngine() { - @Override - public SAMFileHeader getSAMFileHeader() { - return h; - } - - @Override - public SAMDataSource getReadsDataSource() { - return ds; - } - }; - filter.initialize(gae); - filter.filterReadsWithNCigar = filterRNO; - return filter; - } - - @Retention(RetentionPolicy.RUNTIME) - @Target(ElementType.METHOD) - @Inherited - protected @interface CigarOperatorTest { - - enum Outcome { - ANY,ACCEPT,FILTER,EXCEPTION,IGNORE; - - public boolean appliesTo (String cigar) { - boolean hasN = cigar.indexOf('N') != -1; - switch (this) { - case ANY: return true; - case ACCEPT: return !hasN; - case IGNORE: return hasN; - case FILTER: - case EXCEPTION: - default: - return hasN; - - } - } - } - - Outcome value() default Outcome.ANY; - } - - /** - * Cigar test data for unsupported operator test. - * Each element of this array corresponds to a test case. 
In turn the first element of the test case array is the - * Cigar string for that test case and the second indicates whether it should be filtered due to the presence of a - * unsupported operator - */ - private static final String[] TEST_CIGARS = { - "101M10D20I10M", - "6M14N5M", - "1N", - "101M", - "110N", - "2N4M", - "4M2N", - "3M1I1M", - "1M2I2M", - "1M10N1I1M", - "1M1I1D", - "11N12M1I34M12N" - }; - - @DataProvider(name= "UnsupportedCigarOperatorDataProvider") - public Iterator unsupportedOperatorDataProvider(final Method testMethod) { - final CigarOperatorTest a = resolveCigarOperatorTestAnnotation(testMethod); - final List result = new LinkedList(); - for (final String cigarString : TEST_CIGARS) { - if (a == null || a.value().appliesTo(cigarString)) { - result.add(new Object[] { cigarString }); - } - } - return result.iterator(); - } - - /** - * Gets the most specific {@link CigarOperatorTest} annotation for the - * signature of the test method provided. - *

- * This in-house implementation is required due to the fact that method - * annotations do not have inheritance. - * - * @param m targeted test method. - * @return null if there is no {@link CigarOperatorTest} - * annotation in this or overridden methods. - */ - private CigarOperatorTest resolveCigarOperatorTestAnnotation(final Method m) { - CigarOperatorTest res = m.getAnnotation(CigarOperatorTest.class); - if (res != null) { - return res; - } - Class c = this.getClass(); - Class p = c.getSuperclass(); - while (p != null && p != Object.class) { - try { - final Method met = p.getDeclaredMethod(m.getName(), - m.getParameterTypes()); - res = met.getAnnotation(CigarOperatorTest.class); - if (res != null) { - break; - } - } catch (NoSuchMethodException e) { - // Its ok; nothing to do here, just keep looking. - } - c = p; - p = c.getSuperclass(); - } - return res; - } - -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/ReadFilterTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/ReadFilterTest.java deleted file mode 100644 index 0f61de248..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/ReadFilterTest.java +++ /dev/null @@ -1,370 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - -import htsjdk.samtools.*; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.testng.annotations.AfterClass; -import org.testng.annotations.BeforeClass; - -import java.util.*; - -/** - * Class ReadBaseTest - *

- * This is the base test class for read filter test classes. All read - * filter test cases should extend from this - * class; it sets ups a header mock up to test read filtering. - * - * Feel free to override non-final method to modify the behavior - * (i.e. change how read group id are formatted, or complete a header). - * - *

- * You can statically determine the number of read-group involved - * in the test by calling {@link #ReadFilterTest(int)} in you constructor. - *

- * - * Notice that the same header object is shared by all test and - * it is initialized by Junit (calling {@link #beforeClass()}. - * - * @author Valentin Ruano Rubio - * @date May 23, 2013 - */ -public class ReadFilterTest extends BaseTest { - - private static final int DEFAULT_READ_GROUP_COUNT = 5; - private static final int DEFAULT_READER_COUNT = 1; - private static final String DEFAULT_READ_GROUP_PREFIX = "ReadGroup"; - private static final String DEFAULT_PLATFORM_UNIT_PREFIX = "Lane"; - private static final String DEFAULT_SAMPLE_NAME_PREFIX = "Sample"; - private static final String DEFAULT_PLATFORM_PREFIX = "Platform"; - private static final int DEFAULT_CHROMOSOME_COUNT = 1; - private static final int DEFAULT_CHROMOSOME_START_INDEX = 1; - private static final int DEFAULT_CHROMOSOME_SIZE = 1000; - private static final String DEFAULT_SAM_FILE_FORMAT = "readfile-%3d.bam"; - - private final int groupCount; - - private SAMFileHeader header; - - private SAMDataSource dataSource; - - /** - * Constructs a new read-filter test providing the number of read - * groups in the file. - * - * @param groupCount number of read-group in the fictional SAM file, - * must be equal or greater than 1. - */ - protected ReadFilterTest(final int groupCount) { - if (groupCount < 1) { - throw new IllegalArgumentException( - "the read group count must at least be 1"); - } - this.groupCount = groupCount; - } - - - /** - * Gets the data source. - * - * @throws IllegalStateException if the data source was not initialized - * invoking {@link #beforeClass()} - * @return never null - */ - protected final SAMDataSource getDataSource() { - checkDataSourceExists(); - return dataSource; - } - - /** - * Returns the mock-up SAM file header for testing. 
- * - * @throws IllegalStateException if the header was not initialized - * invoking {@link #beforeClass()} - * @return never null - */ - protected final SAMFileHeader getHeader() { - checkHeaderExists(); - return header; - } - - /** - * Construct a read filter test with the default number of groups - * ({@link #DEFAULT_READ_GROUP_COUNT}. - */ - public ReadFilterTest() { - this(DEFAULT_READ_GROUP_COUNT); - } - - /** - * Return the number of read groups involved in the test - * @return 1 or greater. - */ - protected final int getReadGroupCount() { - return groupCount; - } - - /** - * Composes the Id for the read group given its index. - * - * This methods must return a unique distinct ID for each possible index and - * it must be the same value each time it is invoked. - * - * @param index the index of the targeted read group in the range - * [1,{@link #getReadGroupCount()}] - * @return never null and must be unique to each possible - * read group index. - */ - protected String composeReadGroupId(final int index) { - checkReadGroupIndex(index); - return DEFAULT_READ_GROUP_PREFIX + index; - } - - /** - * Composes the Platform name for the read group given its index. - * - * This method must always return the same value give an index. - * - * @param index the index of the targeted read group in the range - * [1,{@link #getReadGroupCount()}] - * @return never null. - */ - protected String composePlatformName(final int index) { - checkReadGroupIndex(index); - return DEFAULT_PLATFORM_PREFIX + (((index-1)%2)+1); - } - - - /** - * Composes the Platform unit name for the read group given its index. - * - * @param index the index of the targeted read group in the range - * [1,{@link #getReadGroupCount()}] - * @return never null. - */ - protected String composePlatformUnitName(final int index) { - checkReadGroupIndex(index); - return DEFAULT_PLATFORM_UNIT_PREFIX + (((index-1)%3)+1); - } - - - - /** - * Checks the correctness of a given read group index. 
- * - * A correct index is any value in the range [1,{@link #getReadGroupCount()}]. - * - * @param index the target index. - * @throws IllegalArgumentException if the input index is not correct. - */ - protected final void checkReadGroupIndex(final int index) { - checkIndex(index,groupCount,"read group"); - } - - - private void checkIndex(final int index, final int max, CharSequence name) { - if (index < 1 || index > max) { - throw new IllegalArgumentException( - name + " index (" - + index - + ") is out of bounds [1," + max + "]"); - } - } - - - /** - * Checks whether the header was initialized. - * - * @throws IllegalStateException if the header was not yet initialized. - */ - protected final void checkHeaderExists() { - if (header == null) { - throw new IllegalArgumentException( - "header has not been initialized;" - + " beforeClass() was not invoked"); - } - } - - /** - * Checks whether the data source was initialized. - * - * @throws IllegalStateException if the data source was not yet initialized. - */ - protected final void checkDataSourceExists() { - if (header == null) { - throw new IllegalArgumentException( - "data source has not been initialized;" - + " beforeClass() was not invoked"); - } - } - - /** - * Returns the ID for a read group given its index. - * - * @param index the index of the targeted read group in the range - * [1,{@link #getReadGroupCount()}] - * @return never null and must be unique to each - * possible read group index. - */ - protected final String getReadGroupId(final int index) { - checkReadGroupIndex(index); - return getHeader().getReadGroups().get(index - 1).getReadGroupId(); - } - - /** - * Returns the platform name for a read group given its index. - * - * @param group the index of the targeted read group in the range - * [1,{@link #getReadGroupCount()}] - * @return never null. 
- */ - protected final String getPlatformName(final int group) { - checkReadGroupIndex(group); - return getHeader().getReadGroups().get(group - 1).getPlatform(); - } - - /** - * Returns the platform unit for a read group given its index. - * - * @param group the index of the targeted read group in the range - * [1,{@link #getReadGroupCount()}] - * @return never null. - */ - protected final String getPlatformUnit(final int group) { - checkReadGroupIndex(group); - return getHeader().getReadGroups().get(group - 1).getPlatformUnit(); - } - - - /** - * Composes the mock up SAM file header. - * - * It must return an equivalent (equal) value each time it is invoked. - * - * @return never null. - */ - protected SAMFileHeader composeHeader() { - - return ArtificialSAMUtils.createArtificialSamHeader( - DEFAULT_CHROMOSOME_COUNT, DEFAULT_CHROMOSOME_START_INDEX, - DEFAULT_CHROMOSOME_SIZE); - } - - @BeforeClass - public void beforeClass() { - - header = composeHeader(); - dataSource = composeDataSource(); - final List readGroupIDs = new ArrayList(); - final List sampleNames = new ArrayList(); - - for (int i = 1; i <= getReadGroupCount(); i++) { - final String readGroupId = composeReadGroupId(i); - readGroupIDs.add(readGroupId); - sampleNames.add(readGroupId); - } - - ArtificialSAMUtils.createEnumeratedReadGroups( - header, readGroupIDs, sampleNames); - - for (int i = 1; i <= getReadGroupCount(); i++) { - final String readGroupId = readGroupIDs.get(i-1); - final SAMReadGroupRecord groupRecord = header.getReadGroup(readGroupId); - groupRecord.setAttribute("PL", composePlatformName(i)); - groupRecord.setAttribute("PU", composePlatformUnitName(i)); - } - - } - - protected ValidationExclusion composeValidationExclusion() { - return new ValidationExclusion(); - } - - protected SAMDataSource composeDataSource() { - checkHeaderExists(); - final Set readerIDs = new HashSet<>(1); - final ThreadAllocation ta = new ThreadAllocation(); - final Integer numFileHandles = 1; // I believe that 
any value would do but need to confirm. - final boolean useOriginalBaseQualities = true; - final ValidationStringency strictness = ValidationStringency.LENIENT; - final Integer readBufferSize = 1; // not relevant. - final DownsamplingMethod downsamplingMethod = DownsamplingMethod.NONE; - final ValidationExclusion exclusionList = composeValidationExclusion(); - final Collection supplementalFilters = Collections.EMPTY_SET; - final boolean includeReadsWithDeletionAtLoci = true; - - final GenomeLocParser glp = new GenomeLocParser(header.getSequenceDictionary()); - final SAMDataSource res = new SAMDataSource( - readerIDs, - ta, - numFileHandles, - glp, - useOriginalBaseQualities, - strictness, - readBufferSize, - downsamplingMethod, - exclusionList, - supplementalFilters, - includeReadsWithDeletionAtLoci); - - return res; - } - - @AfterClass - public void afterClass() { - header = null; - dataSource = null; - } - - /** - * Creates a read record. - * - * @param cigar the new record CIGAR. - * @param group the new record group index that must be in the range \ - * [1,{@link #getReadGroupCount()}] - * @param reference the reference sequence index (0-based) - * @param start the start position of the read alignment in the reference - * (1-based) - * @return never null - */ - protected SAMRecord createRead(final Cigar cigar, final int group, final int reference, final int start) { - final SAMRecord record = ArtificialSAMUtils.createArtificialRead(cigar); - record.setHeader(getHeader()); - record.setAlignmentStart(start); - record.setReferenceIndex(reference); - record.setAttribute(SAMTag.RG.toString(), getReadGroupId(group)); - return record; - - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/UnsafeMalformedReadFilterUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/UnsafeMalformedReadFilterUnitTest.java deleted file mode 100644 index a00f0a0ba..000000000 --- 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/filters/UnsafeMalformedReadFilterUnitTest.java +++ /dev/null @@ -1,50 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.filters; - - -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; - -import java.util.Collections; - - -/** - * Tests for the {@link MalformedReadFilter} when the unsafe flag - * {@link ValidationExclusion.TYPE#ALL} is set. 
- * - * @author Valentin Ruano-Rubio - * @since 6/6/13 - */ -public class UnsafeMalformedReadFilterUnitTest extends AllowNCigarMalformedReadFilterUnitTest { - - - @Override - protected ValidationExclusion composeValidationExclusion() { - return new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)); - } - - -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIteratorUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIteratorUnitTest.java deleted file mode 100644 index 7c3aca2d1..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/BoundedReadIteratorUnitTest.java +++ /dev/null @@ -1,145 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.iterators; - -import static org.testng.Assert.fail; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.reference.ReferenceSequenceFile; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import org.broadinstitute.gatk.utils.BaseTest; -import org.testng.Assert; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; - -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - * @version 1.0 - * @date Apr 14, 2009 - *

- * Class BoundedReadIteratorUnitTest - *

- * tests for the bounded read iterator. - */ -public class BoundedReadIteratorUnitTest extends BaseTest { - - /** the file list and the fasta sequence */ - private List fl; - private ReferenceSequenceFile seq; - - /** - * This function does the setup of our parser, before each method call. - *

- * Called before every test case method. - */ - @BeforeMethod - public void doForEachTest() throws FileNotFoundException { - fl = new ArrayList(); - } - - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testBounding() { - logger.warn("Executing testBounding"); - // total reads expected - final int expected = 20; - // bound by ten reads - BoundedReadIterator iter = new BoundedReadIterator(new testIterator(), expected); - - int count = 0; - for (SAMRecord rec: iter) { - count++; - } - - Assert.assertEquals(count, expected); - } -} - -class testIterator implements GATKSAMIterator { - SAMFileHeader header; - testIterator() { - header = ArtificialSAMUtils.createArtificialSamHeader(1,1,2000); - } - - public void close() { - - } - - public boolean hasNext() { - return true; - } - - public SAMRecord next() { - return ArtificialSAMUtils.createArtificialRead(header,"blah",0,1,100); - } - - public void remove() { - } - - public Iterator iterator() { - return this; - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapterUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapterUnitTest.java deleted file mode 100644 index 6cbd4fd1b..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/GATKSAMIteratorAdapterUnitTest.java +++ /dev/null @@ -1,176 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above 
copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.utils.BaseTest; -import static org.testng.Assert.assertEquals; -import org.testng.annotations.Test; - -import java.util.Iterator; - -/** - * - * User: aaron - * Date: May 13, 2009 - * Time: 6:58:21 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date May 13, 2009 - *

- * Class GATKSAMIteratorTest - *

- * Tests the GATKSAMIteratorAdapter class. - */ -public class GATKSAMIteratorAdapterUnitTest extends BaseTest { - - class MyTestIterator implements Iterator { - - public int count = 0; - - public MyTestIterator() { - count = 0; - } - - public boolean hasNext() { - if (count < 100) { - ++count; - return true; - } else { - return false; - } - } - - public SAMRecord next() { - return null; - } - - public void remove() { - throw new UnsupportedOperationException("Unsupported"); - } - } - - class MyTestCloseableIterator implements CloseableIterator { - public int count = 0; - - public MyTestCloseableIterator() { - count = 0; - } - - public boolean hasNext() { - if (count < 100) { - ++count; - return true; - } else { - return false; - } - } - - public SAMRecord next() { - return null; - } - - public void remove() { - throw new UnsupportedOperationException("Unsupported"); - } - - public void close() { - count = -1; - } - } - - - @Test - public void testNormalIterator() { - final int COUNT = 100; - MyTestIterator it = new MyTestIterator(); - - GATKSAMIterator samIt = GATKSAMIteratorAdapter.adapt(it); - int countCheck = 0; - while (samIt.hasNext()) { - samIt.next(); - ++countCheck; - //logger.warn("cnt = " + countCheck); - } - - assertEquals(countCheck, COUNT); - - assertEquals(countCheck, COUNT); - } - - @Test - public void testCloseableIterator() { - final int COUNT = 100; - - MyTestCloseableIterator it = new MyTestCloseableIterator(); - - GATKSAMIterator samIt = GATKSAMIteratorAdapter.adapt(it); - - int countCheck = 0; - while (samIt.hasNext()) { - samIt.next(); - ++countCheck; - } - - assertEquals(countCheck, COUNT); - } - - @Test - public void testCloseOnCloseableIterator() { - final int COUNT = 100; - - MyTestCloseableIterator it = new MyTestCloseableIterator(); - - GATKSAMIterator samIt = GATKSAMIteratorAdapter.adapt(it); - - - int countCheck = 0; - while (samIt.hasNext()) { - samIt.next(); - ++countCheck; - } - - assertEquals(countCheck, COUNT); - - // check to 
see that the count get's set to -1 - samIt.close(); - assertEquals(it.count, -1); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIteratorUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIteratorUnitTest.java deleted file mode 100644 index c926d066c..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/ReadFormattingIteratorUnitTest.java +++ /dev/null @@ -1,50 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.*; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.util.Arrays; - - -public class ReadFormattingIteratorUnitTest extends BaseTest { - - @Test - public void testIteratorConsolidatesCigars() { - final Cigar unconsolidatedCigar = TextCigarCodec.getSingleton().decode("3M0M5M0M"); - final SAMRecord unconsolidatedRead = ArtificialSAMUtils.createArtificialRead(unconsolidatedCigar); - - final GATKSAMIterator readIterator = GATKSAMIteratorAdapter.adapt(Arrays.asList(unconsolidatedRead).iterator()); - final ReadFormattingIterator formattingIterator = new ReadFormattingIterator(readIterator, false, (byte)-1); - final SAMRecord postIterationRead = formattingIterator.next(); - - Assert.assertEquals(postIterationRead.getCigarString(), "8M", "Cigar 3M0M5M0M not consolidated correctly by ReadFormattingIterator"); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIteratorUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIteratorUnitTest.java deleted file mode 100644 index 371f94f00..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/iterators/VerifyingSamIteratorUnitTest.java +++ /dev/null @@ -1,128 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* 
The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.iterators; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.Arrays; -import java.util.List; - -/** - * Created by IntelliJ IDEA. - * User: mhanna - * Date: Mar 2, 2011 - * Time: 9:48:10 PM - * To change this template use File | Settings | File Templates. 
- */ -public class VerifyingSamIteratorUnitTest { - private SAMFileHeader samFileHeader; - - @BeforeClass - public void init() { - SAMSequenceDictionary sequenceDictionary = new SAMSequenceDictionary(); - sequenceDictionary.addSequence(new SAMSequenceRecord("1",500)); - sequenceDictionary.addSequence(new SAMSequenceRecord("2",500)); - - samFileHeader = new SAMFileHeader(); - samFileHeader.setSequenceDictionary(sequenceDictionary); - } - - @Test - public void testSortedReadsBasic() { - SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),1,10); - SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),2,10); - List reads = Arrays.asList(read1,read2); - - VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); - - Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); - Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); - Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); - Assert.assertSame(iterator.next(),read2,"Incorrect read in read 2 position"); - Assert.assertFalse(iterator.hasNext(),"Too many reads in iterator"); - } - - @Test - public void testSortedReadsAcrossContigs() { - SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),2,10); - SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(1).getSequenceIndex(),1,10); - List reads = Arrays.asList(read1,read2); - - VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); - - Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); - Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); - Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); - Assert.assertSame(iterator.next(),read2,"Incorrect read in read 2 position"); - 
Assert.assertFalse(iterator.hasNext(),"Too many reads in iterator"); - } - - @Test(expectedExceptions=UserException.MissortedBAM.class) - public void testImproperlySortedReads() { - SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),2,10); - SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),1,10); - List reads = Arrays.asList(read1,read2); - - VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); - - Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); - Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); - Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); - - // Should trigger MissortedBAM exception. - iterator.next(); - } - - @Test(expectedExceptions=UserException.MalformedBAM.class) - public void testInvalidAlignment() { - // Create an invalid alignment state. - SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),1,10); - SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read1",getContig(0).getSequenceIndex(),2,10); - read1.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); - List reads = Arrays.asList(read1,read2); - - VerifyingSamIterator iterator = new VerifyingSamIterator(GATKSAMIteratorAdapter.adapt(reads.iterator())); - - Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); - Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); - Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); - - // Should trigger MalformedBAM exception. 
- iterator.next(); - } - - private SAMSequenceRecord getContig(final int contigIndex) { - return samFileHeader.getSequence(contigIndex); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportUnitTest.java deleted file mode 100644 index d7b9b3dda..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/phonehome/GATKRunReportUnitTest.java +++ /dev/null @@ -1,310 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.phonehome; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.tools.walkers.qc.CountLoci; -import org.broadinstitute.gatk.tools.walkers.qc.CountRODs; -import org.broadinstitute.gatk.tools.walkers.qc.CountReads; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.jets3t.service.S3Service; -import org.jets3t.service.S3ServiceException; -import org.jets3t.service.ServiceException; -import org.jets3t.service.model.S3Object; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.FileInputStream; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Properties; - -public class GATKRunReportUnitTest extends BaseTest { - private final static boolean DEBUG = false; - private static final long S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING = 30 * 1000; - private static final String AWS_DOWNLOADER_CREDENTIALS_PROPERTIES_FILE = privateTestDir + "phonehome/awsDownloaderCredentials.properties"; - - private Walker walker; - 
private Exception exception; - private GenomeAnalysisEngine engine; - private String downloaderAccessKey; - private String downloaderSecretKey; - - @BeforeClass - public void setup() throws Exception { - walker = new CountReads(); - exception = new IllegalArgumentException("javaException"); - engine = new GenomeAnalysisEngine(); - engine.setArguments(new GATKArgumentCollection()); - - Properties awsProperties = new Properties(); - awsProperties.load(new FileInputStream(AWS_DOWNLOADER_CREDENTIALS_PROPERTIES_FILE)); - downloaderAccessKey = awsProperties.getProperty("accessKey"); - downloaderSecretKey = awsProperties.getProperty("secretKey"); - } - - @Test(enabled = ! DEBUG) - public void testAWSKeysAreValid() { - // throws an exception if they aren't - GATKRunReport.checkAWSAreValid(); - } - - @Test(enabled = ! DEBUG) - public void testAccessKey() throws Exception { - testAWSKey(GATKRunReport.getAWSUploadAccessKey(), GATKRunReport.AWS_ACCESS_KEY_MD5); - } - - @Test(enabled = ! DEBUG) - public void testSecretKey() throws Exception { - testAWSKey(GATKRunReport.getAWSUploadSecretKey(), GATKRunReport.AWS_SECRET_KEY_MD5); - } - - private void testAWSKey(final String accessKey, final String expectedMD5) throws Exception { - Assert.assertNotNull(accessKey, "AccessKey should not be null"); - final String actualmd5 = Utils.calcMD5(accessKey); - Assert.assertEquals(actualmd5, expectedMD5); - } - - @DataProvider(name = "GATKReportCreationTest") - public Object[][] makeGATKReportCreationTest() { - List tests = new ArrayList(); - - final Walker readWalker = new CountReads(); - final Walker lociWalker = new CountLoci(); - final Walker rodWalker = new CountRODs(); - final Walker artWalker = new RunReportDummyActiveRegionWalker(); - - final Exception noException = null; - final Exception javaException = new IllegalArgumentException("javaException"); - final Exception stingException = new ReviewedGATKException("GATKException"); - final Exception userException = new 
UserException("userException"); - - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - engine.setArguments(new GATKArgumentCollection()); - - for ( final Walker walker : Arrays.asList(readWalker, lociWalker, rodWalker, artWalker) ) { - for ( final Exception exception : Arrays.asList(noException, javaException, stingException, userException) ) { - tests.add(new Object[]{walker, exception, engine}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "GATKReportCreationTest") - public void testGATKReportCreationReadingAndWriting(final Walker walker, final Exception exception, final GenomeAnalysisEngine engine) throws Exception { - final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.STDOUT); - final ByteArrayOutputStream captureStream = new ByteArrayOutputStream(); - final boolean succeeded = report.postReportToStream(captureStream); - Assert.assertTrue(succeeded, "Failed to write report to stream"); - Assert.assertFalse(report.exceptionOccurredDuringPost(), "Post succeeded but report says it failed"); - Assert.assertNull(report.getErrorMessage(), "Post succeeded but there was an error message"); - Assert.assertNull(report.getErrorThrown(), "Post succeeded but there was an error message"); - final InputStream readStream = new ByteArrayInputStream(captureStream.toByteArray()); - - GATKRunReport deserialized = null; - try { - deserialized = GATKRunReport.deserializeReport(readStream); - } catch ( Exception e ) { - final String reportString = new String(captureStream.toByteArray()); - Assert.fail("Failed to deserialize GATK report " + reportString + " with exception " + e); - } - - if ( deserialized != null ) - Assert.assertEquals(report, deserialized); - } - - @DataProvider(name = "GATKAWSReportMode") - public Object[][] makeGATKAWSReportMode() { - List tests = new ArrayList(); - - for ( final GATKRunReport.AWSMode mode : GATKRunReport.AWSMode.values() ) { - 
tests.add(new Object[]{mode}); - } - - return tests.toArray(new Object[][]{}); - } - - // Will fail with timeout if AWS time out isn't working - // Will fail with exception if AWS doesn't protect itself from errors - @Test(enabled = ! DEBUG, dataProvider = "GATKAWSReportMode", timeOut = S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING * 2) - public void testAWS(final GATKRunReport.AWSMode awsMode) { - logger.warn("Starting testAWS mode=" + awsMode); - - // Use a shorter timeout than usual when we're testing GATKRunReport.AWSMode.TIMEOUT - final long thisTestS3Timeout = awsMode == GATKRunReport.AWSMode.TIMEOUT ? 30 * 1000 : S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING; - final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.AWS, thisTestS3Timeout); - report.sendAWSToTestBucket(); - report.setAwsMode(awsMode); - final S3Object s3Object = report.postReportToAWSS3(); - - if ( awsMode == GATKRunReport.AWSMode.NORMAL ) { - Assert.assertNotNull(s3Object, "Upload to AWS failed, s3Object was null. error was " + report.formatError()); - Assert.assertFalse(report.exceptionOccurredDuringPost(), "The upload should have succeeded but the report says it didn't. 
Error was " + report.formatError()); - Assert.assertNull(report.getErrorMessage(), "Report succeeded but an error message was found"); - Assert.assertNull(report.getErrorThrown(), "Report succeeded but an thrown error was found"); - try { - final GATKRunReport deserialized = GATKRunReport.deserializeReport(downloaderAccessKey, downloaderSecretKey, report.getS3ReportBucket(), s3Object); - Assert.assertEquals(report, deserialized); - deleteFromS3(report); - } catch ( Exception e ) { - Assert.fail("Failed to read, deserialize, or delete GATK report " + s3Object.getName() + " with exception " + e); - } - } else { - Assert.assertNull(s3Object, "AWS upload should have failed for mode " + awsMode + " but got non-null s3 object back " + s3Object + " error was " + report.formatError()); - Assert.assertTrue(report.exceptionOccurredDuringPost(), "S3 object was null but the report says that the upload succeeded"); - Assert.assertNotNull(report.getErrorMessage(), "Report succeeded but an error message wasn't found"); - if ( awsMode == GATKRunReport.AWSMode.FAIL_WITH_EXCEPTION ) - Assert.assertNotNull(report.getErrorThrown()); - } - } - - private void deleteFromS3(final GATKRunReport report) throws Exception { - final S3Service s3Service = GATKRunReport.initializeAWSService(downloaderAccessKey, downloaderSecretKey); - // Retrieve the whole data object we created previously - s3Service.deleteObject(report.getS3ReportBucket(), report.getReportFileName()); - } - - @DataProvider(name = "PostReportByType") - public Object[][] makePostReportByType() { - List tests = new ArrayList(); - - for ( final GATKRunReport.PhoneHomeOption et : GATKRunReport.PhoneHomeOption.values() ) { - tests.add(new Object[]{et}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = ! 
DEBUG, dataProvider = "PostReportByType", timeOut = S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING * 2) - public void testPostReportByType(final GATKRunReport.PhoneHomeOption type) { - final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.AWS, S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING); - Assert.assertFalse(report.exceptionOccurredDuringPost(), "An exception occurred during posting the report"); - final boolean succeeded = report.postReport(type); - - if ( type == GATKRunReport.PhoneHomeOption.NO_ET ) - Assert.assertFalse(succeeded, "NO_ET option shouldn't write a report"); - else { - Assert.assertTrue(succeeded, "Any non NO_ET option should succeed in writing a report"); - - if ( type == GATKRunReport.PhoneHomeOption.STDOUT ) { - // nothing to do - } else { - // must have gone to AWS - try { - Assert.assertTrue(report.wentToAWS(), "The report should have gone to AWS but the report says it wasn't"); - deleteFromS3(report); - } catch ( Exception e ) { - Assert.fail("Failed delete GATK report " + report.getReportFileName() + " with exception " + e); - } - } - } - } - - public interface S3Op { - public void apply() throws ServiceException; - } - - // Will fail with timeout if AWS time out isn't working - // Will fail with exception if AWS doesn't protect itself from errors - @Test(timeOut = S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING * 2) - public void testAWSPublicKeyHasAccessControls() throws Exception { - final GATKRunReport report = new GATKRunReport(walker, exception, engine, GATKRunReport.PhoneHomeOption.AWS, S3_PUT_TIMEOUT_IN_MILLISECONDS_FOR_TESTING); - report.sendAWSToTestBucket(); - final S3Object s3Object = report.postReportToAWSS3(); - Assert.assertNotNull(s3Object, "Upload to AWS failed, s3Object was null. 
error was " + report.formatError()); - - // create a service with the public key, and make sure it cannot list or delete - final S3Service s3Service = GATKRunReport.initializeAWSService(GATKRunReport.getAWSUploadAccessKey(), GATKRunReport.getAWSUploadSecretKey()); - assertOperationNotAllowed("listAllBuckets", new S3Op() { - @Override - public void apply() throws S3ServiceException { - s3Service.listAllBuckets(); - } - }); - assertOperationNotAllowed("listBucket", new S3Op() { - @Override - public void apply() throws S3ServiceException { s3Service.listObjects(report.getS3ReportBucket()); } - }); - assertOperationNotAllowed("createBucket", new S3Op() { - @Override - public void apply() throws S3ServiceException { s3Service.createBucket("ShouldNotCreate"); } - }); - assertOperationNotAllowed("deleteObject", new S3Op() { - @Override - public void apply() throws ServiceException { s3Service.deleteObject(report.getS3ReportBucket(), report.getReportFileName()); } - }); - } - - private void assertOperationNotAllowed(final String name, final S3Op op) { - try { - op.apply(); - // only gets here if the operation was successful - Assert.fail("Operation " + name + " ran successfully but we expected to it fail"); - } catch ( ServiceException e ) { - Assert.assertEquals(e.getErrorCode(), "AccessDenied"); - } - } - - class RunReportDummyActiveRegionWalker extends ActiveRegionWalker { - @Override - public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - return new ActivityProfileState(ref.getLocus(), 0.0); - } - - @Override - public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/RefMetaDataTrackerUnitTest.java 
b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/RefMetaDataTrackerUnitTest.java deleted file mode 100644 index f25ab8d58..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/RefMetaDataTrackerUnitTest.java +++ /dev/null @@ -1,290 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.refdata; - -import htsjdk.samtools.SAMFileHeader; -import org.apache.log4j.Logger; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.commandline.RodBinding; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.codecs.table.TableFeature; -import org.broadinstitute.gatk.engine.refdata.utils.GATKFeature; -import org.broadinstitute.gatk.engine.refdata.utils.RODRecordList; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; -import org.testng.Assert; -import org.testng.annotations.*; -import java.util.*; -import java.util.List; - -public class RefMetaDataTrackerUnitTest { - final protected static Logger logger = Logger.getLogger(RefMetaDataTrackerUnitTest.class); - private static SAMFileHeader header; - private ReferenceContext context; - private GenomeLocParser genomeLocParser; - private GenomeLoc locus; - private final static int START_POS = 10; - Allele A,C,G,T; - VariantContext AC_SNP, AG_SNP, AT_SNP; - TableFeature span10_10, span1_20, span10_20; - - @BeforeClass - public void beforeClass() { - header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100); - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - locus = genomeLocParser.createGenomeLoc("chr1", START_POS, START_POS); - context = new ReferenceContext(genomeLocParser, locus, (byte)'A'); - A = Allele.create("A", true); - C = Allele.create("C"); - G = Allele.create("G"); - T = Allele.create("T"); - AC_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, C)).make(); - 
AG_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, G)).make(); - AT_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, T)).make(); - span10_10 = makeSpan(10, 10); - span1_20 = makeSpan(1, 20); - span10_20 = makeSpan(10, 20); - } - - @BeforeMethod - public void reset() { - RodBinding.resetNameCounter(); - } - - private class MyTest extends BaseTest.TestDataProvider { - public RODRecordList AValues, BValues; - - private MyTest(Class c, final List AValues, final List BValues) { - super(c); - this.AValues = AValues == null ? null : makeRODRecord("A", AValues); - this.BValues = BValues == null ? null : makeRODRecord("B", BValues); - } - - private MyTest(final List AValues, final List BValues) { - super(MyTest.class); - this.AValues = AValues == null ? null : makeRODRecord("A", AValues); - this.BValues = BValues == null ? null : makeRODRecord("B", BValues); - } - - @Override - public String toString() { - return String.format("A=%s, B=%s", AValues, BValues); - } - - private final RODRecordList makeRODRecord(String name, List features) { - List x = new ArrayList(); - for ( Feature f : features ) - x.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, f, name)); - return new RODRecordListImpl(name, x, locus); - } - - public List expected(String name) { - if ( name.equals("A+B") ) return allValues(); - if ( name.equals("A") ) return expectedAValues(); - if ( name.equals("B") ) return expectedBValues(); - throw new RuntimeException("FAIL"); - } - - public List allValues() { - List x = new ArrayList(); - x.addAll(expectedAValues()); - x.addAll(expectedBValues()); - return x; - } - - public List expectedAValues() { - return AValues == null ? Collections.emptyList() : AValues; - } - - public List expectedBValues() { - return BValues == null ? 
Collections.emptyList() : BValues; - } - - public RefMetaDataTracker makeTracker() { - List x = new ArrayList(); - if ( AValues != null ) x.add(AValues); - if ( BValues != null ) x.add(BValues); - return new RefMetaDataTracker(x); - } - - public int nBoundTracks() { - int n = 0; - if ( AValues != null ) n++; - if ( BValues != null ) n++; - return n; - } - } - - private final TableFeature makeSpan(int start, int stop) { - return new TableFeature(genomeLocParser.createGenomeLoc("chr1", start, stop), - Collections.emptyList(), Collections.emptyList()); - } - - @DataProvider(name = "tests") - public Object[][] createTests() { - new MyTest(null, null); - new MyTest(Arrays.asList(AC_SNP), null); - new MyTest(Arrays.asList(AC_SNP, AT_SNP), null); - new MyTest(Arrays.asList(AC_SNP), Arrays.asList(AG_SNP)); - new MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(AG_SNP)); - new MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(span10_10)); - new MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(span10_10, span10_20)); - new MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(span10_10, span10_20, span1_20)); - - // for requires starts - new MyTest(Arrays.asList(span1_20), null); - new MyTest(Arrays.asList(span10_10, span10_20), null); - new MyTest(Arrays.asList(span10_10, span10_20, span1_20), null); - - return MyTest.getTests(MyTest.class); - } - - @Test(enabled = true, dataProvider = "tests") - public void testRawBindings(MyTest test) { - logger.warn("Testing " + test + " for number of bound tracks"); - RefMetaDataTracker tracker = test.makeTracker(); - Assert.assertEquals(tracker.getNTracksWithBoundFeatures(), test.nBoundTracks()); - - testSimpleBindings("A", tracker, test.AValues); - testSimpleBindings("B", tracker, test.BValues); - } - - private void testSimpleBindings(String name, RefMetaDataTracker tracker, RODRecordList expected) { - List asValues = tracker.getValues(Feature.class, name); - - Assert.assertEquals(tracker.hasValues(name), expected != null); - 
Assert.assertEquals(asValues.size(), expected == null ? 0 : expected.size()); - - if ( expected != null ) { - for ( GATKFeature e : expected ) { - boolean foundValue = false; - for ( Feature f : asValues ) { - if ( e.getUnderlyingObject() == f ) foundValue = true; - } - Assert.assertTrue(foundValue, "Never found expected value of " + e.getUnderlyingObject() + " bound to " + name + " in " + tracker); - } - } - } - - @Test(enabled = true, dataProvider = "tests") - public void testGettersAsString(MyTest test) { - logger.warn("Testing " + test + " for get() methods"); - RefMetaDataTracker tracker = test.makeTracker(); - - for ( String name : Arrays.asList("A+B", "A", "B") ) { - List v1 = name.equals("A+B") ? tracker.getValues(Feature.class) : tracker.getValues(Feature.class, name); - testGetter(name, v1, test.expected(name), true, tracker); - - List v2 = name.equals("A+B") ? tracker.getValues(Feature.class, locus) : tracker.getValues(Feature.class, name, locus); - testGetter(name, v2, startingHere(test.expected(name)), true, tracker); - - Feature v3 = name.equals("A+B") ? tracker.getFirstValue(Feature.class) : tracker.getFirstValue(Feature.class, name); - testGetter(name, Arrays.asList(v3), test.expected(name), false, tracker); - - Feature v4 = name.equals("A+B") ? 
tracker.getFirstValue(Feature.class, locus) : tracker.getFirstValue(Feature.class, name, locus); - testGetter(name, Arrays.asList(v4), startingHere(test.expected(name)), false, tracker); - } - } - - @Test(enabled = true, dataProvider = "tests") - public void testGettersAsRodBindings(MyTest test) { - logger.warn("Testing " + test + " for get() methods as RodBindings"); - RefMetaDataTracker tracker = test.makeTracker(); - - for ( String nameAsString : Arrays.asList("A", "B") ) { - RodBinding binding = new RodBinding(Feature.class, nameAsString, "none", "vcf", new Tags()); - List v1 = tracker.getValues(binding); - testGetter(nameAsString, v1, test.expected(nameAsString), true, tracker); - - List v2 = tracker.getValues(binding, locus); - testGetter(nameAsString, v2, startingHere(test.expected(nameAsString)), true, tracker); - - Feature v3 = tracker.getFirstValue(binding); - testGetter(nameAsString, Arrays.asList(v3), test.expected(nameAsString), false, tracker); - - Feature v4 = tracker.getFirstValue(binding, locus); - testGetter(nameAsString, Arrays.asList(v4), startingHere(test.expected(nameAsString)), false, tracker); - } - } - - @Test(enabled = true, dataProvider = "tests") - public void testGettersAsListOfRodBindings(MyTest test) { - logger.warn("Testing " + test + " for get() methods for List"); - RefMetaDataTracker tracker = test.makeTracker(); - - String nameAsString = "A+B"; - RodBinding A = new RodBinding(Feature.class, "A", "none", "vcf", new Tags()); - RodBinding B = new RodBinding(Feature.class, "B", "none", "vcf", new Tags()); - List> binding = Arrays.asList(A, B); - - List v1 = tracker.getValues(binding); - testGetter(nameAsString, v1, test.expected(nameAsString), true, tracker); - - List v2 = tracker.getValues(binding, locus); - testGetter(nameAsString, v2, startingHere(test.expected(nameAsString)), true, tracker); - - Feature v3 = tracker.getFirstValue(binding); - testGetter(nameAsString, Arrays.asList(v3), test.expected(nameAsString), false, tracker); 
- - Feature v4 = tracker.getFirstValue(binding, locus); - testGetter(nameAsString, Arrays.asList(v4), startingHere(test.expected(nameAsString)), false, tracker); - } - - private List startingHere(List l) { - List x = new ArrayList(); - for ( GATKFeature f : l ) if ( f.getStart() == locus.getStart() ) x.add(f); - return x; - } - - private void testGetter(String name, List got, List expected, boolean requireExact, RefMetaDataTracker tracker) { - if ( got.size() == 1 && got.get(0) == null ) - got = Collections.emptyList(); - - if ( requireExact ) - Assert.assertEquals(got.size(), expected.size()); - - boolean foundAny = false; - for ( GATKFeature e : expected ) { - boolean found1 = false; - for ( Feature got1 : got ) { - if ( e.getUnderlyingObject() == got1 ) - found1 = true; - } - if ( requireExact ) - Assert.assertTrue(found1, "Never found expected GATKFeature " + e + " bound to " + name + " in " + tracker); - foundAny = found1 || foundAny; - } - - if ( ! requireExact && ! expected.isEmpty() ) - Assert.assertTrue(foundAny, "Never found any got values matching one of the expected values bound to " + name + " in " + tracker); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/tracks/FeatureManagerUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/tracks/FeatureManagerUnitTest.java deleted file mode 100644 index ec3b470f8..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/tracks/FeatureManagerUnitTest.java +++ /dev/null @@ -1,163 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to 
permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.tracks; - - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.codecs.table.BedTableCodec; -import org.broadinstitute.gatk.utils.codecs.table.TableFeature; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import htsjdk.variant.vcf.VCF3Codec; -import htsjdk.variant.vcf.VCFCodec; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import htsjdk.variant.variantcontext.VariantContext; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.*; -import java.util.*; - - -/** - * @author depristo - * - * UnitTests for RMD FeatureManager - */ -public class FeatureManagerUnitTest extends BaseTest { - private static final File RANDOM_FILE = new File(publicTestDir+ "exampleGATKReport.eval"); - private static final File VCF3_FILE = new File(privateTestDir + "vcf3.vcf"); - private static final File VCF4_FILE = new File(privateTestDir + 
"HiSeq.10000.vcf"); - private static final File VCF4_FILE_GZ = new File(privateTestDir + "HiSeq.10000.vcf.gz"); - private static final File VCF4_FILE_BGZIP = new File(privateTestDir + "HiSeq.10000.bgzip.vcf.gz"); - - private FeatureManager manager; - private GenomeLocParser genomeLocParser; - - @BeforeMethod - public void setup() { - File referenceFile = new File(b36KGReference); - try { - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); - genomeLocParser = new GenomeLocParser(seq); - manager = new FeatureManager(); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(referenceFile,ex); - } - } - - @Test - public void testManagerCreation() { - Assert.assertTrue(manager.getFeatureDescriptors().size() > 0); - } - - private class FMTest extends BaseTest.TestDataProvider { - public Class codec; - public Class feature; - public String name; - public File associatedFile; - - private FMTest(final Class feature, final Class codec, final String name, final File file) { - super(FMTest.class); - this.codec = codec; - this.feature = feature; - this.name = name; - this.associatedFile = file; - } - - public void assertExpected(FeatureManager.FeatureDescriptor featureDescriptor) { - Assert.assertEquals(featureDescriptor.getCodecClass(), codec); - Assert.assertEquals(featureDescriptor.getFeatureClass(), feature); - Assert.assertEquals(featureDescriptor.getName().toLowerCase(), name.toLowerCase()); - } - - public String toString() { - return String.format("FMTest name=%s codec=%s feature=%s file=%s", - name, codec.getSimpleName(), feature.getSimpleName(), associatedFile); - } - } - - @DataProvider(name = "tests") - public Object[][] createTests() { - new FMTest(VariantContext.class, VCF3Codec.class, "VCF3", VCF3_FILE); - new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE); - new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE_GZ); - new FMTest(VariantContext.class, VCFCodec.class, 
"VCF", VCF4_FILE_BGZIP); - new FMTest(TableFeature.class, BedTableCodec.class, "bedtable", null); - return FMTest.getTests(FMTest.class); - } - - @Test(dataProvider = "tests") - public void testGetByFile(FMTest params) { - if ( params.associatedFile != null ) { - FeatureManager.FeatureDescriptor byFile = manager.getByFiletype(params.associatedFile); - Assert.assertNotNull(byFile, "Couldn't find any type associated with file " + params.associatedFile); - params.assertExpected(byFile); - } - } - - @Test - public void testGetByFileNoMatch() { - FeatureManager.FeatureDescriptor byFile = manager.getByFiletype(RANDOM_FILE); - Assert.assertNull(byFile, "Found type " + byFile + " associated with RANDOM, non-Tribble file " + RANDOM_FILE); - } - - @Test(dataProvider = "tests") - public void testGetters(FMTest params) { - params.assertExpected(manager.getByCodec(params.codec)); - params.assertExpected(manager.getByName(params.name)); - params.assertExpected(manager.getByName(params.name.toLowerCase())); - params.assertExpected(manager.getByName(params.name.toUpperCase())); - - Collection descriptors = manager.getByFeature(params.feature); - Assert.assertTrue(descriptors.size() > 0, "Look up by FeatureClass failed"); - } - - @Test - public void testUserFriendlyList() { - Assert.assertTrue(manager.userFriendlyListOfAvailableFeatures().length() > 0, "Expected at least one codec to be listed"); - Assert.assertTrue(manager.userFriendlyListOfAvailableFeatures().split(",").length > 0, "Expected at least two codecs, but only saw one"); - } - - @Test - public void testCodecCreation() { - FeatureManager.FeatureDescriptor descriptor = manager.getByName("vcf"); - Assert.assertNotNull(descriptor, "Couldn't find VCF feature descriptor!"); - - FeatureCodec c = manager.createCodec(descriptor, "foo", genomeLocParser, null); - Assert.assertNotNull(c, "Couldn't create codec"); - Assert.assertEquals(c.getClass(), descriptor.getCodecClass()); - Assert.assertEquals(c.getFeatureType(), 
descriptor.getFeatureClass()); - } - -} - diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrackBuilderUnitTest.java deleted file mode 100644 index a64773af8..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/tracks/RMDTrackBuilderUnitTest.java +++ /dev/null @@ -1,190 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.refdata.tracks; - - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.tribble.Tribble; -import htsjdk.tribble.index.Index; -import htsjdk.tribble.util.LittleEndianOutputStream; -import htsjdk.variant.vcf.VCFCodec; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.testng.Assert; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; - -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.Test; - -import java.io.*; -import java.nio.channels.FileChannel; - - -/** - * @author aaron - *

- * Class RMDTrackBuilderUnitTest - *

- * Testing out the builder for tribble Tracks - */ -public class RMDTrackBuilderUnitTest extends BaseTest { - private RMDTrackBuilder builder; - private IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - - @BeforeMethod - public void setup() { - File referenceFile = new File(b37KGReference); - try { - seq = new CachingIndexedFastaSequenceFile(referenceFile); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(referenceFile,ex); - } - genomeLocParser = new GenomeLocParser(seq); - - // We have to disable auto-index creation/locking in the RMDTrackBuilder for tests, - // as the lock acquisition calls were intermittently hanging on our farm. This unfortunately - // means that we can't include tests for the auto-index creation feature. - builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true,null); - } - - @Test - public void testBuilder() { - Assert.assertTrue(builder.getFeatureManager().getFeatureDescriptors().size() > 0); - } - - @Test - public void testDisableAutoIndexGeneration() throws IOException { - final File unindexedVCF = new File(privateTestDir + "unindexed.vcf"); - final File unindexedVCFIndex = Tribble.indexFile(unindexedVCF); - - Index index = builder.loadIndex(unindexedVCF, new VCFCodec()); - - Assert.assertFalse(unindexedVCFIndex.exists()); - Assert.assertNotNull(index); - } - - @Test - public void testLoadOnDiskIndex() { - final File originalVCF = new File(privateTestDir + "vcf4.1.example.vcf"); - final File tempVCFWithCorrectIndex = createTempVCFFileAndIndex(originalVCF, false); - final File tempVCFIndexFile = Tribble.indexFile(tempVCFWithCorrectIndex); - - final Index index = builder.loadFromDisk(tempVCFWithCorrectIndex, tempVCFIndexFile); - - Assert.assertNotNull(index); - Assert.assertTrue(tempVCFIndexFile.exists()); - - final Index inMemoryIndex = builder.createIndexInMemory(tempVCFWithCorrectIndex, new VCFCodec()); - 
Assert.assertTrue(index.equalsIgnoreProperties(inMemoryIndex)); - } - - @Test - public void testLoadOnDiskOutdatedIndex() { - final File originalVCF = new File(privateTestDir + "vcf4.1.example.vcf"); - final File tempVCFWithOutdatedIndex = createTempVCFFileAndIndex(originalVCF, true); - final File tempVCFIndexFile = Tribble.indexFile(tempVCFWithOutdatedIndex); - - final Index index = builder.loadFromDisk(tempVCFWithOutdatedIndex, tempVCFIndexFile); - - // loadFromDisk() should return null to indicate that the index is outdated and should not be used, - // but should not delete the index since our builder has disableAutoIndexCreation set to true - Assert.assertNull(index); - Assert.assertTrue(tempVCFIndexFile.exists()); - } - - /** - * Create a temporary vcf file and an associated index file, which may be set to be out-of-date - * relative to the vcf - * - * @param vcfFile the vcf file - * @param createOutOfDateIndex if true, ensure that the temporary vcf file is modified after the index - * @return a file pointing to the new tmp location, with accompanying index - */ - private File createTempVCFFileAndIndex( final File vcfFile, final boolean createOutOfDateIndex ) { - try { - final File tmpFile = createTempFile("RMDTrackBuilderUnitTest", ""); - final File tmpIndex = Tribble.indexFile(tmpFile); - tmpIndex.deleteOnExit(); - - copyFile(vcfFile, tmpFile); - final Index inMemoryIndex = builder.createIndexInMemory(tmpFile, new VCFCodec()); - final LittleEndianOutputStream indexOutputStream = new LittleEndianOutputStream(new FileOutputStream(tmpIndex)); - - // If requested, modify the tribble file after the index. Otherwise, modify the index last. 
- if ( createOutOfDateIndex ) { - inMemoryIndex.write(indexOutputStream); - indexOutputStream.close(); - Thread.sleep(2000); - copyFile(vcfFile, tmpFile); - } - else { - copyFile(vcfFile, tmpFile); - Thread.sleep(2000); - inMemoryIndex.write(indexOutputStream); - indexOutputStream.close(); - } - - return tmpFile; - } catch (IOException e) { - Assert.fail("Unable to create temperary file"); - } catch (InterruptedException e) { - Assert.fail("Somehow our thread got interrupted"); - } - return null; - } - - /** - * copy a file, from http://www.exampledepot.com/egs/java.nio/File2File.html - * - * @param srFile the source file - * @param dtFile the destination file - */ - private static void copyFile(File srFile, File dtFile) { - try { - // Create channel on the source - FileChannel srcChannel = new FileInputStream(srFile).getChannel(); - - // Create channel on the destination - FileChannel dstChannel = new FileOutputStream(dtFile).getChannel(); - - // Copy file contents from source to destination - dstChannel.transferFrom(srcChannel, 0, srcChannel.size()); - - // Close the channels - srcChannel.close(); - dstChannel.close(); - } catch (IOException e) { - e.printStackTrace(); - Assert.fail("Unable to process copy " + e.getMessage()); - } - } - -} - diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/CheckableCloseableTribbleIterator.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/CheckableCloseableTribbleIterator.java deleted file mode 100644 index e77c0797e..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/CheckableCloseableTribbleIterator.java +++ /dev/null @@ -1,90 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including 
without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.tribble.CloseableTribbleIterator; -import htsjdk.tribble.Feature; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -/** - * Adapter to allow checking if the wrapped iterator was closed. - * Creating an CCTI also adds it to the list returned from getThreadIterators(). - * @param feature - */ -public class CheckableCloseableTribbleIterator implements CloseableTribbleIterator { - private final CloseableTribbleIterator iterator; - private boolean closed = false; - - private static ThreadLocal>> threadIterators = - new ThreadLocal>>() { - @Override - protected List> initialValue() { - return new ArrayList>(); - } - }; - - public CheckableCloseableTribbleIterator(CloseableTribbleIterator iterator) { - this.iterator = iterator; - threadIterators.get().add(this); - } - - /** - * Returns the list of iterators created on this thread since the last time clearCreatedIterators() was called. - * @return the list of iterators created on this thread since the last time clearCreatedIterators() was called. 
- */ - public static List> getThreadIterators() { - return threadIterators.get(); - } - - /** - * Clears the tracked list of iterators created on this thread. - */ - public static void clearThreadIterators() { - threadIterators.get().clear(); - } - - @Override - public void close() { - iterator.close(); - this.closed = true; - } - - /** - * Returns true if this iterator was properly closed. - * @return true if this iterator was properly closed. - */ - public boolean isClosed() { - return closed; - } - - @Override public Iterator iterator() { return this; } - @Override public boolean hasNext() { return iterator.hasNext(); } - @Override public T next() { return iterator.next(); } - @Override public void remove() { iterator.remove(); } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java deleted file mode 100644 index d95c320cb..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java +++ /dev/null @@ -1,61 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import htsjdk.variant.vcf.VCFCodec; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; - -public class FeatureToGATKFeatureIteratorUnitTest extends BaseTest { - @Test - @SuppressWarnings("unchecked") - public void testCloseFilePointers() throws IOException { - final String chr = "20"; - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); - GenomeLocParser parser = new GenomeLocParser(seq); - File file = new File(privateTestDir + "NA12878.hg19.example1.vcf"); - VCFCodec codec = new VCFCodec(); - TestFeatureReader reader = new TestFeatureReader(file.getAbsolutePath(), codec); - CheckableCloseableTribbleIterator tribbleIterator = reader.query(chr, 1, 100000); - FeatureToGATKFeatureIterator gatkIterator = new FeatureToGATKFeatureIterator(parser, tribbleIterator, "test"); - Assert.assertTrue(gatkIterator.hasNext(), "GATK feature iterator does not have a next value."); - GenomeLoc gatkLocation = gatkIterator.next().getLocation(); - Assert.assertEquals(gatkLocation.getContig(), chr, "Instead of chr 20 
rod iterator was at location " + gatkLocation); - Assert.assertFalse(tribbleIterator.isClosed(), "Tribble iterator is closed but should be still open."); - gatkIterator.close(); - Assert.assertTrue(tribbleIterator.isClosed(), "Tribble iterator is open but should be now closed."); - reader.close(); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/FlashBackIteratorUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/FlashBackIteratorUnitTest.java deleted file mode 100644 index 7aa07ef58..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/FlashBackIteratorUnitTest.java +++ /dev/null @@ -1,364 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMSequenceDictionary; -import org.testng.Assert; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.refdata.ReferenceOrderedDatum; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; - -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.AbstractList; -import java.util.ArrayList; -import java.util.List; - - -/** - * @author aaron - *

- * Class FlashBackIteratorUnitTest - *

- * just like a greatful dead show...this will be prone to flashbacks - */ -public class FlashBackIteratorUnitTest extends BaseTest { - private SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE); - private static final int NUMBER_OF_CHROMOSOMES = 5; - private static final int STARTING_CHROMOSOME = 1; - private static final int CHROMOSOME_SIZE = 1000; - - private String firstContig; - private GenomeLocParser genomeLocParser; - - @BeforeMethod - public void setup() { - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - firstContig = header.getSequenceDictionary().getSequence(0).getSequenceName(); - } - - @Test - public void testBasicIteration() { - GenomeLoc loc = genomeLocParser.createGenomeLoc(firstContig, 0, 0); - FlashBackIterator iter = new FlashBackIterator(new FakeSeekableRODIterator(genomeLocParser,loc)); - GenomeLoc lastLocation = null; - for (int x = 0; x < 10; x++) { - iter.next(); - GenomeLoc cur = iter.position(); - if (lastLocation != null) { - Assert.assertTrue(lastLocation.isBefore(cur)); - } - lastLocation = cur; - } - } - - @Test - public void testBasicIterationThenFlashBack() { - GenomeLoc loc = genomeLocParser.createGenomeLoc(firstContig, 0, 0); - FlashBackIterator iter = new FlashBackIterator(new FakeSeekableRODIterator(genomeLocParser,loc)); - GenomeLoc lastLocation = null; - for (int x = 0; x < 10; x++) { - iter.next(); - GenomeLoc cur = iter.position(); - if (lastLocation != null) { - Assert.assertTrue(lastLocation.isBefore(cur)); - } - lastLocation = cur; - } - iter.flashBackTo(genomeLocParser.createGenomeLoc(firstContig, 2)); - } - - @Test - public void testBasicIterationThenFlashBackThenIterate() { - GenomeLoc loc = genomeLocParser.createGenomeLoc(firstContig, 0, 0); - FlashBackIterator iter = new FlashBackIterator(new FakeSeekableRODIterator(genomeLocParser,loc)); - GenomeLoc lastLocation = null; - for (int x = 0; x < 10; x++) { - 
iter.next(); - GenomeLoc cur = iter.position(); - if (lastLocation != null) { - Assert.assertTrue(lastLocation.isBefore(cur)); - } - lastLocation = cur; - } - iter.flashBackTo(genomeLocParser.createGenomeLoc(firstContig, 1)); - int count = 0; - while (iter.hasNext()) { - count++; - iter.next(); - } - Assert.assertEquals(count, 10); - } - - - @Test - public void testFlashBackTruth() { - GenomeLoc loc = genomeLocParser.createGenomeLoc(firstContig, 0, 0); - LocationAwareSeekableRODIterator backIter = new FakeSeekableRODIterator(genomeLocParser,loc); - // remove the first three records - backIter.next(); - backIter.next(); - backIter.next(); - FlashBackIterator iter = new FlashBackIterator(backIter); - GenomeLoc lastLocation = null; - for (int x = 0; x < 10; x++) { - iter.next(); - GenomeLoc cur = iter.position(); - if (lastLocation != null) { - Assert.assertTrue(lastLocation.isBefore(cur)); - } - lastLocation = cur; - } - Assert.assertTrue(iter.canFlashBackTo(genomeLocParser.createGenomeLoc(firstContig, 5))); - Assert.assertTrue(iter.canFlashBackTo(genomeLocParser.createGenomeLoc(firstContig, 15))); - Assert.assertTrue(!iter.canFlashBackTo(genomeLocParser.createGenomeLoc(firstContig, 2))); - Assert.assertTrue(!iter.canFlashBackTo(genomeLocParser.createGenomeLoc(firstContig, 1))); - } - - @Test - public void testBasicIterationThenFlashBackHalfWayThenIterate() { - GenomeLoc loc = genomeLocParser.createGenomeLoc(firstContig, 0, 0); - FlashBackIterator iter = new FlashBackIterator(new FakeSeekableRODIterator(genomeLocParser,loc)); - GenomeLoc lastLocation = null; - for (int x = 0; x < 10; x++) { - iter.next(); - GenomeLoc cur = iter.position(); - if (lastLocation != null) { - Assert.assertTrue(lastLocation.isBefore(cur)); - } - lastLocation = cur; - } - iter.flashBackTo(genomeLocParser.createGenomeLoc(firstContig, 5)); - int count = 0; - while (iter.hasNext()) { - count++; - iter.next(); - } - Assert.assertEquals(count, 6); // chr1:5, 6, 7, 8, 9, and 10 - } -} - - -class 
FakeSeekableRODIterator implements LocationAwareSeekableRODIterator { - private GenomeLocParser genomeLocParser; - - // current location - private GenomeLoc location; - private FakeRODatum curROD; - private int recordCount = 10; - - public FakeSeekableRODIterator(GenomeLocParser genomeLocParser,GenomeLoc startingLoc) { - this.genomeLocParser = genomeLocParser; - this.location = genomeLocParser.createGenomeLoc(startingLoc.getContig(), startingLoc.getStart() + 1, startingLoc.getStop() + 1); - } - - /** - * Gets the header associated with the backing input stream. - * @return the ROD header. - */ - @Override - public Object getHeader() { - return null; - } - - /** - * Gets the sequence dictionary associated with the backing input stream. - * @return sequence dictionary from the ROD header. - */ - @Override - public SAMSequenceDictionary getSequenceDictionary() { - return null; - } - - - @Override - public GenomeLoc peekNextLocation() { - System.err.println("Peek Next -> " + location); - return location; - } - - @Override - public GenomeLoc position() { - return location; - } - - @Override - public RODRecordList seekForward(GenomeLoc interval) { - this.location = interval; - return next(); - } - - @Override - public boolean hasNext() { - return (recordCount > 0); - } - - @Override - public RODRecordList next() { - RODRecordList list = new FakeRODRecordList(); - curROD = new FakeRODatum("STUPIDNAME", location); - location = genomeLocParser.createGenomeLoc(location.getContig(), location.getStart() + 1, location.getStop() + 1); - list.add(curROD); - recordCount--; - return list; - } - - @Override - public void remove() { - throw new IllegalStateException("GRRR"); - } - - @Override - public void close() { - // nothing to do - } -} - - -/** for testing only */ -class FakeRODatum extends GATKFeature implements ReferenceOrderedDatum { - - final GenomeLoc location; - - public FakeRODatum(String name, GenomeLoc location) { - super(name); - this.location = location; - } - - 
@Override - public String getName() { - return "false"; - } - - @Override - public boolean parseLine(Object header, String[] parts) throws IOException { - return false; - } - - @Override - public String toSimpleString() { - return ""; - } - - @Override - public String repl() { - return ""; - } - - /** - * Used by the ROD system to determine how to split input lines - * - * @return Regex string delimiter separating fields - */ - @Override - public String delimiterRegex() { - return ""; - } - - @Override - public GenomeLoc getLocation() { - return location; - } - - @Override - public Object getUnderlyingObject() { - return this; - } - - @Override - public int compareTo(ReferenceOrderedDatum that) { - return location.compareTo(that.getLocation()); - } - - /** - * Backdoor hook to read header, meta-data, etc. associated with the file. Will be - * called by the ROD system before streaming starts - * - * @param source source data file on disk from which this rod stream will be pulled - * - * @return a header object that will be passed to parseLine command - */ - @Override - public Object initialize(File source) throws FileNotFoundException { - return null; - } - - @Override - public String getChr() { - return location.getContig(); - } - - @Override - public int getStart() { - return (int)location.getStart(); - } - - @Override - public int getEnd() { - return (int)location.getStop(); - } -} - -class FakeRODRecordList extends AbstractList implements RODRecordList { - private final List list = new ArrayList(); - - public boolean add(GATKFeature data) { - return list.add(data); - } - - @Override - public GATKFeature get(int i) { - return list.get(i); - } - - @Override - public int size() { - return list.size(); - } - - @Override - public GenomeLoc getLocation() { - return list.get(0).getLocation(); - } - - @Override - public String getName() { - return "test"; - } - - @Override - public int compareTo(RODRecordList rodRecordList) { - return 
this.list.get(0).getLocation().compareTo(rodRecordList.getLocation()); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/TestFeatureReader.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/TestFeatureReader.java deleted file mode 100644 index 90b5e7a35..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/TestFeatureReader.java +++ /dev/null @@ -1,53 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.tribble.Feature; -import htsjdk.tribble.FeatureCodec; -import htsjdk.tribble.TribbleIndexedFeatureReader; - -import java.io.IOException; - -/** - * Feature reader with additional test utilities. The iterators can be checked to see if they are closed. 
- */ -public class TestFeatureReader extends TribbleIndexedFeatureReader { - public TestFeatureReader(String featurePath, FeatureCodec codec) throws IOException { - super(featurePath, codec, true); - } - - @Override - @SuppressWarnings("unchecked") - public CheckableCloseableTribbleIterator iterator() throws IOException { - return new CheckableCloseableTribbleIterator(super.iterator()); - } - - @Override - @SuppressWarnings("unchecked") - public CheckableCloseableTribbleIterator query(String chr, int start, int end) throws IOException { - return new CheckableCloseableTribbleIterator(super.query(chr, start, end)); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/TestRMDTrackBuilder.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/TestRMDTrackBuilder.java deleted file mode 100644 index 17179f3ba..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/refdata/utils/TestRMDTrackBuilder.java +++ /dev/null @@ -1,71 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.refdata.utils; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.tribble.FeatureCodec; -import htsjdk.tribble.Tribble; -import htsjdk.tribble.index.Index; -import org.broadinstitute.gatk.engine.refdata.tracks.FeatureManager; -import org.broadinstitute.gatk.engine.refdata.tracks.IndexDictionaryUtils; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrack; -import org.broadinstitute.gatk.engine.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.gatk.utils.GenomeLocParser; - -import java.io.File; -import java.io.IOException; - -/** - * Extension of RMDTrackBuilder that creates TestFeatureReader's which in turn create CheckableCloseableTribbleIterator's. - */ -public class TestRMDTrackBuilder extends RMDTrackBuilder { - private GenomeLocParser genomeLocParser; - - public TestRMDTrackBuilder(SAMSequenceDictionary dict, GenomeLocParser genomeLocParser) { - // disable auto-index creation/locking in the RMDTrackBuilder for tests - super(dict, genomeLocParser, null, true, null); - this.genomeLocParser = genomeLocParser; - } - - @Override - public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { - String name = fileDescriptor.getName(); - File inputFile = new File(fileDescriptor.getFile()); - FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByTriplet(fileDescriptor); - FeatureCodec codec = getFeatureManager().createCodec(descriptor, name, genomeLocParser, null); - TestFeatureReader featureReader; - Index index; - try { - // Create a feature reader that creates checkable tribble iterators. 
- index = loadIndex(inputFile, codec); - featureReader = new TestFeatureReader(inputFile.getAbsolutePath(), codec); - } catch (IOException e) { - throw new RuntimeException(e); - } - SAMSequenceDictionary sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); - return new RMDTrack(descriptor.getCodecClass(), name, inputFile, featureReader, sequenceDictionary, genomeLocParser, codec); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/report/GATKReportUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/report/GATKReportUnitTest.java deleted file mode 100644 index c28e901d2..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/report/GATKReportUnitTest.java +++ /dev/null @@ -1,285 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.report; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.util.Random; -import java.io.FileInputStream; -import java.io.DataInputStream; -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.util.ArrayList; - - -public class GATKReportUnitTest extends BaseTest { - @Test - public void testParse() throws Exception { - String reportPath = publicTestDir + "exampleGATKReportv2.tbl"; - GATKReport report = new GATKReport(reportPath); - Assert.assertEquals(report.getVersion(), GATKReportVersion.V1_1); - Assert.assertEquals(report.getTables().size(), 5); - - GATKReportTable countVariants = report.getTable("CountVariants"); - Assert.assertEquals(countVariants.get(0, "nProcessedLoci"), "63025520"); - Assert.assertEquals(countVariants.get(0, "nNoCalls"), "0"); - Assert.assertEquals(countVariants.get(0, "heterozygosity"), 4.73e-06); - - GATKReportTable validationReport = report.getTable("ValidationReport"); - Assert.assertEquals(validationReport.get(2, "PPV"), Double.NaN); - } - - @DataProvider(name = "rightAlignValues") - public Object[][] getRightAlignValues() { - return new Object[][]{ - new Object[]{null, true}, - new Object[]{"null", true}, - new Object[]{"NA", true}, - new Object[]{"0", true}, - new Object[]{"0.0", true}, - new Object[]{"-0", true}, - new Object[]{"-0.0", true}, - new Object[]{String.valueOf(Long.MAX_VALUE), true}, - new Object[]{String.valueOf(Long.MIN_VALUE), true}, - new Object[]{String.valueOf(Float.MIN_NORMAL), true}, - new Object[]{String.valueOf(Double.MAX_VALUE), true}, - new Object[]{String.valueOf(Double.MIN_VALUE), true}, - new Object[]{String.valueOf(Double.POSITIVE_INFINITY), true}, - new Object[]{String.valueOf(Double.NEGATIVE_INFINITY), true}, - new 
Object[]{String.valueOf(Double.NaN), true}, - new Object[]{"hello", false} - }; - } - - @Test(dataProvider = "rightAlignValues") - public void testIsRightAlign(String value, boolean expected) { - Assert.assertEquals(GATKReportColumn.isRightAlign(value), expected, "right align of '" + value + "'"); - } - - private GATKReportTable getTableWithRandomValues() { - Random number = new Random(123L); - final int VALUESRANGE = 10; - - GATKReport report = GATKReport.newSimpleReport("TableName", "col1", "col2", "col3"); - GATKReportTable table = new GATKReportTable("testSortingTable", "table with random values sorted by columns", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN ); - - final int NUMROWS = 100; - for (int x = 0; x < NUMROWS; x++) { - report.addRow(number.nextInt(VALUESRANGE), number.nextInt(VALUESRANGE), number.nextInt(VALUESRANGE)); - } - return table; - } - - @Test(enabled = true) - public void testSortingByColumn() { - Assert.assertEquals(isSorted(getTableWithRandomValues()), true); - } - - private boolean isSorted(GATKReportTable table) { - boolean result = true; - File testingSortingTableFile = new File("testSortingFile.txt"); - - try { - // Connect print stream to the output stream - PrintStream ps = new PrintStream(testingSortingTableFile); - table.write(ps); - ps.close(); - } - catch (Exception e){ - System.err.println ("Error: " + e.getMessage()); - } - - ArrayList rows = new ArrayList(); - try { - // Open the file - FileInputStream fStream = new FileInputStream(testingSortingTableFile); - // Get the object of DataInputStream - DataInputStream in = new DataInputStream(fStream); - BufferedReader br = new BufferedReader(new InputStreamReader(in)); - String strLine; - //Read File Line By Line - while ((strLine = br.readLine()) != null) { - - String[] parts = strLine.split(" "); - int l = parts.length; - int[] row = new int[l]; - for(int n = 0; n < l; n++) { - row[n] = Integer.parseInt(parts[n]); - } - rows.add(row); - } - //Close the input stream - 
in.close(); - } catch (Exception e){//Catch exception if any - System.err.println("Error: " + e.getMessage()); - } - for (int x = 1; x < rows.size() && result; x++) { - result = checkRowOrder(rows.get(x - 1), rows.get(x)); - } - return result; - } - - private boolean checkRowOrder(int[] row1, int[] row2) { - int l = row1.length; - final int EQUAL = 0; - - int result = EQUAL; - - for(int x = 0; x < l && ( result <= EQUAL); x++) { - result = ((Integer)row1[x]).compareTo(row2[x]); - } - if (result <= EQUAL) { - return true; - } else { - return false; - } - } - - private GATKReportTable makeBasicTable() { - GATKReport report = GATKReport.newSimpleReport("TableName", "sample", "value"); - GATKReportTable table = report.getTable("TableName"); - report.addRow("foo.1", "hello"); - report.addRow("foo.2", "world"); - return table; - } - - @Test - public void testDottedSampleName() { - GATKReportTable table = makeBasicTable(); - Assert.assertEquals(table.get(0, "value"), "hello"); - Assert.assertEquals(table.get(1, "value"), "world"); - } - - @Test - public void testSimpleGATKReport() { - // Create a new simple GATK report named "TableName" with columns: Roger, is, and Awesome - GATKReport report = GATKReport.newSimpleReport("TableName", "Roger", "is", "Awesome"); - - // Add data to simple GATK report - report.addRow(12, 23.45, true); - report.addRow("ans", '3', 24.5); - report.addRow("hi", "", 2.3); - - // Print the report to console - //report.print(System.out); - - try { - File file = createTempFile("GATKReportGatherer-UnitTest", ".tbl"); - //System.out.format("The temporary file" + " has been created: %s%n", file); - PrintStream ps = new PrintStream(file); - report.print(ps); - //System.out.println("File succesfully outputed!"); - GATKReport inputRead = new GATKReport(file); - //System.out.println("File succesfully read!"); - //inputRead.print(System.out); - Assert.assertTrue(report.isSameFormat(inputRead)); - - } catch (IOException x) { - System.err.format("IOException: 
%s%n", x); - } - - } - - @Test - public void testGATKReportGatherer() { - - GATKReport report1, report2, report3; - report1 = new GATKReport(); - report1.addTable("TableName", "Description", 2); - report1.getTable("TableName").addColumn("colA", "%s"); - report1.getTable("TableName").addColumn("colB", "%c"); - report1.getTable("TableName").set(0, "colA", "NotNum"); - report1.getTable("TableName").set(0, "colB", (char) 64); - - report2 = new GATKReport(); - report2.addTable("TableName", "Description", 2); - report2.getTable("TableName").addColumn("colA", "%s"); - report2.getTable("TableName").addColumn("colB", "%c"); - report2.getTable("TableName").set(0, "colA", "df3"); - report2.getTable("TableName").set(0, "colB", 'A'); - - report3 = new GATKReport(); - report3.addTable("TableName", "Description", 2); - report3.getTable("TableName").addColumn("colA", "%s"); - report3.getTable("TableName").addColumn("colB", "%c"); - report3.getTable("TableName").set(0, "colA", "df5f"); - report3.getTable("TableName").set(0, "colB", 'c'); - - report1.concat(report2); - report1.concat(report3); - - report1.addTable("Table2", "To contain some more data types", 3); - GATKReportTable table = report1.getTable("Table2"); - table.addColumn("SomeInt", "%d"); - table.addColumn("SomeFloat", "%.16E"); - table.addColumn("TrueFalse", "%B"); - table.addRowIDMapping("12df", 0); - table.addRowIDMapping("5f", 1); - table.addRowIDMapping("RZ", 2); - table.set("12df", "SomeInt", Byte.MAX_VALUE); - table.set("12df", "SomeFloat", 34.0); - table.set("12df", "TrueFalse", true); - table.set("5f", "SomeInt", Short.MAX_VALUE); - table.set("5f", "SomeFloat", Double.MAX_VALUE); - table.set("5f", "TrueFalse", false); - table.set("RZ", "SomeInt", Long.MAX_VALUE); - table.set("RZ", "SomeFloat", 535646345.657453464576); - table.set("RZ", "TrueFalse", true); - - report1.addTable("Table3", "blah", 1, GATKReportTable.TableSortingWay.SORT_BY_ROW); - report1.getTable("Table3").addColumn("a"); - 
report1.getTable("Table3").addRowIDMapping("q", 2); - report1.getTable("Table3").addRowIDMapping("5", 3); - report1.getTable("Table3").addRowIDMapping("573s", 0); - report1.getTable("Table3").addRowIDMapping("ZZZ", 1); - report1.getTable("Table3").set("q", "a", "34"); - report1.getTable("Table3").set("5", "a", "c4g34"); - report1.getTable("Table3").set("573s", "a", "fDlwueg"); - report1.getTable("Table3").set("ZZZ", "a", "Dfs"); - - try { - File file = createTempFile("GATKReportGatherer-UnitTest", ".tbl"); - //System.out.format("The temporary file" + " has been created: %s%n", file); - PrintStream ps = new PrintStream(file); - report1.print(ps); - //System.out.println("File succesfully outputed!"); - GATKReport inputRead = new GATKReport(file); - //System.out.println("File succesfully read!"); - //inputRead.print(System.out); - Assert.assertTrue(report1.isSameFormat(inputRead)); - Assert.assertTrue(report1.equals(inputRead)); - - } catch (IOException x) { - System.err.format("IOException: %s%n", x); - } - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/DummyActiveRegionWalker.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/DummyActiveRegionWalker.java deleted file mode 100644 index e1d81b587..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/DummyActiveRegionWalker.java +++ /dev/null @@ -1,116 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice 
and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.traversals; - -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.ActiveRegionWalker; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.gatk.utils.activeregion.ActivityProfileState; - -import java.util.*; - -/** - * ActiveRegionWalker for unit testing - * - * User: depristo - * Date: 1/15/13 - * Time: 1:28 PM - */ -class DummyActiveRegionWalker extends ActiveRegionWalker { - private final double prob; - private EnumSet states = super.desiredReadStates(); - private GenomeLocSortedSet activeRegions = null; - - protected List isActiveCalls = new ArrayList(); - protected Map mappedActiveRegions = new LinkedHashMap(); - private boolean declareHavingPresetRegions = false; - - public DummyActiveRegionWalker() { - this(1.0); - } - - public DummyActiveRegionWalker(double constProb) { - this.prob = constProb; - } - - public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions, EnumSet wantStates, final boolean declareHavingPresetRegions) 
{ - this(activeRegions, declareHavingPresetRegions); - this.states = wantStates; - } - - public DummyActiveRegionWalker(GenomeLocSortedSet activeRegions, final boolean declareHavingPresetRegions) { - this(1.0); - this.activeRegions = activeRegions; - this.declareHavingPresetRegions = declareHavingPresetRegions; - } - - public void setStates(EnumSet states) { - this.states = states; - } - - @Override - public boolean hasPresetActiveRegions() { - return declareHavingPresetRegions; - } - - @Override - public GenomeLocSortedSet getPresetActiveRegions() { - return declareHavingPresetRegions ? activeRegions : null; - } - - @Override - public EnumSet desiredReadStates() { - return states; - } - - @Override - public ActivityProfileState isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - isActiveCalls.add(ref.getLocus()); - final double p = activeRegions == null || activeRegions.overlaps(ref.getLocus()) ? prob : 0.0; - return new ActivityProfileState(ref.getLocus(), p); - } - - @Override - public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { - mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); - return 0; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return 0; - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java deleted file mode 100644 index 50eb49664..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseActiveRegionsUnitTest.java +++ /dev/null @@ -1,679 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the 
"Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.traversals; - -import com.google.java.contract.PreconditionError; -import htsjdk.samtools.*; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.datasources.reads.*; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.SampleUtils; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.gatk.utils.interval.IntervalMergingRule; -import org.broadinstitute.gatk.utils.interval.IntervalUtils; -import org.broadinstitute.gatk.utils.sam.*; -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import org.broadinstitute.gatk.utils.BaseTest; -import 
org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.datasources.providers.LocusShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.executive.WindowMaker; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.activeregion.ActiveRegion; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - - -import java.io.File; -import java.io.IOException; -import java.util.*; - -/** - * Created with IntelliJ IDEA. - * User: thibault - * Date: 11/13/12 - * Time: 2:47 PM - * - * Test the Active Region Traversal Contract - * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract - */ -public class TraverseActiveRegionsUnitTest extends BaseTest { - private final static boolean ENFORCE_CONTRACTS = false; - private final static boolean DEBUG = false; - - @DataProvider(name = "TraversalEngineProvider") - public Object[][] makeTraversals() { - final List traversals = new LinkedList(); - traversals.add(new Object[]{new TraverseActiveRegions<>()}); - return traversals.toArray(new Object[][]{}); - } - - private IndexedFastaSequenceFile reference; - private SAMSequenceDictionary dictionary; - private GenomeLocParser genomeLocParser; - - private List intervals; - - private File testBAM; - - @BeforeClass - private void init() throws IOException { - //reference = new CachingIndexedFastaSequenceFile(new File("/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta")); // hg19Reference)); - reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); - dictionary = reference.getSequenceDictionary(); - genomeLocParser = new GenomeLocParser(dictionary); - - // 
TODO: reads with indels - // TODO: reads which span many regions - // TODO: reads which are partially between intervals (in/outside extension) - // TODO: duplicate reads - // TODO: read at the end of a contig - // TODO: reads which are completely outside intervals but within extension - // TODO: test the extension itself - // TODO: unmapped reads - - intervals = new ArrayList(); - intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); - intervals.add(genomeLocParser.createGenomeLoc("2", 1, 100)); - intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); - - List reads = new ArrayList(); - reads.add(buildSAMRecord("simple", "1", 100, 200)); - reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); - reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); - reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); - reads.add(buildSAMRecord("boundary_unequal", "1", 1990, 2008)); - reads.add(buildSAMRecord("boundary_1_pre", "1", 1950, 2000)); - reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050)); - reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); - reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); - reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385)); - reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400)); - reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414)); - reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); - - createBAM(reads); - } - - private void createBAM(List reads) throws IOException { - testBAM = createTempFile("TraverseActiveRegionsUnitTest", 
".bam"); - - SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, testBAM); - for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { - out.addAlignment(read); - } - out.close(); - - new File(testBAM.getAbsolutePath().replace(".bam", ".bai")).deleteOnExit(); - new File(testBAM.getAbsolutePath() + ".bai").deleteOnExit(); - } - - @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") - public void testAllBasesSeen(TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - List activeIntervals = getIsActiveIntervals(t, walker, intervals); - // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call - verifyEqualIntervals(intervals, activeIntervals); - } - - private List getIsActiveIntervals(final TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { - List activeIntervals = new ArrayList(); - for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, testBAM)) { - t.traverse(walker, dataProvider, 0); - activeIntervals.addAll(walker.isActiveCalls); - } - - return activeIntervals; - } - - @Test (enabled = ENFORCE_CONTRACTS, dataProvider = "TraversalEngineProvider", expectedExceptions = PreconditionError.class) - public void testIsActiveRangeLow (TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(-0.1); - getActiveRegions(t, walker, intervals).values(); - } - - @Test (enabled = ENFORCE_CONTRACTS, dataProvider = "TraversalEngineProvider", expectedExceptions = PreconditionError.class) - public void testIsActiveRangeHigh (TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(1.1); - getActiveRegions(t, walker, intervals).values(); - } - - @Test(enabled = true && ! 
DEBUG, dataProvider = "TraversalEngineProvider") - public void testActiveRegionCoverage(TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), true); - - Collection activeRegions = getActiveRegions(t, walker, intervals).values(); - verifyActiveRegionCoverage(intervals, activeRegions); - } - - private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { - List intervalStarts = new ArrayList(); - List intervalStops = new ArrayList(); - - for (GenomeLoc interval : intervals) { - intervalStarts.add(interval.getStartLocation()); - intervalStops.add(interval.getStopLocation()); - } - - Map baseRegionMap = new HashMap(); - - for (ActiveRegion activeRegion : activeRegions) { - for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) { - // Contract: Regions do not overlap - Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region"); - baseRegionMap.put(activeLoc, activeRegion); - } - - GenomeLoc start = activeRegion.getLocation().getStartLocation(); - if (intervalStarts.contains(start)) - intervalStarts.remove(start); - - GenomeLoc stop = activeRegion.getLocation().getStopLocation(); - if (intervalStops.contains(stop)) - intervalStops.remove(stop); - } - - for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) { - // Contract: Each location in the interval(s) is in exactly one region - // Contract: The total set of regions exactly matches the analysis interval(s) - Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region"); - baseRegionMap.remove(baseLoc); - } - - // Contract: The total set of regions exactly matches the analysis interval(s) - Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals"); - - // Contract: All explicit interval boundaries must also be region 
boundaries - Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location"); - Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); - } - - @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") - public void testActiveRegionExtensionOnContig(TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); - - Collection activeRegions = getActiveRegions(t, walker, intervals).values(); - for (ActiveRegion activeRegion : activeRegions) { - GenomeLoc loc = activeRegion.getExtendedLoc(); - - // Contract: active region extensions must stay on the contig - Assert.assertTrue(loc.getStart() > 0, "Active region extension begins at location " + loc.getStart() + ", past the left end of the contig"); - int refLen = dictionary.getSequence(loc.getContigIndex()).getSequenceLength(); - Assert.assertTrue(loc.getStop() <= refLen, "Active region extension ends at location " + loc.getStop() + ", past the right end of the contig"); - } - } - - @Test(enabled = true && !DEBUG, dataProvider = "TraversalEngineProvider") - public void testPrimaryReadMapping(TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), - EnumSet.of(ActiveRegionReadState.PRIMARY), - true); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // extended_and_np: 
Primary in 1:1-999, Non-Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_equal: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(t, walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_unequal", "boundary_1_pre", "boundary_equal", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test(enabled = true && ! 
DEBUG, dataProvider = "TraversalEngineProvider") - public void testNonPrimaryReadMapping(TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY), - true); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // Contract: Each read has the Non-Primary state in all other regions it overlaps - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // extended_and_np: Primary in 1:1-999, Non-Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_equal: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(t, walker, intervals); - ActiveRegion region; - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", 
"boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") - public void testExtendedReadMapping(TraverseActiveRegions t) { - DummyActiveRegionWalker walker = new DummyActiveRegionWalker(new GenomeLocSortedSet(genomeLocParser, intervals), - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED), - true); - - // Contract: Each read has the Primary state in a single region (or none) - // This is the region of maximum overlap for the read (earlier if tied) - - // Contract: Each read has the Non-Primary state in all other regions it overlaps - // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended - - // simple: Primary in 1:1-999 - // overlap_equal: Primary in 1:1-999 - // overlap_unequal: Primary in 1:1-999 - // boundary_equal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // boundary_unequal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_pre: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 - // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 - // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 - // outside_intervals: none - // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 - // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 - // simple20: Primary in 20:10000-10100 - - Map activeRegions = getActiveRegions(t, walker, intervals); - ActiveRegion region; - - region = 
activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); - verifyReadMapping(region, "simple", "overlap_equal", "overlap_unequal", "extended_and_np"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); - verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 10000, 20000)); - verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); - - region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); - verifyReadMapping(region, "simple20"); - } - - @Test(enabled = true && ! DEBUG, dataProvider = "TraversalEngineProvider") - public void testUnmappedReads(TraverseActiveRegions t) { - // TODO - } - - private void verifyReadMapping(ActiveRegion region, String... reads) { - Assert.assertNotNull(region, "Region was unexpectedly null"); - final Set regionReads = new HashSet(); - for (SAMRecord read : region.getReads()) { - Assert.assertFalse(regionReads.contains(read.getReadName()), "Duplicate reads detected in region " + region + " read " + read.getReadName()); - regionReads.add(read.getReadName()); - } - - Collection wantReads = new ArrayList(Arrays.asList(reads)); - for (SAMRecord read : region.getReads()) { - String regionReadName = read.getReadName(); - Assert.assertTrue(wantReads.contains(regionReadName), "Read " + regionReadName + " incorrectly assigned to active region " + region); - wantReads.remove(regionReadName); - } - - Assert.assertTrue(wantReads.isEmpty(), "Reads missing in active region " + region + ", wanted " + (wantReads.isEmpty() ? 
"" : wantReads.iterator().next())); - } - - private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals) { - return getActiveRegions(t, walker, intervals, testBAM); - } - - private Map getActiveRegions(TraverseActiveRegions t, DummyActiveRegionWalker walker, List intervals, final File bam) { - for (LocusShardDataProvider dataProvider : createDataProviders(t, walker, intervals, bam)) - t.traverse(walker, dataProvider, 0); - - return walker.mappedActiveRegions; - } - - private Collection toSingleBaseLocs(GenomeLoc interval) { - List bases = new ArrayList(); - if (interval.size() == 1) - bases.add(interval); - else { - for (int location = interval.getStart(); location <= interval.getStop(); location++) - bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); - } - - return bases; - } - - private Collection toSingleBaseLocs(List intervals) { - Set bases = new TreeSet(); // for sorting and uniqueness - for (GenomeLoc interval : intervals) - bases.addAll(toSingleBaseLocs(interval)); - - return bases; - } - - private void verifyEqualIntervals(List aIntervals, List bIntervals) { - Collection aBases = toSingleBaseLocs(aIntervals); - Collection bBases = toSingleBaseLocs(bIntervals); - - Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size()); - - Iterator aIter = aBases.iterator(); - Iterator bIter = bBases.iterator(); - while (aIter.hasNext() && bIter.hasNext()) { - GenomeLoc aLoc = aIter.next(); - GenomeLoc bLoc = bIter.next(); - Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. 
" + bLoc); - } - } - - // copied from LocusViewTemplate - protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { - SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); - header.setSequenceDictionary(dictionary); - header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - GATKSAMRecord record = new GATKSAMRecord(header); - - record.setReadName(readName); - record.setReferenceIndex(dictionary.getSequenceIndex(contig)); - record.setAlignmentStart(alignmentStart); - - Cigar cigar = new Cigar(); - int len = alignmentEnd - alignmentStart + 1; - cigar.add(new CigarElement(len, CigarOperator.M)); - record.setCigar(cigar); - record.setReadString(new String(new char[len]).replace("\0", "A")); - record.setBaseQualities(new byte[len]); - record.setReadGroup(new GATKSAMReadGroupRecord(header.getReadGroup("test"))); - - return record; - } - - private List createDataProviders(TraverseActiveRegions traverseActiveRegions, final Walker walker, List intervals, File bamFile) { - GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - engine.setGenomeLocParser(genomeLocParser); - - Collection samFiles = new ArrayList(); - SAMReaderID readerID = new SAMReaderID(bamFile, new Tags()); - samFiles.add(readerID); - - SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser, - false, - ValidationStringency.STRICT, - null, - null, - new ValidationExclusion(), - new ArrayList(), - new ArrayList(), - false, (byte)30, false, true, null, IntervalMergingRule.ALL); - - engine.setReadsDataSource(dataSource); - final Set samples = SampleUtils.getSAMFileSamples(dataSource.getHeader()); - - traverseActiveRegions.initialize(engine, walker); - List providers = new ArrayList(); - for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ActiveRegionShardBalancer())) { - for 
(WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs(), samples)) { - providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); - } - } - - return providers; - } - - // --------------------------------------------------------------------------------------------------------- - // - // Combinatorial tests to ensure reads are going into the right regions - // - // --------------------------------------------------------------------------------------------------------- - - @DataProvider(name = "CombinatorialARTTilingProvider") - public Object[][] makeCombinatorialARTTilingProvider() { - final List tests = new LinkedList(); - - final List starts = Arrays.asList( - 1, // very start of the chromosome - ArtificialBAMBuilder.BAM_SHARD_SIZE - 100, // right before the shard boundary - ArtificialBAMBuilder.BAM_SHARD_SIZE + 100 // right after the shard boundary - ); - - final List> allReadStates = Arrays.asList( - EnumSet.of(ActiveRegionReadState.PRIMARY), - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY), - EnumSet.of(ActiveRegionReadState.PRIMARY, ActiveRegionReadState.NONPRIMARY, ActiveRegionReadState.EXTENDED) - ); - - final int maxTests = Integer.MAX_VALUE; - int nTests = 0; - for ( final int readLength : Arrays.asList(100) ) { - for ( final int skips : Arrays.asList(0, 10) ) { - for ( final int start : starts ) { - for ( final int nReadsPerLocus : Arrays.asList(1, 2) ) { - for ( final int nLoci : Arrays.asList(1, 1000) ) { - final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); - bamBuilder.setReadLength(readLength); - bamBuilder.setSkipNLoci(skips); - bamBuilder.setAlignmentStart(start); - for ( EnumSet readStates : allReadStates ) { - for ( final GenomeLocSortedSet activeRegions : enumerateActiveRegions(bamBuilder.getAlignmentStart(), 
bamBuilder.getAlignmentEnd())) { - nTests++; - if ( nTests < maxTests ) // && nTests == 1238 ) - tests.add(new Object[]{new TraverseActiveRegions<>(), nTests, activeRegions, readStates, bamBuilder}); - } - } - } - } - } - } - } - - return tests.toArray(new Object[][]{}); - } - - private Collection enumerateActiveRegions(final int start, final int stop) { - // should basically cut up entire region into equal sized chunks, of - // size 10, 20, 50, 100, etc, alternating skipping pieces so they are inactive - // Need to make sure we include some edge cases: - final List activeRegions = new LinkedList(); - - for ( final int stepSize : Arrays.asList(11, 29, 53, 97) ) { - for ( final boolean startWithActive : Arrays.asList(true, false) ) { - activeRegions.add(makeActiveRegionMask(start, stop, stepSize, startWithActive)); - } - } - - // active region is the whole interval - activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start, stop))); - - // active region extends up to the end of the data, but doesn't include start - activeRegions.add(new GenomeLocSortedSet(genomeLocParser, genomeLocParser.createGenomeLoc("1", start+10, stop))); - - return activeRegions; - } - - private GenomeLocSortedSet makeActiveRegionMask(final int start, final int stop, final int stepSize, final boolean startWithActive) { - final GenomeLocSortedSet active = new GenomeLocSortedSet(genomeLocParser); - - boolean includeRegion = startWithActive; - for ( int left = start; left < stop; left += stepSize) { - final int right = left + stepSize; - final GenomeLoc region = genomeLocParser.createGenomeLoc("1", left, right); - if ( includeRegion ) - active.add(region); - includeRegion = ! includeRegion; - } - - return active; - } - - - @Test(enabled = true && ! 
DEBUG, dataProvider = "CombinatorialARTTilingProvider") - public void testARTReadsInActiveRegions(final TraverseActiveRegions traversal, final int id, final GenomeLocSortedSet activeRegions, final EnumSet readStates, final ArtificialBAMBuilder bamBuilder) { - logger.warn("Running testARTReadsInActiveRegions id=" + id + " locs " + activeRegions + " against bam " + bamBuilder); - final List intervals = Arrays.asList( - genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) - ); - - final DummyActiveRegionWalker walker = new DummyActiveRegionWalker(activeRegions, false); - walker.setStates(readStates); - - final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile()); - - final Set alreadySeenReads = new HashSet(); // for use with the primary / non-primary - for ( final ActiveRegion region : activeRegionsMap.values() ) { - final Set readNamesInRegion = readNamesInRegion(region); - int nReadsExpectedInRegion = 0; - for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { - final GenomeLoc readLoc = genomeLocParser.createGenomeLoc(read); - - boolean shouldBeInRegion = readStates.contains(ActiveRegionReadState.EXTENDED) - ? region.getExtendedLoc().overlapsP(readLoc) - : region.getLocation().overlapsP(readLoc); - - if ( ! readStates.contains(ActiveRegionReadState.NONPRIMARY) ) { - if ( alreadySeenReads.contains(read.getReadName()) ) - shouldBeInRegion = false; - else if ( shouldBeInRegion ) - alreadySeenReads.add(read.getReadName()); - } - - String msg = readNamesInRegion.contains(read.getReadName()) == shouldBeInRegion ? "" : "Region " + region + - " failed contains read check: read " + read + " with span " + readLoc + " should be in region is " + shouldBeInRegion + " but I got the opposite"; - Assert.assertEquals(readNamesInRegion.contains(read.getReadName()), shouldBeInRegion, msg); - - nReadsExpectedInRegion += shouldBeInRegion ? 
1 : 0; - } - - Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); - } - } - - private Set readNamesInRegion(final ActiveRegion region) { - final Set readNames = new LinkedHashSet(region.getReads().size()); - for ( final SAMRecord read : region.getReads() ) - readNames.add(read.getReadName()); - return readNames; - } - - // --------------------------------------------------------------------------------------------------------- - // - // Make sure all insertion reads are properly included in the active regions - // - // --------------------------------------------------------------------------------------------------------- - - @Test(dataProvider = "TraversalEngineProvider", enabled = true && ! DEBUG) - public void ensureAllInsertionReadsAreInActiveRegions(final TraverseActiveRegions traversal) { - - final int readLength = 10; - final int start = 20; - final int nReadsPerLocus = 10; - final int nLoci = 3; - - final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(reference, nReadsPerLocus, nLoci); - bamBuilder.setReadLength(readLength); - bamBuilder.setAlignmentStart(start); - - // note that the position must be +1 as the read's all I cigar puts the end 1 bp before start, leaving it out of the region - GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(bamBuilder.getHeader(),"allI",0,start+1,readLength); - allI.setCigarString(readLength + "I"); - allI.setReadGroup(new GATKSAMReadGroupRecord(bamBuilder.getHeader().getReadGroups().get(0))); - - bamBuilder.addReads(allI); - - final GenomeLocSortedSet activeRegions = new GenomeLocSortedSet(bamBuilder.getGenomeLocParser()); - activeRegions.add(bamBuilder.getGenomeLocParser().createGenomeLoc("1", 10, 30)); - final List intervals = Arrays.asList( - genomeLocParser.createGenomeLoc("1", bamBuilder.getAlignmentStart(), bamBuilder.getAlignmentEnd()) - ); - - final DummyActiveRegionWalker walker = new 
DummyActiveRegionWalker(activeRegions, false); - - final Map activeRegionsMap = getActiveRegions(traversal, walker, intervals, bamBuilder.makeTemporarilyBAMFile()); - - final ActiveRegion region = activeRegionsMap.values().iterator().next(); - int nReadsExpectedInRegion = 0; - - final Set readNamesInRegion = readNamesInRegion(region); - for ( final GATKSAMRecord read : bamBuilder.makeReads() ) { - Assert.assertTrue(readNamesInRegion.contains(read.getReadName()), - "Region " + region + " should contain read " + read + " with cigar " + read.getCigarString() + " but it wasn't"); - nReadsExpectedInRegion++; - } - - Assert.assertEquals(region.size(), nReadsExpectedInRegion, "There are more reads in active region " + region + "than expected"); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java deleted file mode 100644 index 70336a26d..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/traversals/TraverseReadsUnitTest.java +++ /dev/null @@ -1,166 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.engine.traversals; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.reference.ReferenceSequenceFile; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.gatk.engine.datasources.reads.*; -import org.broadinstitute.gatk.engine.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.gatk.engine.resourcemanagement.ThreadAllocation; -import org.broadinstitute.gatk.engine.walkers.ReadWalker; -import org.broadinstitute.gatk.tools.walkers.qc.CountReads; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import static org.testng.Assert.fail; - -/** - * - * User: aaron - * Date: Apr 24, 2009 - * Time: 3:42:16 PM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its 
documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 24, 2009 - *

- * Class TraverseReadsUnitTest - *

- * test traversing reads - */ -public class TraverseReadsUnitTest extends BaseTest { - - private ReferenceSequenceFile seq; - private SAMReaderID bam = new SAMReaderID(new File(validationDataLocation + "index_test.bam"),new Tags()); // TCGA-06-0188.aligned.duplicates_marked.bam"); - private File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta"); - private List bamList; - private ReadWalker countReadWalker; - private File output; - private TraverseReadsNano traversalEngine = null; - - private IndexedFastaSequenceFile ref = null; - private GenomeLocParser genomeLocParser = null; - private GenomeAnalysisEngine engine = null; - - @BeforeClass - public void doOnce() { - try { - ref = new CachingIndexedFastaSequenceFile(refFile); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(refFile,ex); - } - genomeLocParser = new GenomeLocParser(ref); - - engine = new GenomeAnalysisEngine(); - engine.setReferenceDataSource(refFile); - engine.setGenomeLocParser(genomeLocParser); - } - - /** - * This function does the setup of our parser, before each method call. - *

- * Called before every test case method. - */ - @BeforeMethod - public void doForEachTest() { - output = new File("testOut.txt"); - FileOutputStream out = null; - PrintStream ps; // declare a print stream object - - try { - out = new FileOutputStream(output); - } catch (FileNotFoundException e) { - e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. - fail("Couldn't open the output file"); - } - - bamList = new ArrayList(); - bamList.add(bam); - countReadWalker = new CountReads(); - - traversalEngine = new TraverseReadsNano(1); - traversalEngine.initialize(engine, countReadWalker); - } - - /** Test out that we can shard the file and iterate over every read */ - @Test - public void testUnmappedReadCount() { - SAMDataSource dataSource = new SAMDataSource(bamList,new ThreadAllocation(),null,genomeLocParser); - Iterable shardStrategy = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); - - countReadWalker.initialize(); - Object accumulator = countReadWalker.reduceInit(); - - for(Shard shard: shardStrategy) { - if (shard == null) { - fail("Shard == null"); - } - - ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null, Collections.emptyList()); - accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator); - dataProvider.close(); - } - - countReadWalker.onTraversalDone(accumulator); - - if (!(accumulator instanceof Long)) { - fail("Count read walker should return a Long."); - } - if (!accumulator.equals(new Long(10000))) { - fail("there should be 10000 mapped reads in the index file, there was " + (accumulator)); - } - } - -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/walkers/WalkerTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/walkers/WalkerTest.java deleted file mode 100644 index a4c896e74..000000000 --- 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/engine/walkers/WalkerTest.java +++ /dev/null @@ -1,455 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.engine.walkers; - -import htsjdk.tribble.Tribble; -import htsjdk.tribble.index.Index; -import htsjdk.tribble.index.IndexFactory; -import htsjdk.variant.bcf2.BCF2Utils; -import htsjdk.variant.vcf.VCFCodec; -import org.apache.commons.lang.StringUtils; -import org.broadinstitute.gatk.engine.CommandLineExecutable; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.MD5DB; -import org.broadinstitute.gatk.utils.MD5Mismatch; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.classloader.JVMUtils; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.exceptions.GATKException; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.testng.Assert; -import org.testng.annotations.AfterSuite; -import org.testng.annotations.BeforeMethod; - -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.PrintStream; -import java.text.SimpleDateFormat; -import java.util.*; - -public class WalkerTest extends BaseTest { - private static final boolean GENERATE_SHADOW_BCF = true; - private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false; - private static final boolean ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX = false; - private static final boolean ENABLE_AUTO_INDEX_CREATION_AND_LOCKING_FOR_TESTS = false; - - private static MD5DB md5DB = new MD5DB(); - - @BeforeMethod - public void initializeWalkerTests() { - logger.debug("Initializing walker tests"); - GenomeAnalysisEngine.resetRandomGenerator(); - } - - @AfterSuite - public void finalizeWalkerTests() { - logger.debug("Finalizing walker tests"); - md5DB.close(); - } - - public static MD5DB getMd5DB() { - return md5DB; - } - - public void 
validateOutputBCFIfPossible(final String name, final File resultFile) { - final File bcfFile = BCF2Utils.shadowBCF(resultFile); - if ( bcfFile != null && bcfFile.exists() ) { - logger.warn("Checking shadow BCF output file " + bcfFile + " against VCF file " + resultFile); - try { - assertVCFandBCFFilesAreTheSame(resultFile, bcfFile); - logger.warn(" Shadow BCF PASSED!"); - } catch ( Exception e ) { - Assert.fail("Exception received reading shadow BCFFile " + bcfFile + " for test " + name, e); - } - } - } - - public void validateOutputIndex(final String name, final File resultFile) { - if ( !ENABLE_ON_THE_FLY_CHECK_FOR_VCF_INDEX ) - return; - - File indexFile = Tribble.indexFile(resultFile); - //System.out.println("Putative index file is " + indexFile); - if ( indexFile.exists() ) { - if ( resultFile.getAbsolutePath().contains(".vcf") ) { - // todo -- currently we only understand VCF files! Blow up since we can't test them - throw new GATKException("Found an index created for file " + resultFile + " but we can only validate VCF files. Extend this code!"); - } - - System.out.println("Verifying on-the-fly index " + indexFile + " for test " + name + " using file " + resultFile); - Index indexFromOutputFile = IndexFactory.createDynamicIndex(resultFile, new VCFCodec()); - Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath()); - - if ( ! indexFromOutputFile.equalsIgnoreProperties(dynamicIndex) ) { - Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. 
on-the-fly %s%n", - indexFromOutputFile.getProperties(), - dynamicIndex.getProperties())); - } - } - } - - public List assertMatchingMD5s(final String testName, final String testClassName, List resultFiles, List expectedMD5s) { - List md5s = new ArrayList(); - List fails = new ArrayList(); - - for (int i = 0; i < resultFiles.size(); i++) { - MD5DB.MD5Match result = getMd5DB().testFileMD5(testName, testClassName, resultFiles.get(i), expectedMD5s.get(i), parameterize()); - validateOutputBCFIfPossible(testName, resultFiles.get(i)); - if ( ! result.failed ) { - validateOutputIndex(testName, resultFiles.get(i)); - md5s.add(result.expectedMD5); - } else { - fails.add(result); - } - } - - if ( ! fails.isEmpty() ) { - List actuals = new ArrayList(); - List expecteds = new ArrayList(); - List diffEngineOutputs = new ArrayList(); - - for ( final MD5DB.MD5Match fail : fails ) { - actuals.add(fail.actualMD5); - expecteds.add(fail.expectedMD5); - diffEngineOutputs.add(fail.diffEngineOutput); - logger.warn("Fail: " + fail.failMessage); - } - - final MD5Mismatch failure = new MD5Mismatch(actuals, expecteds, diffEngineOutputs); - Assert.fail(failure.toString()); - } - - return md5s; - } - - public String buildCommandLine(String... arguments) { - String cmdline = ""; - - for ( int argIndex = 0; argIndex < arguments.length; argIndex++ ) { - cmdline += arguments[argIndex]; - - if (argIndex < arguments.length - 1) { - cmdline += " "; - } - } - - return cmdline; - } - - public class WalkerTestSpec { - // Arguments implicitly included in all Walker command lines, unless explicitly - // disabled using the disableImplicitArgs() method below. 
- String args = ""; - int nOutputFiles = -1; - List md5s = null; - List exts = null; - Class expectedException = null; - boolean includeImplicitArgs = true; - boolean includeShadowBCF = true; - - // Name of the test class that created this test case - private Class testClass; - - // the default output path for the integration test - private File outputFileLocation = null; - - protected Map auxillaryFiles = new HashMap(); - - public WalkerTestSpec(String args, List md5s) { - this(args, -1, md5s); - } - - public WalkerTestSpec(String args, int nOutputFiles, List md5s) { - this.args = args; - this.nOutputFiles = md5s.size(); - this.md5s = md5s; - this.testClass = getCallingTestClass(); - } - - public WalkerTestSpec(String args, List exts, List md5s) { - this(args, -1, exts, md5s); - } - - public WalkerTestSpec(String args, int nOutputFiles, List exts, List md5s) { - this.args = args; - this.nOutputFiles = md5s.size(); - this.md5s = md5s; - this.exts = exts; - this.testClass = getCallingTestClass(); - } - - // @Test(expectedExceptions) doesn't work in integration tests, so use this instead - public WalkerTestSpec(String args, int nOutputFiles, Class expectedException) { - this.args = args; - this.nOutputFiles = nOutputFiles; - this.expectedException = expectedException; - this.testClass = getCallingTestClass(); - } - - private Class getCallingTestClass() { - return JVMUtils.getCallingClass(getClass()); - } - - public String getTestClassName() { - return testClass.getSimpleName(); - } - - public String getArgsWithImplicitArgs() { - String args = this.args; - if ( includeImplicitArgs ) { - args = args + (ENABLE_PHONE_HOME_FOR_TESTS ? - String.format(" -et %s ", GATKRunReport.PhoneHomeOption.AWS) : - String.format(" -et %s -K %s ", GATKRunReport.PhoneHomeOption.NO_ET, gatkKeyFile)); - if ( includeShadowBCF && GENERATE_SHADOW_BCF ) - args = args + " --generateShadowBCF "; - if ( ! 
ENABLE_AUTO_INDEX_CREATION_AND_LOCKING_FOR_TESTS ) - args = args + " --disable_auto_index_creation_and_locking_when_reading_rods "; - } - - return args; - } - - /** - * In the case where the input VCF files are malformed and cannot be fixed - * this function tells the engine to not try to generate a shadow BCF - * which will ultimately blow up... - */ - public void disableShadowBCF() { this.includeShadowBCF = false; } - public void setOutputFileLocation(File outputFileLocation) { - this.outputFileLocation = outputFileLocation; - } - - protected File getOutputFileLocation() { - return outputFileLocation; - } - - public boolean expectsException() { - return expectedException != null; - } - - public Class getExpectedException() { - if ( ! expectsException() ) throw new ReviewedGATKException("Tried to get expection for walker test that doesn't expect one"); - return expectedException; - } - - public void addAuxFile(String expectededMD5sum, File outputfile) { - auxillaryFiles.put(expectededMD5sum, outputfile); - } - - public void disableImplicitArgs() { - includeImplicitArgs = false; - } - } - - protected boolean parameterize() { - return false; - } - - public enum ParallelTestType { - TREE_REDUCIBLE, - NANO_SCHEDULED, - BOTH - } - - protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec, ParallelTestType testType) { - final List ntThreads = testType == ParallelTestType.TREE_REDUCIBLE || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); - final List cntThreads = testType == ParallelTestType.NANO_SCHEDULED || testType == ParallelTestType.BOTH ? 
Arrays.asList(1, 4) : Collections.emptyList(); - - return executeTest(name, spec, ntThreads, cntThreads); - } - - protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { - return executeTestParallel(name, spec, ParallelTestType.TREE_REDUCIBLE); - } - - protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List ntThreads, List cpuThreads) { - String originalArgs = spec.args; - Pair, List> results = null; - - boolean ran1 = false; - for ( int nt : ntThreads ) { - String extra = nt == 1 ? "" : (" -nt " + nt); - ran1 = ran1 || nt == 1; - spec.args = originalArgs + extra; - results = executeTest(name + "-nt-" + nt, spec); - } - - for ( int nct : cpuThreads ) { - if ( nct != 1 ) { - String extra = " -nct " + nct; - spec.args = originalArgs + extra; - results = executeTest(name + "-cnt-" + nct, spec); - } - } - - return results; - } - - protected Pair, List> executeTest(final String name, WalkerTestSpec spec) { - List tmpFiles = new ArrayList(); - for (int i = 0; i < spec.nOutputFiles; i++) { - String ext = spec.exts == null ? ".tmp" : "." 
+ spec.exts.get(i); - File fl = createTempFile(String.format("walktest.tmp_param.%d", i), ext); - - // Cleanup any potential shadow BCFs on exit too, if we're generating them - if ( spec.includeShadowBCF && GENERATE_SHADOW_BCF ) { - final File potentalShadowBCFFile = BCF2Utils.shadowBCF(fl); - potentalShadowBCFFile.deleteOnExit(); - new File(potentalShadowBCFFile.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION).deleteOnExit(); - } - - tmpFiles.add(fl); - } - - final String args = String.format(spec.getArgsWithImplicitArgs(), tmpFiles.toArray()); - System.out.println(Utils.dupString('-', 80)); - - if ( spec.expectsException() ) { - // this branch handles the case were we are testing that a walker will fail as expected - return executeTest(name, spec.getTestClassName(), spec.getOutputFileLocation(), null, tmpFiles, args, spec.getExpectedException()); - } else { - List md5s = new LinkedList(); - md5s.addAll(spec.md5s); - - // check to see if they included any auxillary files, if so add them to the list and set them to be deleted on exit - for (String md5 : spec.auxillaryFiles.keySet()) { - md5s.add(md5); - final File auxFile = spec.auxillaryFiles.get(md5); - auxFile.deleteOnExit(); - tmpFiles.add(auxFile); - } - return executeTest(name, spec.getTestClassName(), spec.getOutputFileLocation(), md5s, tmpFiles, args, null); - } - } - - private void qcMD5s(String name, List md5s) { - final String exampleMD5 = "709a1f482cce68992c637da3cff824a8"; - for (String md5 : md5s) { - if ( md5 == null ) - throw new IllegalArgumentException("Null MD5 found in test " + name); - if ( md5.equals("") ) // ok - continue; - if ( ! 
StringUtils.isAlphanumeric(md5) ) - throw new IllegalArgumentException("MD5 contains non-alphanumeric characters test " + name + " md5=" + md5); - if ( md5.length() != exampleMD5.length() ) - throw new IllegalArgumentException("Non-empty MD5 of unexpected number of characters test " + name + " md5=" + md5); - } - } - - - /** - * execute the test, given the following: - * @param testName the name of the test - * @param testClassName the name of the class that contains the test - * @param md5s the list of md5s - * @param tmpFiles the temp file corresponding to the md5 list - * @param args the argument list - * @param expectedException the expected exception or null - * @return a pair of file and string lists - */ - private Pair, List> executeTest(String testName, String testClassName, File outputFileLocation, List md5s, List tmpFiles, String args, Class expectedException) { - if ( md5s != null ) qcMD5s(testName, md5s); - - if (outputFileLocation != null) - args += " -o " + outputFileLocation.getAbsolutePath(); - executeTest(testName, testClassName, args, expectedException); - - if ( expectedException != null ) { - return null; - } else { - // we need to check MD5s - return new Pair, List>(tmpFiles, assertMatchingMD5s(testName, testClassName, tmpFiles, md5s)); - } - } - - /** - * execute the test, given the following: - * @param testName the name of the test - * @param testClassName the name of the class that contains the test - * @param args the argument list - * @param expectedException the expected exception or null - */ - private void executeTest(String testName, String testClassName, String args, Class expectedException) { - CommandLineGATK instance = new CommandLineGATK(); - String[] command = Utils.escapeExpressions(args); - // run the executable - boolean gotAnException = false; - try { - final String now = new SimpleDateFormat("HH:mm:ss").format(new Date()); - final String cmdline = Utils.join(" ",command); - System.out.println(String.format("[%s] Executing 
test %s:%s with GATK arguments: %s", now, testClassName, testName, cmdline)); - // also write the command line to the HTML log for convenient follow-up - // do the replaceAll so paths become relative to the current - BaseTest.log(cmdline.replaceAll(publicTestDirRoot, "").replaceAll(privateTestDirRoot, "")); - CommandLineExecutable.start(instance, command); - } catch (Exception e) { - gotAnException = true; - if ( expectedException != null ) { - // we expect an exception - //System.out.println(String.format("Wanted exception %s, saw %s", expectedException, e.getClass())); - if ( expectedException.isInstance(e) ) { - // it's the type we expected - //System.out.println(String.format(" => %s PASSED", name)); - } else { - final String message = String.format("Test %s:%s expected exception %s but instead got %s with error message %s", - testClassName, testName, expectedException, e.getClass(), e.getMessage()); - if ( e.getCause() != null ) { - final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - final PrintStream ps = new PrintStream(baos); - e.getCause().printStackTrace(ps); - BaseTest.log(message); - BaseTest.log(baos.toString()); - } - Assert.fail(message); - } - } else { - // we didn't expect an exception but we got one :-( - throw new RuntimeException(e); - } - } - - // catch failures from the integration test - if ( expectedException != null ) { - if ( ! 
gotAnException ) - // we expected an exception but didn't see it - Assert.fail(String.format("Test %s:%s expected exception %s but none was thrown", testClassName, testName, expectedException.toString())); - } else { - if ( CommandLineExecutable.result != 0) { - throw new RuntimeException("Error running the GATK with arguments: " + args); - } - } - } - - - protected File createTempFileFromBase(final String name) { - File fl = new File(name); - fl.deleteOnExit(); - return fl; - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjects.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjects.java new file mode 100644 index 000000000..108eb102f --- /dev/null +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjects.java @@ -0,0 +1,279 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.tools.walkers.diffengine; + +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.Input; +import org.broadinstitute.gatk.utils.commandline.Output; +import org.broadinstitute.gatk.engine.CommandLineGATK; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.diffengine.DiffElement; +import org.broadinstitute.gatk.utils.diffengine.DiffEngine; +import org.broadinstitute.gatk.utils.diffengine.Difference; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.engine.walkers.RodWalker; +import org.broadinstitute.gatk.utils.help.DocumentedGATKFeature; +import org.broadinstitute.gatk.utils.help.HelpConstants; + +import java.io.File; +import java.io.PrintStream; +import java.util.List; + +/** + * A generic engine for comparing tree-structured objects + * + *

+ * Compares two record-oriented files, itemizing specific difference between equivalent + * records in the two files. Reports both itemized and summarized differences. + *

+ * + *

What are the summarized differences and the DiffObjectsWalker?

+ * + *

+ * The GATK contains a summarizing difference engine that compares hierarchical data structures to emit: + *

    + *
  • A list of specific differences between the two data structures. This is similar to saying the value in field A in record 1 in file F differences from the value in field A in record 1 in file G. + *
  • A summarized list of differences ordered by frequency of the difference. This output is similar to saying field A in 50 records in files F and G differed. + *
+ *

+ * + *

+ * The GATK contains a private walker DiffObjects that allows you access to the DiffEngine capabilities on the command line. Simply provide the walker with the master and test files and it will emit summarized differences for you. + *

+ * + *

Why?

+ * + *

+ * The reason for this system is that it allows you to compare two structured files -- such as BAMs and VCFs -- for common differences among them. This is primarily useful in regression testing or optimization, where you want to ensure that the differences are those that you expect and not any others. + *

+ * + *

Input

+ *

+ * The DiffObjectsWalker works with BAM or VCF files. + *

+ * + *

Output

+ *

+ * The DiffEngine system compares to two hierarchical data structures for specific differences in the values of named + * nodes. Suppose I have two trees: + *

+ *     Tree1=(A=1 B=(C=2 D=3))
+ *     Tree2=(A=1 B=(C=3 D=3 E=4))
+ *     Tree3=(A=1 B=(C=4 D=3 E=4))
+ * 
+ *

+ * where every node in the tree is named, or is a raw value (here all leaf values are integers). The DiffEngine + * traverses these data structures by name, identifies equivalent nodes by fully qualified names + * (Tree1.A is distinct from Tree2.A, and determines where their values are equal (Tree1.A=1, Tree2.A=1, so they are). + * These itemized differences are listed as: + *

+ *     Tree1.B.C=2 != Tree2.B.C=3
+ *     Tree1.B.C=2 != Tree3.B.C=4
+ *     Tree2.B.C=3 != Tree3.B.C=4
+ *     Tree1.B.E=MISSING != Tree2.B.E=4
+ * 
+ * + *

+ * This conceptually very similar to the output of the unix command line tool diff. What's nice about DiffEngine though + * is that it computes similarity among the itemized differences and displays the count of differences names + * in the system. In the above example, the field C is not equal three times, while the missing E in Tree1 occurs + * only once. So the summary is: + * + *

+ *     *.B.C : 3
+ *     *.B.E : 1
+ * 
+ * + *

+ * where the * operator indicates that any named field matches. This output is sorted by counts, and provides an + * immediate picture of the commonly occurring differences among the files. + *

+ * Below is a detailed example of two VCF fields that differ because of a bug in the AC, AF, and AN counting routines, + * detected by the integrationtest integration (more below). You can see that in the although there are many specific + * instances of these differences between the two files, the summarized differences provide an immediate picture that + * the AC, AF, and AN fields are the major causes of the differences. + *

+ * + *

+ [testng] path                                                             count
+ [testng] *.*.*.AC                                                         6
+ [testng] *.*.*.AF                                                         6
+ [testng] *.*.*.AN                                                         6
+ [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AC  1
+ [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AF  1
+ [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000000.AN  1
+ [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AC  1
+ [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AF  1
+ [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000117.AN  1
+ [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AC  1
+ [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AF  1
+ [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000211.AN  1
+ [testng] 64b991fd3850f83614518f7d71f0532f.integrationtest.20:10000598.AC  1
+ 
+ * + *

Caveat

+ *

Because this is a walker, it requires that you pass a reference file. However the reference is not actually used, so it does not matter what you pass as reference.

+ * + * + * @author Mark DePristo + * @since 7/4/11 + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) +public class DiffObjects extends RodWalker { + /** + * Writes out a file of the DiffEngine format: + * + * See http://www.broadinstitute.org/gatk/guide/article?id=1299 for details. + */ + @Output(doc="File to which results should be written") + protected PrintStream out; + + /** + * The master file against which we will compare test. This is one of the two required + * files to do the comparison. Conceptually master is the original file contained the expected + * results, but this doesn't currently have an impact on the calculations, but might in the future. + */ + @Input(fullName="master", shortName="m", doc="Master file: expected results", required=true) + File masterFile; + + /** + * The test file against which we will compare to the master. This is one of the two required + * files to do the comparison. Conceptually test is the derived file from master, but this + * doesn't currently have an impact on the calculations, but might in the future. + */ + @Input(fullName="test", shortName="t", doc="Test file: new results to compare to the master file", required=true) + File testFile; + + /** + * The engine will read at most this number of objects from each of master and test files. This reduces + * the memory requirements for DiffObjects but does limit you to comparing at most this number of objects + */ + @Argument(fullName="maxObjectsToRead", shortName="motr", doc="Max. number of objects to read from the files. -1 [default] means unlimited", required=false) + int MAX_OBJECTS_TO_READ = -1; + + @Argument(fullName="maxRawDiffsToSummarize", shortName="maxRawDiffsToSummarize", doc="Max. number of differences to include in the summary. 
-1 [default] means unlimited", required=false) + int maxRawDiffsToSummary = -1; + + @Argument(fullName="doPairwise", shortName="doPairwise", doc="If provided, we will compute the minimum pairwise differences to summary, which can be extremely expensive", required=false) + boolean doPairwise = false; + + /** + * The max number of differences to display when summarizing. For example, if there are 10M differences, but + * maxDiffs is 10, then the comparison aborts after first ten summarized differences are shown. Note that + * the system shows differences sorted by frequency, so these 10 would be the most common between the two files. + * A value of 0 means show all possible differences. + */ + @Argument(fullName="maxDiffs", shortName="M", doc="Max. number of diffs to process", required=false) + int MAX_DIFFS = 0; + + /** + * The maximum number of singleton (occurs exactly once between the two files) to display when writing out + * the summary. Only applies if maxDiffs hasn't been exceeded. For example, if maxDiffs is 10 and maxCount1Diffs + * is 2 and there are 20 diffs with count > 1, then only 10 are shown, all of which have count above 1. + */ + @Argument(fullName="maxCount1Diffs", shortName="M1", doc="Max. number of diffs occuring exactly once in the file to process", required=false) + int MAX_COUNT1_DIFFS = 0; + + /** + * Only differences that occur more than minCountForDiff are displayed. For example, if minCountForDiff is 10, then + * a difference must occur at least 10 times between the two files to be shown. + */ + @Argument(fullName="minCountForDiff", shortName="MCFD", doc="Min number of observations for a records to display", required=false) + int minCountForDiff = 1; + + /** + * If provided, the system will write out the summarized, individual differences. May lead to enormous outputs, + * depending on how many differences are found. 
Note these are not sorted in any way, so if you have 10M + * common differences in the files, you will see 10M records, whereas the final summarize will just list the + * difference and its count of 10M. + */ + @Argument(fullName="showItemizedDifferences", shortName="SID", doc="Should we enumerate all differences between the files?", required=false) + boolean showItemizedDifferences = false; + + @Argument(fullName="iterations", doc="Number of iterations to perform, should be 1 unless you are doing memory testing", required=false) + int iterations = 1; + + DiffEngine diffEngine; + + @Override + public void initialize() { + this.diffEngine = new DiffEngine(); + } + + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer counter, Integer sum) { + return counter + sum; + } + + @Override + public void onTraversalDone(Integer sum) { + if ( iterations > 1 ) { + for ( int i = 0; i < iterations; i++ ) { + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, 20, 10, 0, -1, false); + boolean success = DiffEngine.simpleDiffFiles(masterFile, testFile, MAX_OBJECTS_TO_READ, params); + logger.info("Iteration " + i + " success " + success); + } + } else { + //out.printf("Reading master file %s%n", masterFile); + DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ); + logger.info(String.format("Read %d objects", master.size())); + //out.printf("Reading test file %s%n", testFile); + DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ); + logger.info(String.format("Read %d objects", test.size())); + +// out.printf("Master diff objects%n"); +// out.println(master.toString()); +// out.printf("Test diff objects%n"); +// out.println(test.toString()); + + List diffs = diffEngine.diff(master, test); + 
logger.info(String.format("Done computing diff with %d differences found", diffs.size())); + if ( showItemizedDifferences ) { + out.printf("Itemized results%n"); + for ( Difference diff : diffs ) + out.printf("DIFF: %s%n", diff.toString()); + } + + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(out, + MAX_DIFFS, MAX_COUNT1_DIFFS, minCountForDiff, + maxRawDiffsToSummary, doPairwise); + params.setDescending(false); + diffEngine.reportSummarizedDifferences(diffs, params); + logger.info(String.format("Done summarizing differences")); + } + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjectsIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjectsIntegrationTest.java new file mode 100644 index 000000000..feefd5f60 --- /dev/null +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/diffengine/DiffObjectsIntegrationTest.java @@ -0,0 +1,76 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.tools.walkers.diffengine; + +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import org.broadinstitute.gatk.utils.BaseTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; + +public class DiffObjectsIntegrationTest extends WalkerTest { + private class TestParams extends TestDataProvider { + public File master, test; + public String MD5; + public boolean doPairwise; + + private TestParams(String master, String test, final boolean doPairwise, String MD5) { + super(TestParams.class); + this.master = new File(master); + this.test = new File(test); + this.MD5 = MD5; + this.doPairwise = doPairwise; + } + + public String toString() { + return String.format("master=%s,test=%s,md5=%s", master, test, MD5); + } + } + + @DataProvider(name = "data") + public Object[][] createData() { + new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", true, "71869ddf9665773a842a9def4cc5f3c8"); + new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", true, "cec7c644c84ef9c96aacaed604d9ec9b"); + new TestParams(privateTestDir + "diffTestMaster.vcf", privateTestDir + "diffTestTest.vcf", false, "47546e03344103020e49d8037a7e0727"); + new TestParams(publicTestDir + "exampleBAM.bam", publicTestDir + "exampleBAM.simple.bam", false, "d27b37f7a366c8dacca5cd2590d3c6ce"); + return TestParams.getTests(TestParams.class); + } + + @Test(enabled = true, dataProvider = "data") + public void testDiffs(TestParams params) { + WalkerTestSpec spec = new WalkerTestSpec( + "-T DiffObjects -R " + publicTestDir + "exampleFASTA.fasta " + + " -m " + 
params.master + + " -t " + params.test + + (params.doPairwise ? " -doPairwise " : "") + + " -o %s", + Arrays.asList(params.MD5)); + executeTest("testDiffObjects:"+params, spec).getFirst(); + } +} + diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReadsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReadsUnitTest.java index 53b6d42d6..bb6d5bc1a 100644 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReadsUnitTest.java +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/readutils/PrintReadsUnitTest.java @@ -28,8 +28,8 @@ package org.broadinstitute.gatk.tools.walkers.readutils; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.utils.sam.ArtificialReadsTraversal; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.engine.traversals.ArtificialReadsTraversal; import org.broadinstitute.gatk.utils.sam.ArtificialGATKSAMFileWriter; import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VCFIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VCFIntegrationTest.java new file mode 100644 index 000000000..2928bba1e --- /dev/null +++ b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/VCFIntegrationTest.java @@ -0,0 +1,378 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the 
"Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.tools.walkers.variantutils; + +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.Tribble; +import htsjdk.tribble.index.AbstractIndex; +import htsjdk.tribble.index.ChrIndex; +import htsjdk.tribble.index.Index; +import htsjdk.tribble.index.IndexFactory; +import htsjdk.tribble.index.interval.IntervalTreeIndex; +import htsjdk.tribble.index.linear.LinearIndex; +import htsjdk.tribble.index.tabix.TabixIndex; +import htsjdk.tribble.util.TabixUtils; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.engine.walkers.WalkerTest; +import htsjdk.variant.vcf.VCFCodec; +import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType; +import org.testng.Assert; +import org.testng.TestException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.lang.reflect.Field; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.List; + +public class VCFIntegrationTest extends WalkerTest { + + 
@Test(enabled = true) + public void testReadingAndWritingWitHNoChanges() { + + String md5ofInputVCF = "d991abe6c6a7a778a60a667717903be0"; + String testVCF = privateTestDir + "vcf4.1.example.vcf"; + + String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; + + String test1 = baseCommand + "-T VariantAnnotator --variant " + testVCF + " -L " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList(md5ofInputVCF)); + List result = executeTest("Test Variant Annotator with no changes", spec1).getFirst(); + + String test2 = baseCommand + "-T VariantsToVCF --variant " + result.get(0).getAbsolutePath(); + WalkerTestSpec spec2 = new WalkerTestSpec(test2, 1, Arrays.asList(md5ofInputVCF)); + executeTest("Test Variants To VCF from new output", spec2); + } + + @Test(enabled = true) + public void testReadingAndWritingBreakpointAlleles() { + String testVCF = privateTestDir + "breakpoint-example.vcf"; + + String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; + + String test1 = baseCommand + "-T SelectVariants -V " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("13329ba7360a8beb3afc02569e5a20c4")); + executeTest("Test reading and writing breakpoint VCF", spec1); + } + + @Test(enabled = true) + public void testReadingLowerCaseBases() { + String testVCF = privateTestDir + "lowercaseBases.vcf"; + + String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; + + String test1 = baseCommand + "-T SelectVariants -V " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("e0e308a25e56bde1c664139bb44ed19d")); + executeTest("Test reading VCF with lower-case bases", spec1); + } + + @Test(enabled = true) + public void testReadingAndWriting1000GSVs() { + String testVCF = privateTestDir + "1000G_SVs.chr1.vcf"; + + String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; + + String test1 = baseCommand + "-T SelectVariants -V " + 
testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("bdab26dd7648a806dbab01f64db2bdab")); + executeTest("Test reading and writing 1000G Phase I SVs", spec1); + } + + @Test + public void testReadingAndWritingSamtools() { + String testVCF = privateTestDir + "samtools.vcf"; + + String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; + + String test1 = baseCommand + "-T SelectVariants -V " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("38697c195e7abf18d95dcc16c8e6d284")); + executeTest("Test reading and writing samtools vcf", spec1); + } + + @Test + public void testWritingSamtoolsWExBCFExample() { + String testVCF = privateTestDir + "ex2.vcf"; + String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; + String test1 = baseCommand + "-T SelectVariants -V " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("e8f721ce81e4fdadba13c5291027057f")); + executeTest("Test writing samtools WEx BCF example", spec1); + } + + @Test(enabled = true) + public void testReadingSamtoolsWExBCFExample() { + String testVCF = privateTestDir + "ex2.bcf"; + String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; + String test1 = baseCommand + "-T SelectVariants -V " + testVCF; + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("0439e2b4ccc63bb4ba7c283cd9ab1b25")); + executeTest("Test reading samtools WEx BCF example", spec1); + } + + // + // + // Tests to ensure that -U LENIENT_VCF_PROCESS + // + // + + @Test + public void testFailingOnVCFWithoutHeaders() { + runVCFWithoutHeaders("", "", IllegalStateException.class, false); + } + + @Test + public void testPassingOnVCFWithoutHeadersWithLenientProcessing() { + runVCFWithoutHeaders("-U LENIENT_VCF_PROCESSING", "6de8cb7457154dd355aa55befb943f88", null, true); + } + + private void runVCFWithoutHeaders(final String moreArgs, final String expectedMD5, final Class 
expectedException, final boolean disableBCF) { + final String testVCF = privateTestDir + "vcfexample2.noHeader.vcf"; + final String baseCommand = "-R " + b37KGReference + + " --no_cmdline_in_header -o %s " + + "-T VariantsToVCF -V " + testVCF + " " + moreArgs; + WalkerTestSpec spec1 = expectedException != null + ? new WalkerTestSpec(baseCommand, 1, expectedException) + : new WalkerTestSpec(baseCommand, 1, Arrays.asList(expectedMD5)); + if ( disableBCF ) + spec1.disableShadowBCF(); + executeTest("Test reading VCF without header lines with additional args " + moreArgs, spec1); + } + + // + // + // IndexCreator tests + // + // + + private class VCFIndexCreatorTest extends TestDataProvider { + private final GATKVCFIndexType type; + private final int parameter; + + private VCFIndexCreatorTest(GATKVCFIndexType type, int parameter) { + super(VCFIndexCreatorTest.class); + + this.type = type; + this.parameter = parameter; + } + + public String toString() { + return String.format("Index Type %s, Index Parameter %s", type, parameter); + } + + public Index getIndex(final File vcfFile) { + switch (type) { + case DYNAMIC_SEEK : return IndexFactory.createDynamicIndex(vcfFile, new VCFCodec(), IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + case DYNAMIC_SIZE : return IndexFactory.createDynamicIndex(vcfFile, new VCFCodec(), IndexFactory.IndexBalanceApproach.FOR_SIZE); + case LINEAR : return IndexFactory.createLinearIndex(vcfFile, new VCFCodec(), parameter); + case INTERVAL : return IndexFactory.createIntervalIndex(vcfFile, new VCFCodec(), parameter); + default : throw new TestException("Invalid index type"); + } + } + } + + @DataProvider(name = "IndexDataProvider") + public Object[][] indexCreatorData() { + new VCFIndexCreatorTest(GATKVCFIndexType.DYNAMIC_SEEK, 0); + new VCFIndexCreatorTest(GATKVCFIndexType.DYNAMIC_SIZE, 0); + new VCFIndexCreatorTest(GATKVCFIndexType.LINEAR, 100); + new VCFIndexCreatorTest(GATKVCFIndexType.LINEAR, 10000); + new 
VCFIndexCreatorTest(GATKVCFIndexType.INTERVAL, 20); + new VCFIndexCreatorTest(GATKVCFIndexType.INTERVAL, 2000); + + return TestDataProvider.getTests(VCFIndexCreatorTest.class); + } + + @Test(dataProvider = "IndexDataProvider") + public void testVCFIndexCreation(VCFIndexCreatorTest testSpec) throws NoSuchFieldException, IllegalAccessException { + + final String commandLine = " -T SelectVariants" + + " -R " + b37KGReference + + " --no_cmdline_in_header" + + " -L 20" + + " -V " + b37_NA12878_OMNI + + " --variant_index_type " + testSpec.type + + " --variant_index_parameter " + testSpec.parameter + + " -o %s "; + final String name = "testVCFIndexCreation: " + testSpec.toString(); + + final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList("")); + spec.disableShadowBCF(); + + File outVCF = executeTest(name, spec).first.get(0); + File outIdx = new File(outVCF.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION); + + final Index actualIndex = IndexFactory.loadIndex(outIdx.getAbsolutePath()); + final Index expectedIndex = testSpec.getIndex(outVCF); + + if (testSpec.type.equals("LINEAR")) + Assert.assertTrue(actualIndex instanceof LinearIndex, "Index is not a LinearIndex"); + else if (testSpec.type.equals("INTERVAL")) + Assert.assertTrue(actualIndex instanceof IntervalTreeIndex, "Index is not a IntervalTreeIndex"); + // dynamic indices ultimately resolve to one of LinearIndex or IntervalTreeIndex + + Assert.assertTrue(equivalentAbstractIndices((AbstractIndex)actualIndex, (AbstractIndex)expectedIndex), "Indices are not equivalent"); + + if (actualIndex instanceof LinearIndex && expectedIndex instanceof LinearIndex) { + Assert.assertTrue(equivalentLinearIndices((LinearIndex)actualIndex, (LinearIndex)expectedIndex, "20"), "Linear indices are not equivalent"); + } + else if (actualIndex instanceof IntervalTreeIndex && expectedIndex instanceof IntervalTreeIndex) { + Assert.assertTrue(equivalentIntervalIndices((IntervalTreeIndex)actualIndex, 
(IntervalTreeIndex)expectedIndex, "20"), "Interval indices are not equivalent"); + } + else { + Assert.fail("Indices are not of the same type"); + } + } + + private static boolean equivalentAbstractIndices(AbstractIndex thisIndex, AbstractIndex otherIndex){ + return thisIndex.getVersion() == otherIndex.getVersion() && + thisIndex.getIndexedFile().equals(otherIndex.getIndexedFile()) && + thisIndex.getIndexedFileSize() == otherIndex.getIndexedFileSize() && + thisIndex.getIndexedFileMD5().equals(otherIndex.getIndexedFileMD5()) && + thisIndex.getFlags() == otherIndex.getFlags(); + } + + private static boolean equivalentLinearIndices(LinearIndex thisIndex, LinearIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException { + htsjdk.tribble.index.linear.LinearIndex.ChrIndex thisChr = (htsjdk.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(thisIndex, chr); + htsjdk.tribble.index.linear.LinearIndex.ChrIndex otherChr = (htsjdk.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(otherIndex, chr); + + return thisChr.getName().equals(otherChr.getName()) && + //thisChr.getTotalSize() == otherChr.getTotalSize() && TODO: why does this differ? + thisChr.getNFeatures() == otherChr.getNFeatures() && + thisChr.getNBlocks() == otherChr.getNBlocks(); + } + + private static boolean equivalentIntervalIndices(IntervalTreeIndex thisIndex, IntervalTreeIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException { + htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex thisChr = (htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(thisIndex, chr); + htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex otherChr = (htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(otherIndex, chr); + + // TODO: compare trees? 
+ return thisChr.getName().equals(otherChr.getName()); + } + + private static ChrIndex getChrIndex(AbstractIndex index, String chr) throws NoSuchFieldException, IllegalAccessException { + Field f = AbstractIndex.class.getDeclaredField("chrIndices"); + f.setAccessible(true); + LinkedHashMap chrIndices = (LinkedHashMap) f.get(index); + return chrIndices.get(chr); + } + + // + // + // Block-Compressed Tabix Index Tests + // + // + + private class BlockCompressedIndexCreatorTest extends TestDataProvider { + private final String extension; + + private BlockCompressedIndexCreatorTest(String extension) { + super(BlockCompressedIndexCreatorTest.class); + + this.extension = extension; + } + + public String toString() { + return String.format("File extension %s", extension); + } + } + + @DataProvider(name = "BlockCompressedIndexDataProvider") + public Object[][] blockCompressedIndexCreatorData() { + for (final String extension : AbstractFeatureReader.BLOCK_COMPRESSED_EXTENSIONS) + new BlockCompressedIndexCreatorTest(".vcf" + extension); + + return TestDataProvider.getTests(BlockCompressedIndexCreatorTest.class); + } + + @Test(dataProvider = "BlockCompressedIndexDataProvider") + public void testBlockCompressedIndexCreation(BlockCompressedIndexCreatorTest testSpec) throws NoSuchFieldException, IllegalAccessException { + + final String commandLine = " -T SelectVariants" + + " -R " + b37KGReference + + " --no_cmdline_in_header" + + " -L 20" + + " -V " + b37_NA12878_OMNI; + final String name = "testBlockCompressedIndexCreation: " + testSpec.toString(); + + File outVCF = createTempFile("testBlockCompressedIndexCreation", testSpec.extension); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList("")); + spec.disableShadowBCF(); + spec.setOutputFileLocation(outVCF); + + executeTest(name, spec); + + File outTribbleIdx = new File(outVCF.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION); + Assert.assertFalse(outTribbleIdx.exists(), 
"testBlockCompressedIndexCreation: Want Tabix index but Tribble index exists: " + outTribbleIdx); + + File outTabixIdx = new File(outVCF.getAbsolutePath() + TabixUtils.STANDARD_INDEX_EXTENSION); + final Index actualIndex = IndexFactory.loadIndex(outTabixIdx.toString()); + Assert.assertTrue(actualIndex instanceof TabixIndex, "testBlockCompressedIndexCreation: Want Tabix index but index is not Tabix: " + outTabixIdx); + } + + // + // + // Block-Compressed Input Tests + // + // + + private class BlockCompressedInputTest extends TestDataProvider { + private final String extension; + + private BlockCompressedInputTest(String extension) { + super(BlockCompressedInputTest.class); + + this.extension = extension; + } + + public String toString() { + return String.format("File extension %s", extension); + } + } + + @DataProvider(name = "BlockCompressedInputDataProvider") + public Object[][] blockCompressedInputData() { + for (final String extension : AbstractFeatureReader.BLOCK_COMPRESSED_EXTENSIONS) + new BlockCompressedInputTest(".vcf" + extension); + + return TestDataProvider.getTests(BlockCompressedInputTest.class); + } + + @Test(dataProvider = "BlockCompressedInputDataProvider") + public void testBlockCompressedInput(BlockCompressedInputTest testSpec) { + + File inputFile = new File(BaseTest.privateTestDir, "block_compressed_input_test" + testSpec.extension); + final String commandLine = " -T SelectVariants" + + " -R " + b37KGReference + + " --no_cmdline_in_header" + + " -V " + inputFile + + " -o %s "; + final String name = "testBlockCompressedInput: " + testSpec.toString(); + + final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList("3b60668bd973e43783d0406de80d2ed2")); + + executeTest(name, spec); + } + +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java deleted file mode 100644 index ca579223e..000000000 --- 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java +++ /dev/null @@ -1,568 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils; - -import htsjdk.tribble.Tribble; -import htsjdk.tribble.readers.LineIterator; -import htsjdk.tribble.readers.PositionalBufferedStream; -import htsjdk.tribble.util.TabixUtils; -import htsjdk.variant.bcf2.BCF2Codec; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFCodec; -import htsjdk.variant.vcf.VCFConstants; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; -import org.apache.log4j.AppenderSkeleton; -import org.apache.log4j.Level; -import org.apache.log4j.Logger; -import org.apache.log4j.PatternLayout; -import org.apache.log4j.spi.LoggingEvent; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.commandline.CommandLineUtils; -import org.broadinstitute.gatk.utils.crypt.CryptUtils; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.io.IOUtils; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; -import org.testng.Assert; -import org.testng.Reporter; -import org.testng.SkipException; - -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; -import java.util.*; - -/** - * - * User: aaron - * Date: Apr 14, 2009 - * Time: 10:24:30 AM - * - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - * - */ - - -/** - * @author aaron - * @version 1.0 - * @date Apr 14, 2009 - *

- * Class BaseTest - *

- * This is the base test class for all of our test cases. All test cases should extend from this - * class; it sets up the logger, and resolves the location of directories that we rely on. - */ -@SuppressWarnings("unchecked") -public abstract class BaseTest { - /** our log, which we want to capture anything from org.broadinstitute.sting */ - public static final Logger logger = CommandLineUtils.getStingLogger(); - - private static final String CURRENT_DIRECTORY = System.getProperty("user.dir"); - public static final String gatkDirectory = System.getProperty("gatkdir", CURRENT_DIRECTORY) + "/"; - public static final String baseDirectory = System.getProperty("basedir", CURRENT_DIRECTORY) + "/"; - public static final String testType = System.getProperty("testType"); // May be null - public static final String testTypeSubDirectory = testType == null ? "" : ("/" + testType); // May be empty - - public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; - public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; - public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta"; - //public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"; - public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; - public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta"; - public static final String hg19RefereneWithChrPrefixInChromosomeNames = "/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta"; - public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; - public static final String validationDataLocation = GATKDataLocation + "Validation_Data/"; - public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/"; - public 
static final String comparisonDataLocation = GATKDataLocation + "Comparisons/"; - public static final String annotationDataLocation = GATKDataLocation + "Annotations/"; - - public static final String b37GoodBAM = validationDataLocation + "/CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; - public static final String b37GoodNA12878BAM = validationDataLocation + "/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; - public static final String b37_NA12878_OMNI = validationDataLocation + "/NA12878.omni.vcf"; - - public static final String dbsnpDataLocation = GATKDataLocation; - public static final String b36dbSNP129 = dbsnpDataLocation + "dbsnp_129_b36.vcf"; - public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.vcf"; - public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf"; - public static final String b37dbSNP138 = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_138.b37.vcf"; - public static final String hg18dbSNP132 = dbsnpDataLocation + "dbsnp_132.hg18.vcf"; - - public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/"; - public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf"; - - public static final String intervalsLocation = "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/"; - public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; - public static final String hg19Chr20Intervals = GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; - - public static final boolean REQUIRE_NETWORK_CONNECTION = false; - private static final String networkTempDirRoot = "/broad/hptmp/"; - private static final boolean networkTempDirRootExists = new File(networkTempDirRoot).exists(); - private static final File networkTempDirFile; - - private static final String 
privateTestDirRelative = "private/gatk-tools-private/src/test/resources/"; - public static final String privateTestDir = new File(gatkDirectory, privateTestDirRelative).getAbsolutePath() + "/"; - protected static final String privateTestDirRoot = privateTestDir.replace(privateTestDirRelative, ""); - - private static final String publicTestDirRelative = "public/gatk-engine/src/test/resources/"; - public static final String publicTestDir = new File(gatkDirectory, publicTestDirRelative).getAbsolutePath() + "/"; - protected static final String publicTestDirRoot = publicTestDir.replace(publicTestDirRelative, ""); - - public static final String keysDataLocation = validationDataLocation + "keys/"; - - public static final String gatkKeyFile = CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key"; - - public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta"; - - public final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; - public final static String NA12878_WEx = privateTestDir + "CEUTrio.HiSeq.WEx.b37_decoy.NA12878.20_10_11mb.bam"; - - public static final boolean queueTestRunModeIsSet = System.getProperty("queuetest.run", "").equals("true"); - - /** before the class starts up */ - static { - // setup a basic log configuration - CommandLineUtils.configureConsoleLogging(); - - // setup our log layout - PatternLayout layout = new PatternLayout(); - layout.setConversionPattern("TEST %C{1}.%M - %d{HH:mm:ss,SSS} - %m%n"); - - // now set the layout of all the loggers to our layout - CommandLineUtils.setLayout(logger, layout); - - // Set the Root logger to only output warnings. 
- logger.setLevel(Level.WARN); - - if (networkTempDirRootExists) { - networkTempDirFile = IOUtils.tempDir("temp.", ".dir", new File(networkTempDirRoot + System.getProperty("user.name"))); - networkTempDirFile.deleteOnExit(); - } else { - networkTempDirFile = null; - } - - - if ( REQUIRE_NETWORK_CONNECTION ) { - // find our file sources - if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { - logger.fatal("We can't locate the reference directories. Aborting!"); - throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); - } - } - } - - /** - * Simple generic utility class to creating TestNG data providers: - * - * 1: inherit this class, as in - * - * private class SummarizeDifferenceTest extends TestDataProvider { - * public SummarizeDifferenceTest() { - * super(SummarizeDifferenceTest.class); - * } - * ... - * } - * - * Provide a reference to your class to the TestDataProvider constructor. - * - * 2: Create instances of your subclass. Return from it the call to getTests, providing - * the class type of your test - * - * - * {@literal @}DataProvider(name = "summaries") - * public Object[][] createSummaries() { - * new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); - * new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); - * return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); - * } - * - * - * This class magically tracks created objects of this - */ - public static class TestDataProvider { - private static final Map> tests = new HashMap<>(); - protected String name; - - /** - * Create a new TestDataProvider instance bound to the class variable C - */ - public TestDataProvider(Class c, String name) { - if ( ! 
tests.containsKey(c) ) - tests.put(c, new ArrayList<>()); - tests.get(c).add(this); - this.name = name; - } - - public TestDataProvider(Class c) { - this(c, ""); - } - - public void setName(final String name) { - this.name = name; - } - - /** - * Return all of the data providers in the form expected by TestNG of type class C - * @param c - * @return - */ - public static Object[][] getTests(Class c) { - List params2 = new ArrayList(); - for ( Object x : tests.get(c) ) params2.add(new Object[]{x}); - return params2.toArray(new Object[][]{}); - } - - @Override - public String toString() { - return "TestDataProvider("+name+")"; - } - } - - /** - * test if the file exists - * - * @param file name as a string - * @return true if it exists - */ - public static boolean fileExist(String file) { - File temp = new File(file); - return temp.exists(); - } - - /** - * this appender looks for a specific message in the log4j stream. - * It can be used to verify that a specific message was generated to the logging system. - */ - public static class ValidationAppender extends AppenderSkeleton { - - private boolean foundString = false; - private String targetString = ""; - - public ValidationAppender(String target) { - targetString = target; - } - - @Override - protected void append(LoggingEvent loggingEvent) { - if (loggingEvent.getMessage().equals(targetString)) - foundString = true; - } - - public void close() { - // do nothing - } - - public boolean requiresLayout() { - return false; - } - - public boolean foundString() { - return foundString; - } - } - - /** - * Creates a temp file that will be deleted on exit after tests are complete. - * @param name Prefix of the file. - * @param extension Extension to concat to the end of the file. - * @return A file in the temporary directory starting with name, ending with extension, which will be deleted after the program exits. 
- */ - public static File createTempFile(final String name, final String extension) { - try { - final File file = File.createTempFile(name, extension); - file.deleteOnExit(); - - // Mark corresponding indices for deletion on exit as well just in case an index is created for the temp file: - new File(file.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION).deleteOnExit(); - new File(file.getAbsolutePath() + TabixUtils.STANDARD_INDEX_EXTENSION).deleteOnExit(); - new File(file.getAbsolutePath() + ".bai").deleteOnExit(); - new File(file.getAbsolutePath().replaceAll(extension + "$", ".bai")).deleteOnExit(); - - return file; - } catch (IOException ex) { - throw new ReviewedGATKException("Cannot create temp file: " + ex.getMessage(), ex); - } - } - - /** - * Creates a temp list file that will be deleted on exit after tests are complete. - * @param tempFilePrefix Prefix of the file. - * @param lines lines to write to the file. - * @return A list file in the temporary directory starting with tempFilePrefix, which will be deleted after the program exits. - */ - public static File createTempListFile(final String tempFilePrefix, final String... lines) { - try { - final File tempListFile = createTempFile(tempFilePrefix, ".list"); - - final PrintWriter out = new PrintWriter(tempListFile); - for (final String line : lines) { - out.println(line); - } - out.close(); - - return tempListFile; - } catch (IOException ex) { - throw new ReviewedGATKException("Cannot create temp file: " + ex.getMessage(), ex); - } - } - - /** - * Creates a temp file that will be deleted on exit after tests are complete. - * @param name Name of the file. - * @return A file in the network temporary directory with name, which will be deleted after the program exits. - * @throws SkipException when the network is not available. 
- */ - public static File tryCreateNetworkTempFile(String name) { - if (!networkTempDirRootExists) - throw new SkipException("Network temporary directory does not exist: " + networkTempDirRoot); - File file = new File(networkTempDirFile, name); - file.deleteOnExit(); - return file; - } - - /** - * Log this message so that it shows up inline during output as well as in html reports - * - * @param message - */ - public static void log(final String message) { - Reporter.log(message, true); - } - - private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1; - - public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) { - Assert.assertTrue(actual instanceof Double, "Not a double"); - assertEqualsDoubleSmart((double)(Double)actual, (double)expected); - } - - public static final void assertEqualsDoubleSmart(final Object actual, final Double expected, final double tolerance) { - Assert.assertTrue(actual instanceof Double, "Not a double"); - assertEqualsDoubleSmart((double)(Double)actual, (double)expected, tolerance); - } - - public static final void assertEqualsDoubleSmart(final double actual, final double expected) { - assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE); - } - - public static final void assertEqualsSet(final Set actual, final Set expected, final String info) { - final Set actualSet = new HashSet(actual); - final Set expectedSet = new HashSet(expected); - Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps - } - - public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { - assertEqualsDoubleSmart(actual, expected, tolerance, null); - } - - public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) { - if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately - Assert.assertTrue(Double.isNaN(actual), "expected 
is nan, actual is not"); - else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately - Assert.assertTrue(Double.isInfinite(actual), "expected is infinite, actual is not"); - else { - final double delta = Math.abs(actual - expected); - final double ratio = Math.abs(actual / expected - 1.0); - Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual - + " not within tolerance " + tolerance - + (message == null ? "" : "message: " + message)); - } - } - - public static void assertVariantContextsAreEqual( final VariantContext actual, final VariantContext expected ) { - Assert.assertNotNull(actual, "VariantContext expected not null"); - Assert.assertEquals(actual.getChr(), expected.getChr(), "chr"); - Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); - Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); - Assert.assertEquals(actual.getID(), expected.getID(), "id"); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); - - assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); - Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); - Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered"); - assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); - assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); - - Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); - if ( expected.hasGenotypes() ) { - assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set"); - Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); - final Set samples = expected.getSampleNames(); - for ( final String sample : samples ) { - assertGenotypesAreEqual(actual.getGenotype(sample), 
expected.getGenotype(sample)); - } - } - } - - public static void assertVariantContextStreamsAreEqual(final Iterable actual, final Iterable expected) { - final Iterator actualIT = actual.iterator(); - final Iterator expectedIT = expected.iterator(); - - while ( expectedIT.hasNext() ) { - final VariantContext expectedVC = expectedIT.next(); - if ( expectedVC == null ) - continue; - - VariantContext actualVC; - do { - Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual"); - actualVC = actualIT.next(); - } while ( actualIT.hasNext() && actualVC == null ); - - if ( actualVC == null ) - Assert.fail("Too few records in actual"); - - assertVariantContextsAreEqual(actualVC, expectedVC); - } - Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual"); - } - - - public static void assertGenotypesAreEqual(final Genotype actual, final Genotype expected) { - Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names"); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles"); - Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string"); - Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type"); - - // filters are the same - Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields"); - Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered"); - - // inline attributes - Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); - Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD())); - Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); - Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); - Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); - Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ"); - Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP"); - - 
Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods"); - Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString"); - Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods"); - Assert.assertTrue(Arrays.equals(actual.getPL(), expected.getPL())); - - Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual"); - assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); - Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased"); - Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy"); - } - - public static void assertVCFHeadersAreEqual(final VCFHeader actual, final VCFHeader expected) { - Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines"); - - // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted? 
- //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder()); - final List actualLines = new ArrayList(actual.getMetaDataInSortedOrder()); - final List expectedLines = new ArrayList(expected.getMetaDataInSortedOrder()); - for ( int i = 0; i < actualLines.size(); i++ ) { - Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines"); - } - } - - public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { - final Pair> vcfData = GATKVCFUtils.readAllVCs(vcfFile, new VCFCodec()); - final Pair> bcfData = GATKVCFUtils.readAllVCs(bcfFile, new BCF2Codec()); - assertVCFHeadersAreEqual(bcfData.getFirst(), vcfData.getFirst()); - assertVariantContextStreamsAreEqual(bcfData.getSecond(), vcfData.getSecond()); - } - - private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { - if ( expected instanceof Double ) { - // must be very tolerant because doubles are being rounded to 2 sig figs - assertEqualsDoubleSmart(actual, (Double) expected, 1e-2); - } else - Assert.assertEquals(actual, expected, "Attribute " + key); - } - - private static void assertAttributesEquals(final Map actual, Map expected) { - final Set expectedKeys = new HashSet(expected.keySet()); - - for ( final Map.Entry act : actual.entrySet() ) { - final Object actualValue = act.getValue(); - if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) { - final Object expectedValue = expected.get(act.getKey()); - if ( expectedValue instanceof List ) { - final List expectedList = (List)expectedValue; - Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't"); - final List actualList = (List)actualValue; - Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size"); - for ( int i = 0; i < expectedList.size(); i++ ) - assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i)); 
- } else - assertAttributeEquals(act.getKey(), actualValue, expectedValue); - } else { - // it's ok to have a binding in x -> null that's absent in y - Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); - } - expectedKeys.remove(act.getKey()); - } - - // now expectedKeys contains only the keys found in expected but not in actual, - // and they must all be null - for ( final String missingExpected : expectedKeys ) { - final Object value = expected.get(missingExpected); - Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" ); - } - } - - private static final boolean isMissing(final Object value) { - if ( value == null ) return true; - else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true; - else if ( value instanceof List ) { - // handles the case where all elements are null or the list is empty - for ( final Object elt : (List)value) - if ( elt != null ) - return false; - return true; - } else - return false; - } - - /** - * Checks whether two double array contain the same values or not. - * @param actual actual produced array. - * @param expected expected array. - * @param tolerance maximum difference between double value to be consider equivalent. 
- */ - protected static void assertEqualsDoubleArray(final double[] actual, final double[] expected, final double tolerance) { - if (expected == null) - Assert.assertNull(actual); - else { - Assert.assertNotNull(actual); - Assert.assertEquals(actual.length,expected.length,"array length"); - } - for (int i = 0; i < actual.length; i++) - Assert.assertEquals(actual[i],expected[i],tolerance,"array position " + i); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/BaseUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/BaseUtilsUnitTest.java deleted file mode 100644 index b532bafa9..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/BaseUtilsUnitTest.java +++ /dev/null @@ -1,179 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Random; - - -public class BaseUtilsUnitTest extends BaseTest { - @BeforeClass - public void init() { } - - @Test - public void testMostFrequentBaseFraction() { - logger.warn("Executing testMostFrequentBaseFraction"); - - compareFrequentBaseFractionToExpected("AAAAA", 1.0); - compareFrequentBaseFractionToExpected("ACCG", 0.5); - compareFrequentBaseFractionToExpected("ACCCCTTTTG", 4.0/10.0); - } - - private void compareFrequentBaseFractionToExpected(String sequence, double expected) { - double fraction = BaseUtils.mostFrequentBaseFraction(sequence.getBytes()); - Assert.assertTrue(MathUtils.compareDoubles(fraction, expected) == 0); - } - - @Test - public void testConvertIUPACtoN() { - - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'A'}, false, false), new byte[]{'A', 'A', 'A'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'W', 'A', 'A'}, false, false), new byte[]{'N', 'A', 'A'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'M', 'A'}, false, false), new byte[]{'A', 'N', 'A'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'K'}, false, false), new byte[]{'A', 'A', 'N'}); - checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'M', 'M', 'M'}, false, false), new byte[]{'N', 'N', 'N'}); - } - - private void checkBytesAreEqual(final byte[] b1, final byte[] b2) { - for ( int i = 0; i < b1.length; i++ ) - Assert.assertEquals(b1[i], b2[i]); - } - - @Test - public void testConvertBasesToIUPAC() { - - for ( final BaseUtils.Base b : BaseUtils.Base.values() ) { - if ( 
BaseUtils.isRegularBase(b.base) ) - Assert.assertEquals(BaseUtils.basesToIUPAC(b.base, b.base), b.base, "testing same base"); - } - - Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'X'), 'N', "testing non-standard base"); - Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'A'), 'N', "testing non-standard base"); - Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'X'), 'N', "testing non-standard base"); - - Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'T'), 'W', "testing A/T=W"); - Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'T', (byte)'A'), 'W', "testing T/A=W"); - Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'G', (byte) 'T'), 'K', "testing G/T=K"); - Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'T', (byte) 'G'), 'K', "testing T/G=K"); - } - - @Test - public void testTransitionTransversion() { - logger.warn("Executing testTransitionTransversion"); - - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'T' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'C' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'G' ) == BaseUtils.BaseSubstitutionType.TRANSITION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'C', (byte)'A' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'C', (byte)'T' ) == BaseUtils.BaseSubstitutionType.TRANSITION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'C', (byte)'G' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'T', (byte)'A' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'T', (byte)'C' ) == BaseUtils.BaseSubstitutionType.TRANSITION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'T', (byte)'G' ) == 
BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'G', (byte)'A' ) == BaseUtils.BaseSubstitutionType.TRANSITION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'G', (byte)'T' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'G', (byte)'C' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'a', (byte)'T' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'a', (byte)'C' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'T' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'C' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'t' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'c' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'a', (byte)'t' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'a', (byte)'c' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); - } - - @Test - public void testReverseComplementString() { - logger.warn("Executing testReverseComplementString"); - - compareRCStringToExpected("ACGGT", "ACCGT"); - compareRCStringToExpected("TCGTATATCTCGCTATATATATATAGCTCTAGTATA", "TATACTAGAGCTATATATATATAGCGAGATATACGA"); - compareRCStringToExpected("AAAN", "NTTT"); - } - - private void compareRCStringToExpected(String fw, String rcExp) { - String rcObs = BaseUtils.simpleReverseComplement(fw); - - Assert.assertTrue(rcObs.equals(rcExp)); - } - - @Test(dataProvider="baseComparatorData") - public void 
testBaseComparator(final Collection basesToSort) { - final ArrayList sorted = new ArrayList<>(basesToSort); - Collections.sort(sorted, BaseUtils.BASES_COMPARATOR); - for (int i = 0; i < sorted.size(); i++) { - Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(i)),0); - final String iString = new String(sorted.get(i)); - for (int j = i; j < sorted.size(); j++) { - final String jString = new String(sorted.get(j)); - if (iString.compareTo(jString) == 0) - Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)),0); - else - Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) * iString.compareTo(jString) > 0); - Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) <= 0); - } - } - } - - @DataProvider(name="baseComparatorData") - public Object[][] baseComparatorData() { - final int testCount = 10; - final int testSizeAverage = 10; - final int testSizeDeviation = 10; - final int haplotypeSizeAverage = 100; - final int haplotypeSizeDeviation = 100; - - final Object[][] result = new Object[testCount][]; - - GenomeAnalysisEngine.resetRandomGenerator(); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - for (int i = 0; i < testCount; i++) { - final int size = (int) Math.max(0,rnd.nextDouble() * testSizeDeviation + testSizeAverage); - final ArrayList bases = new ArrayList<>(size); - for (int j = 0; j < size; j++) { - final int jSize = (int) Math.max(0,rnd.nextDouble() * haplotypeSizeDeviation + haplotypeSizeAverage); - final byte[] b = new byte[jSize]; - for (int k = 0; k < jSize; k++) - b[k] = BaseUtils.baseIndexToSimpleBase(rnd.nextInt(4)); - bases.add(b); - } - result[i] = new Object[] { bases }; - } - return result; - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/BitSetUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/BitSetUtilsUnitTest.java deleted file mode 
100644 index 87a5914a3..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/BitSetUtilsUnitTest.java +++ /dev/null @@ -1,85 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils; - -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.Random; - -/** - * @author Mauricio Carneiro - * @since 3/5/12 - */ - -public class BitSetUtilsUnitTest { - private static int RANDOM_NUMBERS_TO_TRY = 87380; - private static Random random; - - @BeforeClass - public void init() { - random = GenomeAnalysisEngine.getRandomGenerator(); - } - - @Test(enabled = true) - public void testLongBitSet() { - long[] numbers = {0L, 1L, 428L, 65536L, 239847L, 4611686018427387903L, Long.MAX_VALUE, Long.MIN_VALUE, -1L, -2L, -7L, -128L, -65536L, -100000L}; - for (long n : numbers) - Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n); - - for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) { - long n = random.nextLong(); - Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n); // Because class Random uses a seed with only 48 bits, this algorithm will not return all possible long values. - } - } - - @Test(enabled = true) - public void testShortBitSet() { - short[] numbers = {0, 1, 428, 25934, 23847, 16168, Short.MAX_VALUE, Short.MIN_VALUE, -1, -2, -7, -128, -12312, -31432}; - for (long n : numbers) - Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n); - - for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) { - short n = (short) random.nextInt(); - Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n); - } - } - - @Test(enabled = false) - public void testDNAAndBitSetConversion() { - String[] dna = {"AGGTGTTGT", "CCCCCCCCCCCCCC", "GGGGGGGGGGGGGG", "TTTTTTTTTTTTTT", "GTAGACCGATCTCAGCTAGT", "AACGTCAATGCAGTCAAGTCAGACGTGGGTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"}; - - // Test all contexts of size 1-8. 
- //for (long n = 0; n < RANDOM_NUMBERS_TO_TRY; n++) - // Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(ContextCovariate.contextFromKey(BitSetUtils.bitSetFrom(n)))), n); - - // Test the special cases listed in the dna array - //for (String d : dna) - // Assert.assertEquals(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(d)), d); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MD5DB.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MD5DB.java deleted file mode 100644 index d7c992906..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MD5DB.java +++ /dev/null @@ -1,312 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils; - -import org.apache.commons.io.FileUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.engine.walkers.diffengine.DiffEngine; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; - -import java.io.*; -import java.util.Arrays; - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 7/18/11 - * Time: 9:10 AM - * - * Utilities for manipulating the MD5 database of previous results - */ -public class MD5DB { - public static final Logger logger = Logger.getLogger(MD5DB.class); - - /** - * Subdirectory under the ant build directory where we store integration test md5 results - */ - private static final int MAX_RECORDS_TO_READ = 1000000; - private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = -1; - public static final String LOCAL_MD5_DB_DIR = "integrationtests"; - public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests"; - - // tracking and emitting a data file of origina and new md5s - private final File MD5MismatchesFile; - private final PrintStream md5MismatchStream; - - public MD5DB() { - this(new File(MD5DB.LOCAL_MD5_DB_DIR + "/md5mismatches.txt")); - } - - public MD5DB(final File MD5MismatchesFile) { - this.MD5MismatchesFile = MD5MismatchesFile; - - ensureMd5DbDirectory(); - - logger.debug("Creating md5 mismatch db at " + MD5MismatchesFile); - try { - md5MismatchStream = new PrintStream(new FileOutputStream(MD5MismatchesFile)); - md5MismatchStream.printf("%s\t%s\t%s%n", "expected", "observed", "test"); - } catch ( FileNotFoundException e ) { - throw new ReviewedGATKException("Failed to open md5 mismatch file", e); - } - - } - - public void close() { - if ( md5MismatchStream != null ) { - logger.debug("Closeing md5 mismatch db at " + MD5MismatchesFile); - md5MismatchStream.close(); - } - } - - // ---------------------------------------------------------------------- - // - // MD5 DB stuff - // - // 
---------------------------------------------------------------------- - - /** - * Create the MD5 file directories if necessary - */ - private void ensureMd5DbDirectory() { - File dir = new File(LOCAL_MD5_DB_DIR); - if ( ! dir.exists() ) { - System.out.printf("##### Creating MD5 db %s%n", LOCAL_MD5_DB_DIR); - if ( ! dir.mkdir() ) { - // Need to check AGAIN whether the dir exists, because we might be doing multi-process parallelism - // within the same working directory, and another GATK instance may have come along and created the - // directory between the calls to exists() and mkdir() above. - if ( ! dir.exists() ) { - throw new ReviewedGATKException("Infrastructure failure: failed to create md5 directory " + LOCAL_MD5_DB_DIR); - } - } - } - } - - /** - * Returns the path to an already existing file with the md5 contents, or valueIfNotFound - * if no such file exists in the db. - * - * @param md5 - * @param valueIfNotFound - * @return - */ - public String getMD5FilePath(final String md5, final String valueIfNotFound) { - // we prefer the global db to the local DB, so match it first - for ( String dir : Arrays.asList(GLOBAL_MD5_DB_DIR, LOCAL_MD5_DB_DIR)) { - File f = getFileForMD5(md5, dir); - if ( f.exists() && f.canRead() ) - return f.getAbsolutePath(); - } - - return valueIfNotFound; - } - - /** - * Utility function that given a file's md5 value and the path to the md5 db, - * returns the canonical name of the file. 
For example, if md5 is XXX and db is YYY, - * this will return YYY/XXX.integrationtest - * - * @param md5 - * @param dbPath - * @return - */ - private File getFileForMD5(final String md5, final String dbPath) { - final String basename = String.format("%s.integrationtest", md5); - return new File(dbPath + "/" + basename); - } - - /** - * Copies the results file with md5 value to its canonical file name and db places - * - * @param md5 - * @param resultsFile - */ - private void updateMD5Db(final String md5, final File resultsFile) { - copyFileToDB(getFileForMD5(md5, LOCAL_MD5_DB_DIR), resultsFile); - copyFileToDB(getFileForMD5(md5, GLOBAL_MD5_DB_DIR), resultsFile); - } - - /** - * Low-level utility routine that copies resultsFile to dbFile - * @param dbFile - * @param resultsFile - */ - private void copyFileToDB(File dbFile, final File resultsFile) { - if ( ! dbFile.exists() ) { - // the file isn't already in the db, copy it over - System.out.printf("##### Updating MD5 file: %s%n", dbFile.getPath()); - try { - FileUtils.copyFile(resultsFile, dbFile); - } catch ( IOException e ) { - System.out.printf("##### Skipping update, cannot write file %s%n", dbFile); - } - } else { - //System.out.printf("##### MD5 file is up to date: %s%n", dbFile.getPath()); - } - } - - /** - * Returns the byte[] of the entire contents of file, for md5 calculations - * @param file - * @return - * @throws IOException - */ - private static byte[] getBytesFromFile(File file) throws IOException { - InputStream is = new FileInputStream(file); - - // Get the size of the file - long length = file.length(); - - if (length > Integer.MAX_VALUE) { - // File is too large - } - - // Create the byte array to hold the data - byte[] bytes = new byte[(int) length]; - - // Read in the bytes - int offset = 0; - int numRead = 0; - while (offset < bytes.length - && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) { - offset += numRead; - } - - // Ensure all the bytes have been read in - if (offset < 
bytes.length) { - throw new IOException("Could not completely read file " + file.getName()); - } - - // Close the input stream and return bytes - is.close(); - return bytes; - } - - public static class MD5Match { - public final String actualMD5, expectedMD5; - public final String failMessage; - public final String diffEngineOutput; - public final boolean failed; - - public MD5Match(final String actualMD5, final String expectedMD5, final String failMessage, final String diffEngineOutput, final boolean failed) { - this.actualMD5 = actualMD5; - this.expectedMD5 = expectedMD5; - this.failMessage = failMessage; - this.diffEngineOutput = diffEngineOutput; - this.failed = failed; - } - } - - /** - * Tests a file MD5 against an expected value, returning an MD5Match object containing a description of the - * match or mismatch. In case of a mismatch, outputs a description of the mismatch to various log files/streams. - * - * NOTE: This function WILL NOT throw an exception if the MD5s are different. - * - * @param testName Name of the test. - * @param testClassName Name of the class that contains the test. - * @param resultsFile File to MD5. - * @param expectedMD5 Expected MD5 value. - * @param parameterize If true or if expectedMD5 is an empty string, will print out the calculated MD5 instead of error text. - * @return an MD5Match object containing a description of the match/mismatch. 
Will have its "failed" field set - * to true if there was a mismatch (unless we're using the "parameterize" argument) - */ - public MD5Match testFileMD5(final String testName, final String testClassName, final File resultsFile, final String expectedMD5, final boolean parameterize) { - final String actualMD5 = calculateFileMD5(resultsFile); - String diffEngineOutput = ""; - String failMessage = ""; - boolean failed = false; - - // copy md5 to integrationtests - updateMD5Db(actualMD5, resultsFile); - - if (parameterize || expectedMD5.equals("")) { - BaseTest.log(String.format("PARAMETERIZATION: file %s has md5 = %s", resultsFile, actualMD5)); - } else if ( ! expectedMD5.equals(actualMD5) ) { - failed = true; - failMessage = String.format("%s:%s has mismatching MD5s: expected=%s observed=%s", testClassName, testName, expectedMD5, actualMD5); - diffEngineOutput = logMD5MismatchAndGetDiffEngineOutput(testName, testClassName, expectedMD5, actualMD5); - } - - return new MD5Match(actualMD5, expectedMD5, failMessage, diffEngineOutput, failed); - } - - /** - * Calculates the MD5 for the specified file and returns it as a String - * - * @param file file whose MD5 to calculate - * @return file's MD5 in String form - * @throws RuntimeException if the file could not be read - */ - public String calculateFileMD5( final File file ) { - try { - return Utils.calcMD5(getBytesFromFile(file)); - } - catch ( Exception e ) { - throw new RuntimeException("Failed to read bytes from file: " + file + " for MD5 calculation", e); - } - } - - /** - * Logs a description (including diff engine output) of the MD5 mismatch between the expectedMD5 - * and actualMD5 to a combination of BaseTest.log(), the md5MismatchStream, and stdout, then returns - * the diff engine output. 
- * - * @param testName name of the test that generated the mismatch - * @param testClassName name of the class containing the test that generated the mismatch - * @param expectedMD5 the MD5 we were expecting from this test - * @param actualMD5 the MD5 we actually calculated from the test output - * @return the diff engine output produced while logging the description of the mismatch - */ - private String logMD5MismatchAndGetDiffEngineOutput(final String testName, final String testClassName, final String expectedMD5, final String actualMD5) { - System.out.printf("##### Test %s:%s is going to fail #####%n", testClassName, testName); - String pathToExpectedMD5File = getMD5FilePath(expectedMD5, "[No DB file found]"); - String pathToFileMD5File = getMD5FilePath(actualMD5, "[No DB file found]"); - BaseTest.log(String.format("expected %s", expectedMD5)); - BaseTest.log(String.format("calculated %s", actualMD5)); - BaseTest.log(String.format("diff %s %s", pathToExpectedMD5File, pathToFileMD5File)); - - md5MismatchStream.printf("%s\t%s\t%s%n", expectedMD5, actualMD5, testName); - md5MismatchStream.flush(); - - // inline differences - String diffEngineOutput = ""; - final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - final PrintStream ps = new PrintStream(baos); - DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE, false); - boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params); - if ( success ) { - diffEngineOutput = baos.toString(); - BaseTest.log(diffEngineOutput); - System.out.printf("Note that the above list is not comprehensive. At most 20 lines of output, and 10 specific differences will be listed. 
Please use -T DiffObjects -R " + BaseTest.publicTestDir + "exampleFASTA.fasta -m %s -t %s to explore the differences more freely%n", - pathToExpectedMD5File, pathToFileMD5File); - } - ps.close(); - - return diffEngineOutput; - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/SampleUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/SampleUtilsUnitTest.java deleted file mode 100644 index d11c4bf57..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/SampleUtilsUnitTest.java +++ /dev/null @@ -1,52 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils; - -import org.apache.commons.io.FileUtils; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.io.IOUtils; -import org.testng.Assert; -import org.broadinstitute.gatk.utils.BaseTest; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -/** - * Testing framework for sample utilities class. - * - * @author gauthier - */ - -public class SampleUtilsUnitTest extends BaseTest { - @Test(expectedExceptions=UserException.class) - public void testBadSampleFiles() throws Exception { - Set sampleFiles = new HashSet(0); - sampleFiles.add(new File("fileNotHere.samples")); - Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtilsUnitTest.java deleted file mode 100644 index 6ccb0c976..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtilsUnitTest.java +++ /dev/null @@ -1,241 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils; - -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMSequenceRecord; -import org.apache.log4j.Logger; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import static org.broadinstitute.gatk.utils.SequenceDictionaryUtils.*; -import static org.broadinstitute.gatk.utils.SequenceDictionaryUtils.SequenceDictionaryCompatibility.*; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -public class SequenceDictionaryUtilsUnitTest extends BaseTest { - - private static Logger logger = Logger.getLogger(SequenceDictionaryUtilsUnitTest.class); - - - @DataProvider( name = "SequenceDictionaryDataProvider" ) - public Object[][] generateSequenceDictionaryTestData() { - final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571); - final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309); - final SAMSequenceRecord CHR_NONSTANDARD2 = new SAMSequenceRecord("NonStandard2", 8675308); - - final Class NO_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - final Class UNEQUAL_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - final Class 
NON_CANONICAL_HUMAN_ORDER_EXCEPTION = UserException.LexicographicallySortedSequenceDictionary.class; - final Class OUT_OF_ORDER_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - final Class DIFFERENT_INDICES_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; - - final List hg19Sequences = Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR10_HG19); - final GenomeLocParser hg19GenomeLocParser = new GenomeLocParser(new SAMSequenceDictionary(hg19Sequences)); - final List hg19AllContigsIntervals = Arrays.asList(hg19GenomeLocParser.createGenomeLoc("chrM", 0, 1), - hg19GenomeLocParser.createGenomeLoc("chr1", 0, 1), - hg19GenomeLocParser.createGenomeLoc("chr2", 0, 1), - hg19GenomeLocParser.createGenomeLoc("chr10", 0, 1)); - final List hg19PartialContigsIntervals = Arrays.asList(hg19GenomeLocParser.createGenomeLoc("chrM", 0, 1), - hg19GenomeLocParser.createGenomeLoc("chr1", 0, 1)); - final GenomeLocSortedSet hg19AllContigsIntervalSet = new GenomeLocSortedSet(hg19GenomeLocParser, hg19AllContigsIntervals); - final GenomeLocSortedSet hg19PartialContigsIntervalSet = new GenomeLocSortedSet(hg19GenomeLocParser, hg19PartialContigsIntervals); - - return new Object[][] { - // Identical dictionaries: - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), null, IDENTICAL, null, false, null }, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, IDENTICAL, null, false, null }, - { Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), null, IDENTICAL, null, false, null }, - { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), null, IDENTICAL, null, false, null }, - - // Dictionaries with a common subset: - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), null, COMMON_SUBSET, null, false, null }, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null }, - { 
Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), null, COMMON_SUBSET, null, false, null }, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), null, COMMON_SUBSET, null, false, null }, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null }, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null }, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), null, COMMON_SUBSET, null, false, null }, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null }, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), null, COMMON_SUBSET, null, false, null }, - { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null }, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null }, - - // Dictionaries with no common contigs: - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_B37, CHR2_B37, 
CHR10_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, - - // Dictionaries with unequal common contigs: - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, - { Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, - { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, - { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, - - // One or both dictionaries in non-canonical human order: - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, - { Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, - { 
Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, - { Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, - { Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, - - // Dictionaries with a common subset, but different relative ordering within that subset: - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null }, - { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null }, - { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null }, - { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null }, - { Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null }, - - - // Dictionaries with a common subset in the same relative order, but with different indices. 
- // This will only throw an exception during validation if isReadsToReferenceComparison is true, - // and there are intervals overlapping the misindexed contigs: - - // These have isReadsToReferenceComparison == true and overlapping intervals, so we expect an exception: - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, - { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, - - // These have isReadsToReferenceComparison == true but no overlapping intervals, so we don't expect an exception: - { Arrays.asList(CHR2_HG19, CHR10_HG19), Arrays.asList(CHR10_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet }, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, 
CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet }, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet }, - - // These have isReadsToReferenceComparison == false, so we don't expect an exception: - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), null, DIFFERENT_INDICES, null, false, hg19AllContigsIntervalSet }, - { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), null, DIFFERENT_INDICES, null, false, hg19AllContigsIntervalSet }, - - - // Tests for validation exclusions. Note that errors resulting from NO_COMMON_CONTIGs cannot be suppressed - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), ValidationExclusion.TYPE.ALL, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, UNEQUAL_COMMON_CONTIGS, null, false, null }, - { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), ValidationExclusion.TYPE.ALL, UNEQUAL_COMMON_CONTIGS, null, false, null }, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, NON_CANONICAL_HUMAN_ORDER, null, false, null }, - { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), ValidationExclusion.TYPE.ALL, NON_CANONICAL_HUMAN_ORDER, null, false, null }, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, OUT_OF_ORDER, null, false, null 
}, - { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), ValidationExclusion.TYPE.ALL, OUT_OF_ORDER, null, false, null }, - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, DIFFERENT_INDICES, null, true, hg19AllContigsIntervalSet }, - { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), ValidationExclusion.TYPE.ALL, DIFFERENT_INDICES, null, true, hg19AllContigsIntervalSet } - }; - } - - @Test( dataProvider = "SequenceDictionaryDataProvider" ) - public void testSequenceDictionaryValidation( final List firstDictionaryContigs, - final List secondDictionaryContigs, - final ValidationExclusion.TYPE validationExclusions, - final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, - final Class expectedExceptionUponValidation, - final boolean isReadsToReferenceComparison, - final GenomeLocSortedSet intervals ) { - - final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); - final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); - final String testDescription = String.format("First dictionary: %s Second dictionary: %s Validation exclusions: %s", - SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), - SequenceDictionaryUtils.getDictionaryAsString(secondDictionary), - validationExclusions); - - Exception exceptionThrown = null; - try { - SequenceDictionaryUtils.validateDictionaries(logger, - validationExclusions, - "firstDictionary", - firstDictionary, - "secondDictionary", - secondDictionary, - isReadsToReferenceComparison, - intervals); - } - catch ( Exception e ) { - exceptionThrown = e; - } - - if ( expectedExceptionUponValidation != null ) { - Assert.assertTrue(exceptionThrown != null && expectedExceptionUponValidation.isInstance(exceptionThrown), - String.format("Expected exception %s but saw %s instead. 
%s", - expectedExceptionUponValidation.getSimpleName(), - exceptionThrown == null ? "no exception" : exceptionThrown.getClass().getSimpleName(), - testDescription)); - } - else { - Assert.assertTrue(exceptionThrown == null, - String.format("Expected no exception but saw exception %s instead. %s", - exceptionThrown != null ? exceptionThrown.getClass().getSimpleName() : "none", - testDescription)); - } - } - - @Test( dataProvider = "SequenceDictionaryDataProvider" ) - public void testSequenceDictionaryComparison( final List firstDictionaryContigs, - final List secondDictionaryContigs, - final ValidationExclusion.TYPE validationExclusions, - final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, - final Class expectedExceptionUponValidation, - final boolean isReadsToReferenceComparison, - final GenomeLocSortedSet intervals ) { - - final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); - final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); - final String testDescription = String.format("First dictionary: %s Second dictionary: %s", - SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), - SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); - - final SequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility = - SequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary); - - Assert.assertTrue(reportedCompatibility == dictionaryCompatibility, - String.format("Dictionary comparison should have returned %s but instead returned %s. 
%s", - dictionaryCompatibility, reportedCompatibility, testDescription)); - } - - private SAMSequenceDictionary createSequenceDictionary( final List contigs ) { - final List clonedContigs = new ArrayList(contigs.size()); - - // Clone the individual SAMSequenceRecords to avoid contig-index issues with shared objects - // across multiple dictionaries in tests - for ( SAMSequenceRecord contig : contigs ) { - clonedContigs.add(contig.clone()); - } - - return new SAMSequenceDictionary(clonedContigs); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/UtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/UtilsUnitTest.java deleted file mode 100644 index a303f2c8b..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/UtilsUnitTest.java +++ /dev/null @@ -1,363 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils; - -import org.apache.commons.io.FileUtils; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.io.IOUtils; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -/** - * Testing framework for general purpose utilities class. - * - * @author hanna - * @version 0.1 - */ - -public class UtilsUnitTest extends BaseTest { - @Test - public void testAppend() { - for ( int leftSize : Arrays.asList(0, 1, 2, 3) ) { - for ( final int rightSize : Arrays.asList(0, 1, 2) ) { - final List left = new LinkedList(); - for ( int i = 0; i < leftSize; i++ ) left.add(i); - final List total = new LinkedList(); - for ( int i = 0; i < leftSize + rightSize; i++ ) total.add(i); - - if ( rightSize == 0 ) - Assert.assertEquals(Utils.append(left), total); - if ( rightSize == 1 ) - Assert.assertEquals(Utils.append(left, leftSize), total); - if ( rightSize == 2 ) - Assert.assertEquals(Utils.append(left, leftSize, leftSize + 1), total); - } - } - - } - - @Test - public void testDupStringNoChars() { - String duped = Utils.dupString('a',0); - Assert.assertEquals(duped.length(), 0, "dupString did not produce zero-length string"); - } - - @Test - public void testDupStringOneChar() { - String duped = Utils.dupString('b',1); - Assert.assertEquals(duped.length(), 1, "dupString did not produce single character string"); - Assert.assertEquals(duped.charAt(0), 'b', "dupString character was incorrect"); - } - - @Test - public void testXor() { - Assert.assertEquals(Utils.xor(false, false), false, "xor F F failed"); - Assert.assertEquals(Utils.xor(false, 
true), true, "xor F T failed"); - Assert.assertEquals(Utils.xor(true, false), true, "xor T F failed"); - Assert.assertEquals(Utils.xor(true, true), false, "xor T T failed"); - } - - @Test - public void testDupStringMultiChar() { - String duped = Utils.dupString('c',5); - Assert.assertEquals(duped.length(), 5, "dupString did not produce five character string"); - Assert.assertEquals(duped,"ccccc","dupString string was incorrect"); - } - - @Test - public void testJoinMap() { - Map map = new LinkedHashMap(); - map.put("one",1); - map.put("two",2); - String joined = Utils.joinMap("-",";",map); - Assert.assertTrue("one-1;two-2".equals(joined)); - } - - @Test - public void testJoinMapLargerSet() { - Map map = new LinkedHashMap(); - map.put("one",1); - map.put("two",2); - map.put("three",1); - map.put("four",2); - map.put("five",1); - map.put("six",2); - String joined = Utils.joinMap("-",";",map); - Assert.assertTrue("one-1;two-2;three-1;four-2;five-1;six-2".equals(joined)); - } - - @Test - public void testConcat() { - final String s1 = "A"; - final String s2 = "CC"; - final String s3 = "TTT"; - final String s4 = "GGGG"; - Assert.assertEquals(new String(Utils.concat()), ""); - Assert.assertEquals(new String(Utils.concat(s1.getBytes())), s1); - Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes())), s1 + s2); - Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes())), s1 + s2 + s3); - Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes(), s4.getBytes())), s1 + s2 + s3 + s4); - } - - @Test - public void testEscapeExpressions() { - String[] expected, actual; - - expected = new String[] {"one", "two", "three"}; - actual = Utils.escapeExpressions("one two three"); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions(" one two three"); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions("one two three "); - Assert.assertEquals(actual, expected); 
- actual = Utils.escapeExpressions(" one two three "); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions(" one two three "); - Assert.assertEquals(actual, expected); - - expected = new String[] {"one", "two", "three four", "five", "six"}; - actual = Utils.escapeExpressions("one two 'three four' five six"); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions(" one two 'three four' five six"); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions("one two 'three four' five six "); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions(" one two 'three four' five six "); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions(" one two 'three four' five six "); - Assert.assertEquals(actual, expected); - - expected = new String[] {"one two", "three", "four"}; - actual = Utils.escapeExpressions("'one two' three four"); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions(" 'one two' three four"); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions("'one two' three four "); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions(" 'one two' three four "); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions(" 'one two' three four "); - Assert.assertEquals(actual, expected); - - expected = new String[] {"one", "two", "three four"}; - actual = Utils.escapeExpressions("one two 'three four'"); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions(" one two 'three four'"); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions("one two 'three four' "); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions(" one two 'three four' "); - Assert.assertEquals(actual, expected); - actual = Utils.escapeExpressions(" one two 'three four' "); - Assert.assertEquals(actual, expected); - } - - @Test(dataProvider = 
"asIntegerListData") - public void testAsIntegerList(final int[] values) { - if (values == null) { - try { - Utils.asList((int[]) null); - Assert.fail("Should have thrown an exception"); - } catch (final IllegalArgumentException ex) { - // good. - } - } else { - final Random rdn = GenomeAnalysisEngine.getRandomGenerator(); - final int[] valuesClone = values.clone(); - final List list = Utils.asList(valuesClone); - Assert.assertNotNull(list); - Assert.assertEquals(list.size(),values.length); - for (int i = 0; i < values.length; i++) - Assert.assertEquals((int) list.get(i),values[i]); - for (int i = 0; i < values.length; i++) - valuesClone[rdn.nextInt(values.length)] = rdn.nextInt(1000); - for (int i = 0; i < values.length; i++) - Assert.assertEquals((int) list.get(i),valuesClone[i]); - } - } - - @Test(dataProvider = "asDoubleListData") - public void testAsDoubleList(final double[] values) { - if (values == null) { - try { - Utils.asList((int[]) null); - Assert.fail("Should have thrown an exception"); - } catch (final IllegalArgumentException ex) { - // good. 
- } - } else { - final Random rdn = GenomeAnalysisEngine.getRandomGenerator(); - final double[] valuesClone = values.clone(); - final List list = Utils.asList(valuesClone); - Assert.assertNotNull(list); - Assert.assertEquals(list.size(),values.length); - for (int i = 0; i < values.length; i++) - Assert.assertEquals((double) list.get(i),values[i]); - for (int i = 0; i < values.length; i++) - valuesClone[rdn.nextInt(values.length)] = rdn.nextDouble() * 1000; - for (int i = 0; i < values.length; i++) - Assert.assertEquals((double) list.get(i),valuesClone[i]); - } - } - - @Test - public void testCalcMD5() throws Exception { - final File source = new File(publicTestDir + "exampleFASTA.fasta"); - final String sourceMD5 = "36880691cf9e4178216f7b52e8d85fbe"; - - final byte[] sourceBytes = IOUtils.readFileIntoByteArray(source); - Assert.assertEquals(Utils.calcMD5(sourceBytes), sourceMD5); - - final String sourceString = FileUtils.readFileToString(source); - Assert.assertEquals(Utils.calcMD5(sourceString), sourceMD5); - } - - @Test - public void testLongestCommonOps() { - for ( int prefixLen = 0; prefixLen < 20; prefixLen++ ) { - for ( int extraSeq1Len = 0; extraSeq1Len < 10; extraSeq1Len++ ) { - for ( int extraSeq2Len = 0; extraSeq2Len < 10; extraSeq2Len++ ) { - for ( int max = 0; max < 50; max++ ) { - final String prefix = Utils.dupString("A", prefixLen); - final int expected = Math.min(prefixLen, max); - - { - final String seq1 = prefix + Utils.dupString("C", extraSeq1Len); - final String seq2 = prefix + Utils.dupString("G", extraSeq1Len); - Assert.assertEquals(Utils.longestCommonPrefix(seq1.getBytes(), seq2.getBytes(), max), expected, "LongestCommonPrefix failed: seq1 " + seq1 + " seq2 " + seq2 + " max " + max); - } - - { - final String seq1 = Utils.dupString("C", extraSeq1Len) + prefix; - final String seq2 = Utils.dupString("G", extraSeq1Len) + prefix; - Assert.assertEquals(Utils.longestCommonSuffix(seq1.getBytes(), seq2.getBytes(), max), expected, "longestCommonSuffix 
failed: seq1 " + seq1 + " seq2 " + seq2 + " max " + max); - } - } - } - } - } - } - - @DataProvider(name = "trim") - public Object[][] createTrimTestData() { - List tests = new ArrayList(); - - final String s = "AAAA"; - for ( int front = 0; front < s.length(); front++ ) { - for ( int back = 0; back < s.length(); back++ ) { - if ( front + back <= s.length() ) - tests.add(new Object[]{s, front, back}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "trim", enabled = true) - public void testTrim(final String s, final int frontTrim, final int backTrim) { - Assert.assertEquals(s.length() - frontTrim - backTrim, Utils.trimArray(s.getBytes(), frontTrim, backTrim).length); - } - - @Test(dataProvider = "equalRangeData", enabled = true) - public void testEqualRange(final byte[] array1, final byte[] array2, final int offset1, final int offset2, final int length, final boolean expected) { - Assert.assertEquals(Utils.equalRange(array1,offset1,array2,offset2,length),expected); - Assert.assertTrue(Utils.equalRange(array1,offset1,array1,offset1,length)); - Assert.assertTrue(Utils.equalRange(array2,offset2,array2,offset2,length)); - - } - - @DataProvider(name = "equalRangeData") - public Object[][] equalRangeData() { - return new Object[][] { - new Object[] { new byte[0] , new byte[0], 0, 0, 0, true}, - new Object[] { "ABCF".getBytes(), "BC".getBytes(), 1,0,2, true }, - new Object[] { "ABCF".getBytes(), "".getBytes(), 1,0,0, true }, - new Object[] { "ABCF".getBytes(), "ACBF".getBytes(), 0,0, 4, false} - }; - - } - - @Test(dataProvider = "skimArrayData") - public void testSkimArray(final String original, final String remove) { - final StringBuilder resultBuilder = new StringBuilder(); - final boolean[] removeBoolean = new boolean[remove.length()]; - for (int i = 0; i < original.length(); i++) - if (remove.charAt(i) == '1') { - resultBuilder.append(original.charAt(i)); - removeBoolean[i] = false; - } else - removeBoolean[i] = true; - - final String 
expected = resultBuilder.toString(); - final byte[] resultBytes = Utils.skimArray(original.getBytes(),removeBoolean); - final String resultString = new String(resultBytes); - Assert.assertEquals(resultString,expected); - } - - @DataProvider(name = "skimArrayData") - public Object[][] skimArrayData() { - return new Object[][] { - {"romeo+juliette" , "11111111111111" }, - {"romeo+juliette" , "11111011111111" }, - {"romeo+juliette" , "00000011111111" }, - {"romeo+juliette" , "11111100000000" }, - {"romeo+juliette" , "11111011111111" }, - {"romeo+juliette" , "01111010000001" }, - {"romeo+juliette" , "01100110000110" }, - {"romeo+juliette" , "10101010101010" }, - {"romeo+juliette" , "01010101010101" }, - {"romeo+juliette" , "01111010111001" }, - }; - } - - - @DataProvider(name = "asIntegerListData") - public Object[][] asIntegerListData() { - return new Object[][] { - { null }, - {new int[0]}, - {new int[]{1, 2, 3, 4, 5}}, - {new int[]{2}}, - {new int[]{3,4}} - }; - } - - @DataProvider(name = "asDoubleListData") - public Object[][] asDoubleListData() { - return new Object[][] { - { null }, - {new double[0]}, - {new double[]{1, 2, 3, 4, 5}}, - {new double[]{2}}, - {new double[]{3,4}}, - {new double[]{Double.NaN, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY}} - }; - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfileUnitTest.java deleted file mode 100644 index 2087d9a0c..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfileUnitTest.java +++ /dev/null @@ -1,339 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* 
restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.activeregion; - - -// the imports for unit testing. - - -import htsjdk.samtools.reference.ReferenceSequenceFile; -import org.apache.commons.lang.ArrayUtils; -import htsjdk.tribble.readers.LineIterator; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.MathUtils; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.gatk.utils.variant.GATKVCFUtils; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.vcf.VCFCodec; -import htsjdk.variant.vcf.VCFHeader; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - - -public 
class BandPassActivityProfileUnitTest extends BaseTest { - private final static boolean DEBUG = false; - private GenomeLocParser genomeLocParser; - - private final static int MAX_PROB_PROPAGATION_DISTANCE = 50; - private final static double ACTIVE_PROB_THRESHOLD= 0.002; - - @BeforeClass - public void init() throws FileNotFoundException { - // sequence - ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - genomeLocParser = new GenomeLocParser(seq); - } - - @DataProvider(name = "BandPassBasicTest") - public Object[][] makeBandPassTest() { - final List tests = new LinkedList(); - - for ( int start : Arrays.asList(1, 10, 100, 1000) ) { - for ( boolean precedingIsActive : Arrays.asList(true, false) ) { - for ( int precedingSites: Arrays.asList(0, 1, 10, 100) ) { - for ( int bandPassSize : Arrays.asList(0, 1, 10, 100) ) { - for ( double sigma : Arrays.asList(1.0, 2.0, BandPassActivityProfile.DEFAULT_SIGMA) ) { -// for ( int start : Arrays.asList(10) ) { -// for ( boolean precedingIsActive : Arrays.asList(false) ) { -// for ( int precedingSites: Arrays.asList(0) ) { -// for ( int bandPassSize : Arrays.asList(1) ) { - tests.add(new Object[]{ start, precedingIsActive, precedingSites, bandPassSize, sigma }); - } - } - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = ! 
DEBUG, dataProvider = "BandPassBasicTest") - public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize, final double sigma) { - final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, bandPassSize, sigma, false); - - final int expectedBandSize = bandPassSize * 2 + 1; - Assert.assertEquals(profile.getFilteredSize(), bandPassSize, "Wrong filter size"); - Assert.assertEquals(profile.getSigma(), sigma, "Wrong sigma"); - Assert.assertEquals(profile.getBandSize(), expectedBandSize, "Wrong expected band size"); - - final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); - final double precedingProb = precedingIsActive ? 1.0 : 0.0; - for ( int i = 0; i < nPrecedingSites; i++ ) { - final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start); - final ActivityProfileState state = new ActivityProfileState(loc, precedingProb); - profile.add(state); - } - - final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, nPrecedingSites + start); - profile.add(new ActivityProfileState(nextLoc, 1.0)); - - if ( precedingIsActive == false && nPrecedingSites >= bandPassSize && bandPassSize < start ) { - // we have enough space that all probs fall on the genome - final double[] probs = profile.getProbabilitiesAsArray(); - Assert.assertEquals(MathUtils.sum(probs), 1.0 * (nPrecedingSites * precedingProb + 1), 1e-3, "Activity profile doesn't sum to number of non-zero prob states"); - } - } - - private double[] bandPassInOnePass(final BandPassActivityProfile profile, final double[] activeProbArray) { - final double[] bandPassProbArray = new double[activeProbArray.length]; - - // apply the band pass filter for activeProbArray into filteredProbArray - final double[] GaussianKernel = profile.getKernel(); - for( int iii = 0; iii < activeProbArray.length; iii++ ) { - final double[] kernel = 
ArrayUtils.subarray(GaussianKernel, Math.max(profile.getFilteredSize() - iii, 0), Math.min(GaussianKernel.length, profile.getFilteredSize() + activeProbArray.length - iii)); - final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - profile.getFilteredSize()), Math.min(activeProbArray.length,iii + profile.getFilteredSize() + 1)); - bandPassProbArray[iii] = dotProduct(activeProbSubArray, kernel); - } - - return bandPassProbArray; - } - - public static double dotProduct(double[] v1, double[] v2) { - Assert.assertEquals(v1.length,v2.length,"Array lengths do not mach in dotProduct"); - double result = 0.0; - for (int k = 0; k < v1.length; k++) - result += v1[k] * v2[k]; - - return result; - } - - @DataProvider(name = "BandPassComposition") - public Object[][] makeBandPassComposition() { - final List tests = new LinkedList(); - - for ( int bandPassSize : Arrays.asList(0, 1, 10, 100, BandPassActivityProfile.MAX_FILTER_SIZE) ) { - for ( int integrationLength : Arrays.asList(1, 10, 100, 1000) ) { - tests.add(new Object[]{ bandPassSize, integrationLength }); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test( enabled = ! 
DEBUG, dataProvider = "BandPassComposition") - public void testBandPassComposition(final int bandPassSize, final int integrationLength) { - final int start = 1; - final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, - ACTIVE_PROB_THRESHOLD, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA); - final double[] rawActiveProbs = new double[integrationLength + bandPassSize * 2]; - - // add a buffer so that we can get all of the band pass values - final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); - int pos = start; - int rawProbsOffset = 0; - for ( int i = 0; i < bandPassSize; i++ ) { - final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos++); - final ActivityProfileState state = new ActivityProfileState(loc, 0.0); - profile.add(state); - rawActiveProbs[rawProbsOffset++] = 0.0; - rawActiveProbs[rawActiveProbs.length - rawProbsOffset] = 0.0; - } - - for ( int i = 0; i < integrationLength; i++ ) { - final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, pos++); - profile.add(new ActivityProfileState(nextLoc, 1.0)); - rawActiveProbs[rawProbsOffset++] = 1.0; - - for ( int j = 0; j < profile.size(); j++ ) { - Assert.assertTrue(profile.getStateList().get(j).isActiveProb >= 0.0, "State probability < 0 at " + j); - Assert.assertTrue(profile.getStateList().get(j).isActiveProb <= 1.0 + 1e-3, "State probability > 1 at " + j); - } - } - - final double[] expectedProbs = bandPassInOnePass(profile, rawActiveProbs); - for ( int j = 0; j < profile.size(); j++ ) { - Assert.assertEquals(profile.getStateList().get(j).isActiveProb, expectedProbs[j], "State probability not expected at " + j); - } - } - - // ------------------------------------------------------------------------------------ - // - // Code to test the creation of the kernels - // - // ------------------------------------------------------------------------------------ - - /** - - kernel <- 
function(sd, pThres) { - raw = dnorm(-80:81, mean=0, sd=sd) - norm = raw / sum(raw) - bad = norm < pThres - paste(norm[! bad], collapse=", ") - } - - print(kernel(0.01, 1e-5)) - print(kernel(1, 1e-5)) - print(kernel(5, 1e-5)) - print(kernel(17, 1e-5)) - - * @return - */ - - @DataProvider(name = "KernelCreation") - public Object[][] makeKernelCreation() { - final List tests = new LinkedList(); - - tests.add(new Object[]{ 0.01, 1000, new double[]{1.0}}); - tests.add(new Object[]{ 1.0, 1000, new double[]{0.0001338302, 0.004431848, 0.053990966, 0.241970723, 0.398942278, 0.241970723, 0.053990966, 0.004431848, 0.0001338302}}); - tests.add(new Object[]{ 1.0, 0, new double[]{1.0}}); - tests.add(new Object[]{ 1.0, 1, new double[]{0.2740686, 0.4518628, 0.2740686}}); - tests.add(new Object[]{ 1.0, 2, new double[]{0.05448868, 0.24420134, 0.40261995, 0.24420134, 0.05448868}}); - tests.add(new Object[]{ 1.0, 1000, new double[]{0.0001338302, 0.004431848, 0.053990966, 0.241970723, 0.398942278, 0.241970723, 0.053990966, 0.004431848, 0.0001338302}}); - tests.add(new Object[]{ 5.0, 1000, new double[]{1.1788613551308e-05, 2.67660451529771e-05, 5.83893851582921e-05, 0.000122380386022754, 0.000246443833694604, 0.000476817640292968, 0.000886369682387602, 0.00158309031659599, 0.00271659384673712, 0.00447890605896858, 0.00709491856924629, 0.0107981933026376, 0.0157900316601788, 0.0221841669358911, 0.029945493127149, 0.0388372109966426, 0.0483941449038287, 0.0579383105522965, 0.0666449205783599, 0.0736540280606647, 0.0782085387950912, 0.0797884560802865, 0.0782085387950912, 0.0736540280606647, 0.0666449205783599, 0.0579383105522965, 0.0483941449038287, 0.0388372109966426, 0.029945493127149, 0.0221841669358911, 0.0157900316601788, 0.0107981933026376, 0.00709491856924629, 0.00447890605896858, 0.00271659384673712, 0.00158309031659599, 0.000886369682387602, 0.000476817640292968, 0.000246443833694604, 0.000122380386022754, 5.83893851582921e-05, 2.67660451529771e-05, 1.1788613551308e-05}}); - 
tests.add(new Object[]{17.0, 1000, new double[]{1.25162575710745e-05, 1.57001772728555e-05, 1.96260034693739e-05, 2.44487374842009e-05, 3.03513668801384e-05, 3.75489089511911e-05, 4.62928204154855e-05, 5.68757597480354e-05, 6.96366758708924e-05, 8.49661819944029e-05, 0.000103312156275406, 0.000125185491708561, 0.000151165896477646, 0.000181907623161359, 0.000218144981137171, 0.000260697461819069, 0.000310474281706066, 0.000368478124457557, 0.000435807841336874, 0.00051365985048857, 0.000603327960854364, 0.000706201337376934, 0.000823760321812988, 0.000957569829285965, 0.00110927005589186, 0.00128056425833231, 0.00147320340358764, 0.00168896753568649, 0.00192964376796036, 0.00219700088266432, 0.00249276060490197, 0.00281856571330067, 0.00317594525418154, 0.00356627723683793, 0.00399074930220799, 0.00445031797242299, 0.00494566720070898, 0.00547716704583487, 0.00604483338842317, 0.00664828968356621, 0.00728673180099395, 0.00795889703644795, 0.00866303838230695, 0.00939690511889675, 0.0101577307281371, 0.010942229037054, 0.0117465993701676, 0.0125665413280325, 0.0133972796167302, 0.0142335991336574, 0.0150698902735454, 0.0159002041614507, 0.0167183172536454, 0.0175178044808441, 0.0182921198494897, 0.0190346831745763, 0.0197389714002676, 0.020398612780527, 0.0210074820484496, 0.0215597946062309, 0.0220501977225941, 0.022473856734247, 0.0228265343139947, 0.0231046609899767, 0.0233053952756892, 0.0234266719946158, 0.0234672376502799, 0.0234266719946158, 0.0233053952756892, 0.0231046609899767, 0.0228265343139947, 0.022473856734247, 0.0220501977225941, 0.0215597946062309, 0.0210074820484496, 0.020398612780527, 0.0197389714002676, 0.0190346831745763, 0.0182921198494897, 0.0175178044808441, 0.0167183172536454, 0.0159002041614507, 0.0150698902735454, 0.0142335991336574, 0.0133972796167302, 0.0125665413280325, 0.0117465993701676, 0.010942229037054, 0.0101577307281371, 0.00939690511889675, 0.00866303838230695, 0.00795889703644795, 0.00728673180099395, 0.00664828968356621, 
0.00604483338842317, 0.00547716704583487, 0.00494566720070898, 0.00445031797242299, 0.00399074930220799, 0.00356627723683793, 0.00317594525418154, 0.00281856571330067, 0.00249276060490197, 0.00219700088266432, 0.00192964376796036, 0.00168896753568649, 0.00147320340358764, 0.00128056425833231, 0.00110927005589186, 0.000957569829285965, 0.000823760321812988, 0.000706201337376934, 0.000603327960854364, 0.00051365985048857, 0.000435807841336874, 0.000368478124457557, 0.000310474281706066, 0.000260697461819069, 0.000218144981137171, 0.000181907623161359, 0.000151165896477646, 0.000125185491708561, 0.000103312156275406, 8.49661819944029e-05, 6.96366758708924e-05, 5.68757597480354e-05, 4.62928204154855e-05, 3.75489089511911e-05, 3.03513668801384e-05, 2.44487374842009e-05, 1.96260034693739e-05, 1.57001772728555e-05, 1.25162575710745e-05}}); - - return tests.toArray(new Object[][]{}); - } - - @Test( enabled = ! DEBUG, dataProvider = "KernelCreation") - public void testKernelCreation(final double sigma, final int maxSize, final double[] expectedKernel) { - final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, - maxSize, sigma, true); - - final double[] kernel = profile.getKernel(); - Assert.assertEquals(kernel.length, expectedKernel.length); - for ( int i = 0; i < kernel.length; i++ ) - Assert.assertEquals(kernel[i], expectedKernel[i], 1e-3, "Kernels not equal at " + i); - } - - // ------------------------------------------------------------------------------------ - // - // Large-scale test, reading in 1000G Phase I chr20 calls and making sure that - // the regions returned are the same if you run on the entire profile vs. 
doing it - // incremental - // - // ------------------------------------------------------------------------------------ - - @DataProvider(name = "VCFProfile") - public Object[][] makeVCFProfile() { - final List tests = new LinkedList(); - - //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 61000}); - //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 100000}); - //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000}); - tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000}); - tests.add(new Object[]{ privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf", "20", 1, 1000000}); - - return tests.toArray(new Object[][]{}); - } - - @Test( dataProvider = "VCFProfile") - public void testVCFProfile(final String path, final String contig, final int start, final int end) throws Exception { - final int extension = 50; - final int minRegionSize = 50; - final int maxRegionSize = 300; - - final File file = new File(path); - final VCFCodec codec = new VCFCodec(); - final Pair> reader = GATKVCFUtils.readAllVCs(file, codec); - - final List incRegions = new ArrayList(); - final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD); - final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD); - int pos = start; - for ( final VariantContext vc : reader.getSecond() ) { - if ( vc == null ) continue; - while ( pos < vc.getStart() ) { - final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos); - //logger.warn("Adding 0.0 at " + loc + " because vc.getStart is " + vc.getStart()); - incProfile.add(new 
ActivityProfileState(loc, 0.0)); - fullProfile.add(new ActivityProfileState(loc, 0.0)); - pos++; - } - if ( vc.getStart() >= start && vc.getEnd() <= end ) { - final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos); - //logger.warn("Adding 1.0 at " + loc); - ActivityProfileState.Type type = ActivityProfileState.Type.NONE; - Number value = null; - if ( vc.isBiallelic() && vc.isIndel() ) { - type = ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS; - value = Math.abs(vc.getIndelLengths().get(0)); - } - final ActivityProfileState state = new ActivityProfileState(loc, 1.0, type, value); - incProfile.add(state); - fullProfile.add(state); - pos++; - } - - incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, false)); - - if ( vc.getStart() > end ) - break; - } - - incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true)); - - final List fullRegions = fullProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true); - assertGoodRegions(fullRegions, start, end, maxRegionSize); - assertGoodRegions(incRegions, start, end, maxRegionSize); - - Assert.assertEquals(incRegions.size(), fullRegions.size(), "incremental and full region sizes aren't the same"); - for ( int i = 0; i < fullRegions.size(); i++ ) { - final ActiveRegion incRegion = incRegions.get(i); - final ActiveRegion fullRegion = fullRegions.get(i); - Assert.assertTrue(incRegion.equalExceptReads(fullRegion), "Full and incremental regions are not equal: full = " + fullRegion + " inc = " + incRegion); - } - } - - private void assertGoodRegions(final List regions, final int start, final int end, final int maxRegionSize) { - int lastPosSeen = start - 1; - for ( int regionI = 0; regionI < regions.size(); regionI++ ) { - final ActiveRegion region = regions.get(regionI); - Assert.assertEquals(region.getLocation().getStart(), lastPosSeen + 1, "discontinuous with previous region. 
lastPosSeen " + lastPosSeen + " but region is " + region); - Assert.assertTrue(region.getLocation().size() <= maxRegionSize, "Region is too big: " + region); - lastPosSeen = region.getLocation().getStop(); - - for ( final ActivityProfileState state : region.getSupportingStates() ) { - Assert.assertEquals(state.isActiveProb > ACTIVE_PROB_THRESHOLD, region.isActive(), - "Region is active=" + region.isActive() + " but contains a state " + state + " with prob " - + state.isActiveProb + " not within expected values given threshold for activity of " - + ACTIVE_PROB_THRESHOLD); - } - } - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/ArgumentTypeDescriptorUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/ArgumentTypeDescriptorUnitTest.java deleted file mode 100644 index 1dfffa359..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/ArgumentTypeDescriptorUnitTest.java +++ /dev/null @@ -1,233 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.commandline; - -import htsjdk.variant.variantcontext.VariantContext; -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import htsjdk.samtools.SAMFileWriter; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.io.stubs.*; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.io.PrintStream; -import java.util.Arrays; -import java.util.Collection; - - -public class ArgumentTypeDescriptorUnitTest extends BaseTest { - - //////////////////////////////////////////////////////////////////// - // This section tests the functionality of the @Output annotation // - //////////////////////////////////////////////////////////////////// - - private class ATDTestCommandLineProgram extends CommandLineProgram { - public int execute() { return 0; } - - @Override - public Collection getArgumentTypeDescriptors() { - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - return Arrays.asList( new SAMFileWriterArgumentTypeDescriptor(engine, System.out), - new OutputStreamArgumentTypeDescriptor(engine, System.out), - new VCFWriterArgumentTypeDescriptor(engine, System.out, null)); - } - - protected abstract class ATDTestOutputArgumentSource { - public abstract Object getOut(); - } - - protected class OutputRequiredSamArgumentSource extends ATDTestOutputArgumentSource { - @Output(shortName="o", doc="output file", 
required = true) - public SAMFileWriter out; - public Object getOut() { return out; } - } - - protected class OutputRequiredVcfArgumentSource extends ATDTestOutputArgumentSource { - @Output(shortName="o", doc="output file", required = true) - public VariantContextWriter out; - public Object getOut() { return out; } - } - - protected class OutputRequiredStreamArgumentSource extends ATDTestOutputArgumentSource { - @Output(shortName="o", doc="output file", required = true) - public PrintStream out; - public Object getOut() { return out; } - } - - protected class OutputNotRequiredNoDefaultSamArgumentSource extends ATDTestOutputArgumentSource { - @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) - public SAMFileWriter out; - public Object getOut() { return out; } - } - - protected class OutputNotRequiredNoDefaultVcfArgumentSource extends ATDTestOutputArgumentSource { - @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) - public VariantContextWriter out; - public Object getOut() { return out; } - } - - protected class OutputNotRequiredNoDefaultStreamArgumentSource extends ATDTestOutputArgumentSource { - @Output(shortName="o", doc="output file", required = false, defaultToStdout = false) - public PrintStream out; - public Object getOut() { return out; } - } - - protected class OutputNotRequiredSamArgumentSource extends ATDTestOutputArgumentSource { - @Output(shortName="o", doc="output file", required = false) - public SAMFileWriter out; - public Object getOut() { return out; } - } - - protected class OutputNotRequiredVcfArgumentSource extends ATDTestOutputArgumentSource { - @Output(shortName="o", doc="output file", required = false) - public VariantContextWriter out; - public Object getOut() { return out; } - } - - protected class OutputNotRequiredStreamArgumentSource extends ATDTestOutputArgumentSource { - @Output(shortName="o", doc="output file", required = false) - public PrintStream out; - public 
Object getOut() { return out; } - } - } - - @DataProvider(name = "OutputProvider") - public Object[][] OutputProvider() { - - ObjectArrayList tests = new ObjectArrayList(); - - final ATDTestCommandLineProgram clp = new ATDTestCommandLineProgram(); - - for ( final Object obj : Arrays.asList(clp.new OutputRequiredSamArgumentSource(), clp.new OutputRequiredVcfArgumentSource(), clp.new OutputRequiredStreamArgumentSource()) ) { - for ( final boolean provided : Arrays.asList(true, false) ) { - tests.add(new Object[]{obj, true, true, provided}); - } - } - - for ( final Object obj : Arrays.asList(clp.new OutputNotRequiredSamArgumentSource(), clp.new OutputNotRequiredVcfArgumentSource(), clp.new OutputNotRequiredStreamArgumentSource()) ) { - for ( final boolean provided : Arrays.asList(true, false) ) { - tests.add(new Object[]{obj, false, true, provided}); - } - } - - for ( final Object obj : Arrays.asList(clp.new OutputNotRequiredNoDefaultSamArgumentSource(), clp.new OutputNotRequiredNoDefaultVcfArgumentSource(), clp.new OutputNotRequiredNoDefaultStreamArgumentSource()) ) { - for ( final boolean provided : Arrays.asList(true, false) ) { - tests.add(new Object[]{obj, false, false, provided}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "OutputProvider") - public void testOutput(final ATDTestCommandLineProgram.ATDTestOutputArgumentSource argumentSource, final boolean required, final boolean hasDefault, final boolean provided) { - - final ParsingEngine parser = new ParsingEngine(new ATDTestCommandLineProgram()); - parser.addArgumentSource(argumentSource.getClass()); - parser.parse(provided ? 
new String[] {"out", "foo"} : new String[] {}); - - try { - parser.loadArgumentsIntoObject(argumentSource); - - if ( !provided && (required || !hasDefault) ) - Assert.assertEquals(argumentSource.getOut(), null); - else if ( !provided ) - Assert.assertNotEquals(argumentSource.getOut(), null); - else if ( argumentSource.getOut() == null || !(argumentSource.getOut() instanceof SAMFileWriterStub) ) // can't test this one case - Assert.assertEquals(!provided, outputIsStdout(argumentSource.getOut())); - - } catch (Exception e) { - throw new ReviewedGATKException(e.getMessage()); - } - } - - @Test - public void testRodBindingsCollection() { - - final ParsingEngine parser = new ParsingEngine(new ATDTestCommandLineProgram()); - - //A list file containing a single VCF - final File listFile = createTempListFile("oneVCF", privateTestDir + "empty.vcf"); - - try { - Object result = ArgumentTypeDescriptor.getRodBindingsCollection(listFile, - parser, - VariantContext.class, - "variant", - new Tags(), - "variantTest"); - if (!(result instanceof RodBindingCollection)) - throw new ReviewedGATKException("getRodBindingsCollection did not return a RodBindingCollection"); - RodBindingCollection rbc = (RodBindingCollection) result; - - Assert.assertEquals(rbc.getType(), VariantContext.class); - Assert.assertEquals(rbc.getRodBindings().size(), 1); - - } catch (IOException e) { - throw new ReviewedGATKException(e.getMessage(), e); - } - - //The same file, now with an extra blank line - final File listFileWithBlank = createTempListFile("oneVCFwithBlankLine", privateTestDir + "empty.vcf", ""); - try { - Object result = ArgumentTypeDescriptor.getRodBindingsCollection(listFileWithBlank, - parser, - VariantContext.class, - "variant", - new Tags(), - "variantTest"); - if (!(result instanceof RodBindingCollection)) - throw new ReviewedGATKException("getRodBindingsCollection did not return a RodBindingCollection"); - RodBindingCollection rbc = (RodBindingCollection) result; - - 
Assert.assertEquals(rbc.getType(), VariantContext.class); - Assert.assertEquals(rbc.getRodBindings().size(), 1); - - } catch (IOException e) { - throw new ReviewedGATKException(e.getMessage(), e); - } - } - - private static boolean outputIsStdout(final Object out) { - if ( out == null ) { - return false; - } else if ( out instanceof SAMFileWriterStub ) { - return ((SAMFileWriterStub)out).getOutputStream() != System.out; - } else if ( out instanceof VariantContextWriterStub ) { - return ((VariantContextWriterStub)out).getOutputStream() == System.out; - } else if ( out instanceof OutputStreamStub ) { - return ((OutputStreamStub)out).getOutputStream() == System.out; - } - return false; - } - -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/InvalidArgumentIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/InvalidArgumentIntegrationTest.java deleted file mode 100644 index 8ab2159d6..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/InvalidArgumentIntegrationTest.java +++ /dev/null @@ -1,66 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.commandline; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.utils.exceptions.UserException; - -import org.testng.annotations.Test; -import org.testng.annotations.DataProvider; - -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 8/31/12 - * Time: 11:03 AM - * To change this template use File | Settings | File Templates. - */ -public class InvalidArgumentIntegrationTest extends WalkerTest { - private static final String callsB36 = BaseTest.validationDataLocation + "lowpass.N3.chr1.raw.vcf"; - - private WalkerTest.WalkerTestSpec baseTest(String flag, String arg, Class exeption) { - return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 --variant:vcf " - + callsB36 + " -F POS,CHROM -R " - + b36KGReference + " -o %s " + flag + " " + arg, - 1, exeption); - - } - - @Test - public void testUnknownReadFilter() { - executeTest("UnknownReadFilter",baseTest("-rf","TestUnknownReadFilter", UserException.MalformedReadFilterException.class)); - } - - @Test - public void testMalformedWalkerArgs() { - executeTest("MalformedWalkerArgs", - new WalkerTest.WalkerTestSpec("-T UnknownWalkerName -M 10 --variant:vcf " - + callsB36 + " -F POS,CHROM -R " - + b36KGReference + " -o %s ", - 1, UserException.MalformedWalkerArgumentsException.class)); - } -} diff --git 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/LoggingIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/LoggingIntegrationTest.java deleted file mode 100644 index d690f681c..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/LoggingIntegrationTest.java +++ /dev/null @@ -1,117 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.commandline; - -import java.io.File; -import java.io.IOException; -import java.util.Arrays; - -import org.apache.commons.lang.StringUtils; -import org.apache.log4j.Level; - -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.MD5DB; -import org.broadinstitute.gatk.utils.MD5Mismatch; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.utils.runtime.*; - -public class LoggingIntegrationTest { - private final MD5DB md5db = new MD5DB(); - - private class LoggingTestProvider extends BaseTest.TestDataProvider { - - private final String baseCmdLine; - - private final Level logLevel; - private final String logFileStr; - public final File argumentOutputFile; - public final File pipedOutputFile; - - private LoggingTestProvider(final Level logLevel, final boolean explicitLogfile) throws IOException { - super(LoggingTestProvider.class); - - // TODO: a better command line that exercises log levels besides INFO - this.baseCmdLine = String.format("java -cp %s %s -T SelectVariants -R %s -V %s -L 1:1000000-2000000 --no_cmdline_in_header", - StringUtils.join(RuntimeUtils.getAbsoluteClassPaths(), File.pathSeparatorChar), - CommandLineGATK.class.getCanonicalName(), BaseTest.b37KGReference, BaseTest.b37_NA12878_OMNI); - - this.logLevel = logLevel; - this.logFileStr = explicitLogfile ? " -log " + BaseTest.createTempFile(logLevel.toString(), "log") : ""; - this.argumentOutputFile = BaseTest.createTempFile(logLevel.toString(), "vcf"); - this.pipedOutputFile = BaseTest.createTempFile(logLevel.toString(), "vcf"); - } - - public final String getCmdLine(boolean redirectStdout) { - String command = String.format("%s -l %s %s", baseCmdLine, logLevel, logFileStr); - return redirectStdout ? 
command : command + " -o " + argumentOutputFile; - } - - public String toString() { - return String.format("LoggingTestProvider logLevel=%s", logLevel); - } - } - - @DataProvider(name = "LoggingTest") - public Object[][] makeLoggingTestProvider() throws IOException { - for (Boolean explicitLogFile : Arrays.asList(true, false)) { - // TODO: enable other logging levels when tests for those exist - new LoggingTestProvider(Level.DEBUG, explicitLogFile); - } - - return LoggingTestProvider.getTests(LoggingTestProvider.class); - } - - /** - * test that using an output argument produces the same output as stdout - */ - @Test(dataProvider = "LoggingTest") - public void testStdoutEquivalence(final LoggingTestProvider cfg) throws IOException { - - ProcessController pc = ProcessController.getThreadLocal(); - - // output argument - - ProcessSettings ps = new ProcessSettings(cfg.getCmdLine(false).split("\\s+")); - pc.execAndCheck(ps); - String output_argument_md5 = md5db.calculateFileMD5(cfg.argumentOutputFile); - - // pipe to stdout - - ps = new ProcessSettings(cfg.getCmdLine(true).split("\\s+")); - ps.setStdoutSettings(new OutputStreamSettings(cfg.pipedOutputFile)); - pc.execAndCheck(ps); - - MD5DB.MD5Match result = md5db.testFileMD5("LoggingIntegrationTest", "LoggingIntegrationTest", cfg.pipedOutputFile, output_argument_md5, false); - if(result.failed) { - final MD5Mismatch failure = new MD5Mismatch(result.actualMD5, result.expectedMD5, result.diffEngineOutput); - Assert.fail(failure.toString()); - } - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/crypt/CryptUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/crypt/CryptUtilsUnitTest.java deleted file mode 100644 index c44bfdc03..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/crypt/CryptUtilsUnitTest.java +++ /dev/null @@ -1,199 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute 
-* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.crypt; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.testng.SkipException; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.security.Key; -import java.security.KeyPair; -import java.security.PrivateKey; -import java.security.PublicKey; -import java.util.Arrays; - -public class CryptUtilsUnitTest extends BaseTest { - - @Test - public void testGenerateValidKeyPairWithDefaultSettings() { - KeyPair keyPair = CryptUtils.generateKeyPair(); - Assert.assertTrue(CryptUtils.keysDecryptEachOther(keyPair.getPrivate(), keyPair.getPublic())); - } - - @DataProvider( name = "InvalidKeyPairSettings" ) - public Object[][] invalidKeyPairSettingsDataProvider() { - return new Object[][] { - { -1, CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM, CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM}, - { CryptUtils.DEFAULT_KEY_LENGTH, "Made-up algorithm", CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM}, - { CryptUtils.DEFAULT_KEY_LENGTH, CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM, "Made-up algorithm"} - }; - } - - @Test( dataProvider = "InvalidKeyPairSettings", expectedExceptions = ReviewedGATKException.class ) - public void testGenerateKeyPairWithInvalidSettings( int keyLength, String encryptionAlgorithm, String randomNumberGenerationAlgorithm ) { - KeyPair keyPair = CryptUtils.generateKeyPair(keyLength, encryptionAlgorithm, randomNumberGenerationAlgorithm); - } - - @Test - public void testGATKMasterKeyPairMutualDecryption() { - if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { - throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", - 
"testGATKMasterKeyPairMutualDecryption")); - } - - Assert.assertTrue(CryptUtils.keysDecryptEachOther(CryptUtils.loadGATKMasterPrivateKey(), CryptUtils.loadGATKMasterPublicKey())); - } - - @Test - public void testGATKMasterPrivateKeyWithDistributedPublicKeyMutualDecryption() { - if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { - throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", - "testGATKMasterPrivateKeyWithDistributedPublicKeyMutualDecryption")); - } - - Assert.assertTrue(CryptUtils.keysDecryptEachOther(CryptUtils.loadGATKMasterPrivateKey(), CryptUtils.loadGATKDistributedPublicKey())); - } - - @Test - public void testKeyPairWriteThenRead() { - KeyPair keyPair = CryptUtils.generateKeyPair(); - File privateKeyFile = createTempFile("testKeyPairWriteThenRead_private", "key"); - File publicKeyFile = createTempFile("testKeyPairWriteThenRead_public", "key"); - - CryptUtils.writeKeyPair(keyPair, privateKeyFile, publicKeyFile); - - assertKeysAreEqual(keyPair.getPrivate(), CryptUtils.readPrivateKey(privateKeyFile)); - assertKeysAreEqual(keyPair.getPublic(), CryptUtils.readPublicKey(publicKeyFile)); - } - - @Test - public void testPublicKeyWriteThenReadFromFile() { - File keyFile = createTempFile("testPublicKeyWriteThenReadFromFile", "key"); - PublicKey publicKey = CryptUtils.generateKeyPair().getPublic(); - - CryptUtils.writeKey(publicKey, keyFile); - - assertKeysAreEqual(publicKey, CryptUtils.readPublicKey(keyFile)); - } - - @Test - public void testPublicKeyWriteThenReadFromStream() throws IOException { - File keyFile = createTempFile("testPublicKeyWriteThenReadFromStream", "key"); - PublicKey publicKey = CryptUtils.generateKeyPair().getPublic(); - - CryptUtils.writeKey(publicKey, keyFile); - - assertKeysAreEqual(publicKey, CryptUtils.readPublicKey(new FileInputStream(keyFile))); - } - - @Test - public void testPrivateKeyWriteThenReadFromFile() { - File keyFile = 
createTempFile("testPrivateKeyWriteThenReadFromFile", "key"); - PrivateKey privateKey = CryptUtils.generateKeyPair().getPrivate(); - - CryptUtils.writeKey(privateKey, keyFile); - - assertKeysAreEqual(privateKey, CryptUtils.readPrivateKey(keyFile)); - } - - @Test - public void testPrivateKeyWriteThenReadFromStream() throws IOException { - File keyFile = createTempFile("testPrivateKeyWriteThenReadFromStream", "key"); - PrivateKey privateKey = CryptUtils.generateKeyPair().getPrivate(); - - CryptUtils.writeKey(privateKey, keyFile); - - assertKeysAreEqual(privateKey, CryptUtils.readPrivateKey(new FileInputStream(keyFile))); - } - - @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) - public void testReadNonExistentPublicKey() { - File nonExistentFile = new File("jdshgkdfhg.key"); - Assert.assertFalse(nonExistentFile.exists()); - - CryptUtils.readPublicKey(nonExistentFile); - } - - @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) - public void testReadNonExistentPrivateKey() { - File nonExistentFile = new File("jdshgkdfhg.key"); - Assert.assertFalse(nonExistentFile.exists()); - - CryptUtils.readPrivateKey(nonExistentFile); - } - - @Test - public void testDecodePublicKey() { - PublicKey originalKey = CryptUtils.generateKeyPair().getPublic(); - PublicKey decodedKey = CryptUtils.decodePublicKey(originalKey.getEncoded(), CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM); - assertKeysAreEqual(originalKey, decodedKey); - } - - @Test - public void testDecodePrivateKey() { - PrivateKey originalKey = CryptUtils.generateKeyPair().getPrivate(); - PrivateKey decodedKey = CryptUtils.decodePrivateKey(originalKey.getEncoded(), CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM); - assertKeysAreEqual(originalKey, decodedKey); - } - - @Test - public void testLoadGATKMasterPrivateKey() { - if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { - throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", 
- "testLoadGATKMasterPrivateKey")); - } - - PrivateKey gatkMasterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); - } - - @Test - public void testLoadGATKMasterPublicKey() { - PublicKey gatkMasterPublicKey = CryptUtils.loadGATKMasterPublicKey(); - } - - @Test - public void testLoadGATKDistributedPublicKey() { - PublicKey gatkDistributedPublicKey = CryptUtils.loadGATKDistributedPublicKey(); - } - - private void assertKeysAreEqual( Key originalKey, Key keyFromDisk ) { - Assert.assertTrue(Arrays.equals(originalKey.getEncoded(), keyFromDisk.getEncoded())); - Assert.assertEquals(originalKey.getAlgorithm(), keyFromDisk.getAlgorithm()); - Assert.assertEquals(originalKey.getFormat(), keyFromDisk.getFormat()); - } - - private boolean gatkPrivateKeyExistsButReadPermissionDenied() { - File gatkPrivateKey = new File(CryptUtils.GATK_MASTER_PRIVATE_KEY_FILE); - return gatkPrivateKey.exists() && ! gatkPrivateKey.canRead(); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/crypt/GATKKeyIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/crypt/GATKKeyIntegrationTest.java deleted file mode 100644 index 9cafd61a7..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/crypt/GATKKeyIntegrationTest.java +++ /dev/null @@ -1,157 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.crypt; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.broadinstitute.gatk.engine.phonehome.GATKRunReport; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class GATKKeyIntegrationTest extends WalkerTest { - - public static final String BASE_COMMAND = String.format("-T PrintReads -R %s -I %s -o %%s", - publicTestDir + "exampleFASTA.fasta", - publicTestDir + "exampleBAM.bam"); - public static final String MD5_UPON_SUCCESSFUL_RUN = "e7b4a5b62f9d4badef1cd07040011b2b"; - - - private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) { - String command = BASE_COMMAND + String.format(" %s %s", etArg, keyArg); - - WalkerTestSpec spec = expectedException != null ? 
- new WalkerTestSpec(command, 1, expectedException) : - new WalkerTestSpec(command, 1, Arrays.asList(md5)); - - spec.disableImplicitArgs(); // Turn off automatic inclusion of -et/-K args by WalkerTest - executeTest(testName, spec); - } - - @Test - public void testValidKeyNoET() { - runGATKKeyTest("testValidKeyNoET", - "-et " + GATKRunReport.PhoneHomeOption.NO_ET, - "-K " + keysDataLocation + "valid.key", - null, - MD5_UPON_SUCCESSFUL_RUN); - } - - @Test - public void testValidKeyETStdout() { - runGATKKeyTest("testValidKeyETStdout", - "-et " + GATKRunReport.PhoneHomeOption.STDOUT, - "-K " + keysDataLocation + "valid.key", - null, - MD5_UPON_SUCCESSFUL_RUN); - } - - @Test - public void testValidKeyETStandard() { - runGATKKeyTest("testValidKeyETStandard", - "", - "-K " + keysDataLocation + "valid.key", - null, - MD5_UPON_SUCCESSFUL_RUN); - } - - @Test - public void testNoKeyNoET() { - runGATKKeyTest("testNoKeyNoET", - "-et " + GATKRunReport.PhoneHomeOption.NO_ET, - "", - UserException.class, - null); - } - - @Test - public void testNoKeyETStdout() { - runGATKKeyTest("testNoKeyETStdout", - "-et " + GATKRunReport.PhoneHomeOption.STDOUT, - "", - UserException.class, - null); - } - - @Test - public void testNoKeyETStandard() { - runGATKKeyTest("testNoKeyETStandard", - "", - "", - null, - MD5_UPON_SUCCESSFUL_RUN); - } - - @Test - public void testRevokedKey() { - runGATKKeyTest("testRevokedKey", - "-et " + GATKRunReport.PhoneHomeOption.NO_ET, - "-K " + keysDataLocation + "revoked.key", - UserException.KeySignatureVerificationException.class, - null); - } - - @DataProvider(name = "CorruptKeyTestData") - public Object[][] corruptKeyDataProvider() { - return new Object[][] { - { "corrupt_empty.key", UserException.UnreadableKeyException.class }, - { "corrupt_single_byte_file.key", UserException.UnreadableKeyException.class }, - { "corrupt_random_contents.key", UserException.UnreadableKeyException.class }, - { "corrupt_single_byte_deletion.key", 
UserException.UnreadableKeyException.class }, - { "corrupt_single_byte_insertion.key", UserException.UnreadableKeyException.class }, - { "corrupt_single_byte_change.key", UserException.UnreadableKeyException.class }, - { "corrupt_multi_byte_deletion.key", UserException.UnreadableKeyException.class }, - { "corrupt_multi_byte_insertion.key", UserException.UnreadableKeyException.class }, - { "corrupt_multi_byte_change.key", UserException.UnreadableKeyException.class }, - { "corrupt_bad_isize_field.key", UserException.UnreadableKeyException.class }, - { "corrupt_bad_crc.key", UserException.UnreadableKeyException.class }, - { "corrupt_no_email_address.key", UserException.UnreadableKeyException.class }, - { "corrupt_no_sectional_delimiter.key", UserException.UnreadableKeyException.class }, - { "corrupt_no_signature.key", UserException.UnreadableKeyException.class }, - { "corrupt_bad_signature.key", UserException.KeySignatureVerificationException.class }, - { "corrupt_non_gzipped_valid_key.key", UserException.UnreadableKeyException.class } - }; - } - - @Test(dataProvider = "CorruptKeyTestData") - public void testCorruptKey ( String corruptKeyName, Class expectedException ) { - runGATKKeyTest(String.format("testCorruptKey (%s)", corruptKeyName), - "-et " + GATKRunReport.PhoneHomeOption.NO_ET, - "-K " + keysDataLocation + corruptKeyName, - expectedException, - null); - } - - @Test - public void testCorruptButNonRequiredKey() { - runGATKKeyTest("testCorruptButNonRequiredKey", - "", - "-K " + keysDataLocation + "corrupt_random_contents.key", - null, - MD5_UPON_SUCCESSFUL_RUN); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/crypt/GATKKeyUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/crypt/GATKKeyUnitTest.java deleted file mode 100644 index 5fd6475ce..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/crypt/GATKKeyUnitTest.java +++ /dev/null @@ -1,129 +0,0 @@ -/* -* 
Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.crypt; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.testng.SkipException; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.io.File; -import java.security.KeyPair; -import java.security.PrivateKey; -import java.security.PublicKey; - -public class GATKKeyUnitTest extends BaseTest { - - @Test - public void testCreateGATKKeyUsingMasterKeyPair() { - if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { - throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", - "testCreateGATKKeyUsingMasterKeyPair")); - } - - PrivateKey masterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); - PublicKey masterPublicKey = CryptUtils.loadGATKMasterPublicKey(); - - // We should be able to create a valid GATKKey using our master key pair: - GATKKey key = new GATKKey(masterPrivateKey, masterPublicKey, "foo@bar.com"); - Assert.assertTrue(key.isValid()); - } - - @Test - public void testCreateGATKKeyUsingMasterPrivateKeyAndDistributedPublicKey() { - if ( gatkPrivateKeyExistsButReadPermissionDenied() ) { - throw new SkipException(String.format("Skipping test %s because we do not have permission to read the GATK private key", - "testCreateGATKKeyUsingMasterPrivateKeyAndDistributedPublicKey")); - } - - PrivateKey masterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); - PublicKey distributedPublicKey = CryptUtils.loadGATKDistributedPublicKey(); - - // We should also be able to create a valid GATKKey using our master private - // key and the public key we distribute with the GATK: - GATKKey key = new GATKKey(masterPrivateKey, distributedPublicKey, "foo@bar.com"); - Assert.assertTrue(key.isValid()); - } - - @Test( expectedExceptions = ReviewedGATKException.class ) - public void testKeyPairMismatch() { - 
KeyPair firstKeyPair = CryptUtils.generateKeyPair(); - KeyPair secondKeyPair = CryptUtils.generateKeyPair(); - - // Attempting to create a GATK Key with private and public keys that aren't part of the - // same key pair should immediately trigger a validation failure: - GATKKey key = new GATKKey(firstKeyPair.getPrivate(), secondKeyPair.getPublic(), "foo@bar.com"); - } - - @Test( expectedExceptions = ReviewedGATKException.class ) - public void testEncryptionAlgorithmMismatch() { - KeyPair keyPair = CryptUtils.generateKeyPair(CryptUtils.DEFAULT_KEY_LENGTH, "DSA", CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); - - // Attempting to use a DSA private key to create an RSA signature should throw an error: - GATKKey key = new GATKKey(keyPair.getPrivate(), keyPair.getPublic(), "foo@bar.com", "SHA1withRSA"); - } - - @Test( expectedExceptions = UserException.class ) - public void testInvalidEmailAddress() { - String emailAddressWithNulByte = new String(new byte[] { 0 }); - KeyPair keyPair = CryptUtils.generateKeyPair(); - - // Email addresses cannot contain the NUL byte, since it's used as a sectional delimiter in the key file: - GATKKey key = new GATKKey(keyPair.getPrivate(), keyPair.getPublic(), emailAddressWithNulByte); - } - - @Test - public void testCreateGATKKeyFromValidKeyFile() { - GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "valid.key")); - Assert.assertTrue(key.isValid()); - } - - @Test( expectedExceptions = UserException.UnreadableKeyException.class ) - public void testCreateGATKKeyFromCorruptKeyFile() { - GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "corrupt_random_contents.key")); - } - - @Test - public void testCreateGATKKeyFromRevokedKeyFile() { - GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "revoked.key")); - Assert.assertFalse(key.isValid()); - } - - @Test( expectedExceptions = 
UserException.CouldNotReadInputFile.class ) - public void testCreateGATKKeyFromNonExistentFile() { - File nonExistentFile = new File("ghfdkgsdhg.key"); - Assert.assertFalse(nonExistentFile.exists()); - - GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), nonExistentFile); - } - - private boolean gatkPrivateKeyExistsButReadPermissionDenied() { - File gatkPrivateKey = new File(CryptUtils.GATK_MASTER_PRIVATE_KEY_FILE); - return gatkPrivateKey.exists() && ! gatkPrivateKey.canRead(); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalIntegrationTest.java deleted file mode 100644 index 8d8f7d261..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalIntegrationTest.java +++ /dev/null @@ -1,304 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.interval; - -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.Collections; - -/** - * Test the GATK core interval parsing mechanism. - */ -public class IntervalIntegrationTest extends WalkerTest { - @Test(enabled = true) - public void testAllImplicitIntervalParsing() { - String md5 = "7821db9e14d4f8e07029ff1959cd5a99"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -o %s", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testAllIntervalsImplicit",spec); - } - -// '-L all' is no longer supported -// @Test(enabled = true) -// public void testAllExplicitIntervalParsing() { -// String md5 = "7821db9e14d4f8e07029ff1959cd5a99"; -// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( -// "-T CountLoci" + -// " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + -// " -R " + hg18Reference + -// " -L all" + -// " -o %s", -// 1, // just one output file -// Arrays.asList(md5)); -// executeTest("testAllIntervalsExplicit",spec); -// } - - @Test - public void testUnmappedReadInclusion() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T PrintReads" + - " -I " + validationDataLocation + "MV1994.bam" + - " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + - " -L unmapped" + - " -U", - 0, // two output files - Collections.emptyList()); - - // our base file - File baseOutputFile = createTempFile("testUnmappedReadInclusion",".bam"); - 
spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("95e98192e5b90cf80eaa87a4ace263da",createTempFileFromBase(baseOutputFile.getAbsolutePath())); - spec.addAuxFile("fadcdf88597b9609c5f2a17f4c6eb455", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); - - executeTest("testUnmappedReadInclusion",spec); - } - - @Test - public void testMixedMappedAndUnmapped() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T PrintReads" + - " -I " + validationDataLocation + "MV1994.bam" + - " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + - " -L Escherichia_coli_K12:4630000-4639675" + - " -L unmapped" + - " -U", - 0, // two output files - Collections.emptyList()); - - // our base file - File baseOutputFile = createTempFile("testUnmappedReadInclusion",".bam"); - spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("3944b5a6bfc06277ed3afb928a20d588",createTempFileFromBase(baseOutputFile.getAbsolutePath())); - spec.addAuxFile("fa90ff91ac0cc689c71a3460a3530b8b", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); - - executeTest("testUnmappedReadInclusion",spec); - } - - - @Test(enabled = false) - public void testUnmappedReadExclusion() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T PrintReads" + - " -I " + validationDataLocation + "MV1994.bam" + - " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + - " -XL unmapped" + - " -U", - 0, // two output files - Collections.emptyList()); - - // our base file - File baseOutputFile = createTempFile("testUnmappedReadExclusion",".bam"); - spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("80887ba488e53dabd9596ff93070ae75",createTempFileFromBase(baseOutputFile.getAbsolutePath())); - spec.addAuxFile("b341d808ecc33217f37c0c0cde2a3e2f", 
createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); - - executeTest("testUnmappedReadExclusion",spec); - } - - @Test(enabled = true) - public void testIntervalParsingFromFile() { - String md5 = "48a24b70a0b376535542b996af517398"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -o %s" + - " -L " + validationDataLocation + "intervalTest.1.vcf", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testIntervalParsingFromFile", spec); - } - - @Test(enabled = true) - public void testIntervalMergingFromFiles() { - String md5 = "9ae0ea9e3c9c6e1b9b6252c8395efdc1"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -o %s" + - " -L " + validationDataLocation + "intervalTest.1.vcf" + - " -L " + validationDataLocation + "intervalTest.2.vcf", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testIntervalMergingFromFiles", spec); - } - - @Test(enabled = true) - public void testIntervalExclusionsFromFiles() { - String md5 = "26ab0db90d72e28ad0ba1e22ee510510"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -o %s" + - " -L " + validationDataLocation + "intervalTest.1.vcf" + - " -XL " + validationDataLocation + "intervalTest.2.vcf", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testIntervalExclusionsFromFiles", spec); - } - - @Test(enabled = true) - public void testMixedIntervalMerging() { - String md5 = "7c5aba41f53293b712fd86d08ed5b36e"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + 
"OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -o %s" + - " -L " + validationDataLocation + "intervalTest.1.vcf" + - " -L chr1:1677524-1677528", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testMixedIntervalMerging", spec); - } - - @Test(enabled = true) - public void testBed() { - String md5 = "cf4278314ef8e4b996e1b798d8eb92cf"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -o %s" + - " -L " + validationDataLocation + "intervalTest.bed", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testBed", spec); - } - - @Test(enabled = true) - public void testComplexVCF() { - String md5 = "166d77ac1b46a1ec38aa35ab7e628ab5"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -o %s" + - " -L " + validationDataLocation + "intervalTest.3.vcf", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testComplexVCF", spec); - } - - @Test(enabled = true) - public void testComplexVCFWithPadding() { - String md5 = "649ee93d50739c656e94ec88a32c7ffe"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " --interval_padding 2" + - " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -o %s" + - " -L " + validationDataLocation + "intervalTest.3.vcf", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testComplexVCFWithPadding", spec); - } - - @Test(enabled = true) - public void testMergingWithComplexVCF() { - String md5 = "6d7fce9fee471194aa8b5b6e47267f03"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -o %s" + - " -L " + 
validationDataLocation + "intervalTest.1.vcf" + - " -XL " + validationDataLocation + "intervalTest.3.vcf", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testMergingWithComplexVCF", spec); - } - - @Test(enabled = true) - public void testEmptyVCF() { - String md5 = "897316929176464ebc9ad085f31e7284"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -o %s" + - " -L " + validationDataLocation + "intervalTest.empty.vcf", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testEmptyVCFWarning", spec); - } - - @Test(enabled = true) - public void testIncludeExcludeIsTheSame() { - String md5 = "897316929176464ebc9ad085f31e7284"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -o %s" + - " -L " + validationDataLocation + "intervalTest.1.vcf" + - " -XL " + validationDataLocation + "intervalTest.1.vcf", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testIncludeExcludeIsTheSame", spec); - } - - @Test(enabled = true) - public void testSymbolicAlleles() { - String md5 = "52745056d2fd5904857bbd4984c08098"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + "NA12878.chrom1.SLX.SRP000032.2009_06.bam" + - " -R " + b36KGReference + - " -o %s" + - " -L " + privateTestDir + "symbolic_alleles_1.vcf", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testSymbolicAlleles", spec); - } - - @Test - public void testIntersectionOfLexicographicallySortedIntervals() { - final String md5 = "18be9375e5a753f766616a51eb6131f0"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - " -T CountLoci" + - " -I " + privateTestDir + "NA12878.4.snippet.bam" + - " -R " + b37KGReference + - " -L 
" + privateTestDir + "lexicographicallySortedIntervals.bed" + - " -L 4" + - " -isr INTERSECTION" + - " -o %s", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testIntersectionOfLexicographicallySortedIntervals", spec); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java deleted file mode 100644 index e9846da21..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java +++ /dev/null @@ -1,1110 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.interval; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.reference.ReferenceSequenceFile; -import htsjdk.samtools.util.Interval; -import htsjdk.samtools.util.IntervalList; -import htsjdk.samtools.SAMFileHeader; -import org.apache.commons.io.FileUtils; -import htsjdk.tribble.Feature; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.commandline.IntervalBinding; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.arguments.GATKArgumentCollection; -import org.broadinstitute.gatk.engine.datasources.reference.ReferenceDataSource; -import org.broadinstitute.gatk.utils.GenomeLoc; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.GenomeLocSortedSet; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - -/** - * test out the interval utility methods - */ -public class IntervalUtilsUnitTest extends BaseTest { - // used to seed the genome loc parser with a sequence dictionary - private SAMFileHeader hg18Header; - private GenomeLocParser hg18GenomeLocParser; - private List hg18ReferenceLocs; - private SAMFileHeader hg19Header; - private GenomeLocParser hg19GenomeLocParser; - private List hg19ReferenceLocs; - private List hg19exomeIntervals; - - private List getLocs(String... 
intervals) { - return getLocs(Arrays.asList(intervals)); - } - - private List getLocs(List intervals) { - if (intervals.size() == 0) - return hg18ReferenceLocs; - List locs = new ArrayList(); - for (String interval: intervals) - locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); - return Collections.unmodifiableList(locs); - } - - @BeforeClass - public void init() { - File hg18Ref = new File(BaseTest.hg18Reference); - try { - ReferenceDataSource referenceDataSource = new ReferenceDataSource(hg18Ref); - hg18Header = new SAMFileHeader(); - hg18Header.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); - ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg18Ref); - hg18GenomeLocParser = new GenomeLocParser(seq); - hg18ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ; - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(hg18Ref,ex); - } - - File hg19Ref = new File(BaseTest.hg19Reference); - try { - ReferenceDataSource referenceDataSource = new ReferenceDataSource(hg19Ref); - hg19Header = new SAMFileHeader(); - hg19Header.setSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()); - ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref); - hg19GenomeLocParser = new GenomeLocParser(seq); - hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ; - - hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals))); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(hg19Ref,ex); - } - } - - // ------------------------------------------------------------------------------------- - // - // tests to ensure the 
quality of the interval cuts of the interval cutting functions - // - // ------------------------------------------------------------------------------------- - - private class IntervalSlicingTest extends TestDataProvider { - public int parts; - public double maxAllowableVariance; - - private IntervalSlicingTest(final int parts, final double maxAllowableVariance) { - super(IntervalSlicingTest.class); - this.parts = parts; - this.maxAllowableVariance = maxAllowableVariance; - } - - public String toString() { - return String.format("IntervalSlicingTest parts=%d maxVar=%.2f", parts, maxAllowableVariance); - } - } - - @DataProvider(name = "intervalslicingdata") - public Object[][] createTrees() { - new IntervalSlicingTest(1, 0); - new IntervalSlicingTest(2, 1); - new IntervalSlicingTest(5, 1); - new IntervalSlicingTest(10, 1); - new IntervalSlicingTest(67, 1); - new IntervalSlicingTest(100, 1); - new IntervalSlicingTest(500, 1); - new IntervalSlicingTest(1000, 1); - return IntervalSlicingTest.getTests(IntervalSlicingTest.class); - } - - @Test(enabled = true, dataProvider = "intervalslicingdata") - public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { - List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); - - long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); - long idealSplitSize = totalSize / test.parts; - - long sumOfSplitSizes = 0; - int counter = 0; - for ( final List split : splits ) { - long splitSize = IntervalUtils.intervalSize(split); - double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize); - //logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); - counter++; - sumOfSplitSizes += splitSize; - Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, 
test.maxAllowableVariance)); - } - - Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); - } - - // ------------------------------------------------------------------------------------- - // - // splitLocusIntervals tests - // - // ------------------------------------------------------------------------------------- - - /** large scale tests for many intervals */ - private class SplitLocusIntervalsTest extends TestDataProvider { - final List originalIntervals; - final public int parts; - - private SplitLocusIntervalsTest(final String name, List originalIntervals, final int parts) { - super(SplitLocusIntervalsTest.class, name); - this.parts = parts; - this.originalIntervals = originalIntervals; - } - - public String toString() { - return String.format("%s parts=%d", super.toString(), parts); - } - } - - @DataProvider(name = "IntervalRepartitionTest") - public Object[][] createIntervalRepartitionTest() { - for ( int parts : Arrays.asList(1, 2, 3, 10, 13, 100, 151, 1000, 10000) ) { - //for ( int parts : Arrays.asList(10) ) { - new SplitLocusIntervalsTest("hg19RefLocs", hg19ReferenceLocs, parts); - new SplitLocusIntervalsTest("hg19ExomeLocs", hg19exomeIntervals, parts); - } - - return SplitLocusIntervalsTest.getTests(SplitLocusIntervalsTest.class); - } - - @Test(enabled = true, dataProvider = "IntervalRepartitionTest") - public void testIntervalRepartition(SplitLocusIntervalsTest test) { - List> splitByLocus = IntervalUtils.splitLocusIntervals(test.originalIntervals, test.parts); - Assert.assertEquals(splitByLocus.size(), test.parts, "SplitLocusIntervals failed to generate correct number of intervals"); - List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); - - // test overall size - final long originalSize = IntervalUtils.intervalSize(test.originalIntervals); - final long flatSize = IntervalUtils.intervalSize(flat); - Assert.assertEquals(flatSize, originalSize, 
"SplitLocusIntervals locs cover an incorrect number of bases"); - - // test size of each split - final long ideal = (long)Math.floor(originalSize / (1.0 * test.parts)); - final long maxSize = ideal + (originalSize % test.parts) * test.parts; // no more than N * rounding error in size - for ( final List split : splitByLocus ) { - final long splitSize = IntervalUtils.intervalSize(split); - Assert.assertTrue(splitSize >= ideal && splitSize <= maxSize, - String.format("SplitLocusIntervals interval (start=%s) has size %d outside of bounds ideal=%d, max=%d", - split.get(0), splitSize, ideal, maxSize)); - } - - // test that every base in original is covered once by a base in split by locus intervals - String diff = IntervalUtils.equateIntervals(test.originalIntervals, flat); - Assert.assertNull(diff, diff); - } - - /** small scale tests where the expected cuts are enumerated upfront for testing */ - private class SplitLocusIntervalsSmallTest extends TestDataProvider { - final List original; - final public int parts; - final public int expectedParts; - final List expected; - - private SplitLocusIntervalsSmallTest(final String name, List originalIntervals, final int parts, List expected) { - this(name, originalIntervals, parts, expected, parts); - } - - private SplitLocusIntervalsSmallTest(final String name, List originalIntervals, final int parts, List expected, int expectedParts) { - super(SplitLocusIntervalsSmallTest.class, name); - this.parts = parts; - this.expectedParts = expectedParts; - this.original = originalIntervals; - this.expected = expected; - } - - public String toString() { - return String.format("%s parts=%d", super.toString(), parts); - } - } - - @DataProvider(name = "SplitLocusIntervalsSmallTest") - public Object[][] createSplitLocusIntervalsSmallTest() { - GenomeLoc bp01_10 = hg19GenomeLocParser.createGenomeLoc("1", 1, 10); - - GenomeLoc bp1_5 = hg19GenomeLocParser.createGenomeLoc("1", 1, 5); - GenomeLoc bp6_10 = hg19GenomeLocParser.createGenomeLoc("1", 
6, 10); - new SplitLocusIntervalsSmallTest("cut into two", Arrays.asList(bp01_10), 2, Arrays.asList(bp1_5, bp6_10)); - - GenomeLoc bp20_30 = hg19GenomeLocParser.createGenomeLoc("1", 20, 30); - new SplitLocusIntervalsSmallTest("two in two", Arrays.asList(bp01_10, bp20_30), 2, Arrays.asList(bp01_10, bp20_30)); - - GenomeLoc bp1_7 = hg19GenomeLocParser.createGenomeLoc("1", 1, 7); - GenomeLoc bp8_10 = hg19GenomeLocParser.createGenomeLoc("1", 8, 10); - GenomeLoc bp20_23 = hg19GenomeLocParser.createGenomeLoc("1", 20, 23); - GenomeLoc bp24_30 = hg19GenomeLocParser.createGenomeLoc("1", 24, 30); - new SplitLocusIntervalsSmallTest("two in three", Arrays.asList(bp01_10, bp20_30), 3, - Arrays.asList(bp1_7, bp8_10, bp20_23, bp24_30)); - - GenomeLoc bp1_2 = hg19GenomeLocParser.createGenomeLoc("1", 1, 2); - GenomeLoc bp1_1 = hg19GenomeLocParser.createGenomeLoc("1", 1, 1); - GenomeLoc bp2_2 = hg19GenomeLocParser.createGenomeLoc("1", 2, 2); - new SplitLocusIntervalsSmallTest("too many pieces", Arrays.asList(bp1_2), 5, Arrays.asList(bp1_1, bp2_2), 2); - - new SplitLocusIntervalsSmallTest("emptyList", Collections.emptyList(), 5, Collections.emptyList(), 0); - - return SplitLocusIntervalsSmallTest.getTests(SplitLocusIntervalsSmallTest.class); - } - - @Test(enabled = true, dataProvider = "SplitLocusIntervalsSmallTest") - public void splitLocusIntervalsSmallTest(SplitLocusIntervalsSmallTest test) { - List> splitByLocus = IntervalUtils.splitLocusIntervals(test.original, test.parts); - Assert.assertEquals(splitByLocus.size(), test.expectedParts, "SplitLocusIntervals failed to generate correct number of intervals"); - List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); - - // test sizes - final long originalSize = IntervalUtils.intervalSize(test.original); - final long splitSize = IntervalUtils.intervalSize(flat); - Assert.assertEquals(splitSize, originalSize, "SplitLocusIntervals locs cover an incorrect number of bases"); - - Assert.assertEquals(flat, test.expected, 
"SplitLocusIntervals locs not expected intervals"); - } - - // - // Misc. tests - // - - @Test(expectedExceptions=UserException.class) - public void testMergeListsBySetOperatorNoOverlap() { - // a couple of lists we'll use for the testing - List listEveryTwoFromOne = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); - - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 2 == 0) - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - else - listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - } - - List ret; - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 100); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, null); - Assert.assertEquals(ret.size(), 100); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 0); - } - - @Test - public void testMergeListsBySetOperatorAllOverlap() { - // a couple of lists we'll use for the testing - List allSites = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); - - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 2 == 0) - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - } - - List ret; - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 150); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, null); - Assert.assertEquals(ret.size(), 150); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 50); - } - - @Test - public void testMergeListsBySetOperator() { - // a couple of lists we'll use for 
the testing - List allSites = new ArrayList(); - List listEveryTwoFromTwo = new ArrayList(); - - // create the two lists we'll use - for (int x = 1; x < 101; x++) { - if (x % 5 == 0) { - listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); - } - } - - List ret; - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); - Assert.assertEquals(ret.size(), 40); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, null); - Assert.assertEquals(ret.size(), 40); - ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 20); - } - - @Test - public void testOverlappingIntervalsFromSameSourceWithIntersection() { - // a couple of lists we'll use for the testing - List source1 = new ArrayList(); - List source2 = new ArrayList(); - - source1.add(hg18GenomeLocParser.createGenomeLoc("chr1", 10, 20)); - source1.add(hg18GenomeLocParser.createGenomeLoc("chr1", 15, 25)); - - source2.add(hg18GenomeLocParser.createGenomeLoc("chr1", 16, 18)); - source2.add(hg18GenomeLocParser.createGenomeLoc("chr1", 22, 24)); - - List ret = IntervalUtils.mergeListsBySetOperator(source1, source2, IntervalSetRule.INTERSECTION); - Assert.assertEquals(ret.size(), 2); - } - - @Test - public void testGetContigLengths() { - Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); - Assert.assertEquals((long)lengths.get("chr1"), 247249719); - Assert.assertEquals((long)lengths.get("chr2"), 242951149); - Assert.assertEquals((long)lengths.get("chr3"), 199501827); - Assert.assertEquals((long)lengths.get("chr20"), 62435964); - Assert.assertEquals((long)lengths.get("chrX"), 154913754); - } - - @Test - public void testParseIntervalArguments() { - Assert.assertEquals(getLocs().size(), 45); - Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); - 
Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); - } - - @Test - public void testIsIntervalFile() { - Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.privateTestDir + "empty_intervals.list")); - Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.privateTestDir + "empty_intervals.list", true)); - - List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); - for (String extension: extensions) { - Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); - } - } - - @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) - public void testMissingIntervalFile() { - IntervalUtils.isIntervalFile(BaseTest.privateTestDir + "no_such_intervals.list"); - } - - @Test - public void testFixedScatterIntervalsBasic() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - - List files = testFiles("basic.", 3, ".intervals"); - - List locs = getLocs("chr1", "chr2", "chr3"); - List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterFixedIntervalsLessFiles() { - GenomeLoc chr1 = 
hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); - - List files = testFiles("less.", 3, ".intervals"); - - List locs = getLocs("chr1", "chr2", "chr3", "chr4"); - List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - Assert.assertEquals(locs3.get(1), chr4); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testSplitFixedIntervalsMoreFiles() { - List files = testFiles("more.", 3, ".intervals"); - List locs = getLocs("chr1", "chr2"); - IntervalUtils.splitFixedIntervals(locs, files.size()); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testScatterFixedIntervalsMoreFiles() { - List files = testFiles("more.", 3, ".intervals"); - List locs = getLocs("chr1", "chr2"); - List> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() - IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - } - @Test - public void testScatterFixedIntervalsStart() { - List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); - 
GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1a); - Assert.assertEquals(locs2.get(0), chr1b); - Assert.assertEquals(locs3.get(0), chr2); - Assert.assertEquals(locs3.get(1), chr3); - } - - @Test - public void testScatterFixedIntervalsMiddle() { - List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); - GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); - List locs3 = 
IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2a); - Assert.assertEquals(locs3.get(0), chr2b); - Assert.assertEquals(locs3.get(1), chr3); - } - - @Test - public void testScatterFixedIntervalsEnd() { - List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); - GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); - GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); - - List files = testFiles("split.", 3, ".intervals"); - - List locs = getLocs(intervals); - List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs1.get(1), chr2); - Assert.assertEquals(locs2.get(0), chr3a); - Assert.assertEquals(locs3.get(0), chr3b); - } - - @Test - public void testScatterFixedIntervalsFile() { - List files = testFiles("sg.", 20, ".intervals"); - List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list")); - List> splits = 
IntervalUtils.splitFixedIntervals(locs, files.size()); - - int[] counts = { - 125, 138, 287, 291, 312, 105, 155, 324, - 295, 298, 141, 121, 285, 302, 282, 88, - 116, 274, 282, 248 -// 5169, 5573, 10017, 10567, 10551, -// 5087, 4908, 10120, 10435, 10399, -// 5391, 4735, 10621, 10352, 10654, -// 5227, 5256, 10151, 9649, 9825 - }; - - //String splitCounts = ""; - for (int i = 0; i < splits.size(); i++) { - int splitCount = splits.get(i).size(); - Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); - } - //System.out.println(splitCounts.substring(2)); - - IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - - int locIndex = 0; - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file)); - Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); - for (GenomeLoc parsedLoc: parsedLocs) - Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); - } - Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); - } - - @Test - public void testScatterFixedIntervalsMax() { - List files = testFiles("sg.", 85, ".intervals"); - List> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); - IntervalUtils.scatterFixedIntervals(hg19Header, splits, files); - - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file)); - Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); - Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); - } - } - - @Test - public void testScatterContigIntervalsOrder() { - List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - 
GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("split.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr2); - Assert.assertEquals(locs2.get(0), chr1); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsBasic() { - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - - List files = testFiles("contig_basic.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsLessFiles() { - GenomeLoc chr1 = 
hg18GenomeLocParser.parseGenomeLoc("chr1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); - GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); - - List files = testFiles("contig_less.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs1.get(1), chr2); - Assert.assertEquals(locs2.get(0), chr3); - Assert.assertEquals(locs3.get(0), chr4); - } - - @Test(expectedExceptions=UserException.BadInput.class) - public void testScatterContigIntervalsMoreFiles() { - List files = testFiles("contig_more.", 3, ".intervals"); - IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); - } - - @Test - public void testScatterContigIntervalsStart() { - List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); - GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); - GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("contig_split_start.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); - List locs2 = 
IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - - Assert.assertEquals(locs1.size(), 2); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1a); - Assert.assertEquals(locs1.get(1), chr1b); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsMiddle() { - List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); - GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); - GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); - - List files = testFiles("contig_split_middle.", 3, ".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 2); - Assert.assertEquals(locs3.size(), 1); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2a); - Assert.assertEquals(locs2.get(1), chr2b); - Assert.assertEquals(locs3.get(0), chr3); - } - - @Test - public void testScatterContigIntervalsEnd() { - List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); - GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); - GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); - 
GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); - GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); - - List files = testFiles("contig_split_end.", 3 ,".intervals"); - - IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); - - Assert.assertEquals(locs1.size(), 1); - Assert.assertEquals(locs2.size(), 1); - Assert.assertEquals(locs3.size(), 2); - - Assert.assertEquals(locs1.get(0), chr1); - Assert.assertEquals(locs2.get(0), chr2); - Assert.assertEquals(locs3.get(0), chr3a); - Assert.assertEquals(locs3.get(1), chr3b); - } - - @Test - public void testScatterContigIntervalsMax() { - List files = testFiles("sg.", 85, ".intervals"); - IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); - - for (int i = 0; i < files.size(); i++) { - String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file)); - Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); - Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); - } - } - - private List testFiles(String prefix, int count, String suffix) { - ArrayList files = new ArrayList(); - for (int i = 1; i <= count; i++) { - files.add(createTempFile(prefix + i, suffix)); - } - return files; - } - - @DataProvider(name="unmergedIntervals") - public Object[][] getUnmergedIntervals() { - return new Object[][] { - new Object[] {"small_unmerged_picard_intervals.list"}, - new Object[] {"small_unmerged_gatk_intervals.list"} - }; - } - - @Test(dataProvider="unmergedIntervals") - public void 
testUnmergedIntervals(String unmergedIntervals) { - List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(privateTestDir + unmergedIntervals)); - Assert.assertEquals(locs.size(), 2); - - List merged; - - merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); - Assert.assertEquals(merged.size(), 1); - - // Test that null means the same as ALL - merged = IntervalUtils.mergeIntervalLocations(locs, null); - Assert.assertEquals(merged.size(), 1); - } - - /* - Split into tests that can be written to files and tested by writeFlankingIntervals, - and lists that cannot but are still handled by getFlankingIntervals. - */ - private static abstract class FlankingIntervalsTestData extends TestDataProvider { - final public File referenceFile; - final public GenomeLocParser parser; - final int basePairs; - final List original; - final List expected; - - protected FlankingIntervalsTestData(Class clazz, String name, File referenceFile, GenomeLocParser parser, - int basePairs, List original, List expected) { - super(clazz, name); - this.referenceFile = referenceFile; - this.parser = parser; - this.basePairs = basePairs; - this.original = parse(parser, original); - this.expected = parse(parser, expected); - } - - private static List parse(GenomeLocParser parser, List locs) { - List parsed = new ArrayList(); - for (String loc: locs) - parsed.add("unmapped".equals(loc) ? 
GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc)); - return parsed; - } - } - - private static class FlankingIntervalsFile extends FlankingIntervalsTestData { - public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser, - int basePairs, List original, List expected) { - super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected); - } - } - - private static class FlankingIntervalsList extends FlankingIntervalsTestData { - public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser, - int basePairs, List original, List expected) { - super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected); - } - } - - /* Intervals where the original and the flanks can be written to files. */ - @DataProvider(name = "flankingIntervalsFiles") - public Object[][] getFlankingIntervalsFiles() { - File hg19ReferenceFile = new File(BaseTest.hg19Reference); - int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength(); - - new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, - Arrays.asList("1:1"), - Arrays.asList("1:2")); - - new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:1"), - Arrays.asList("1:2-51")); - - new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:1-10"), - Arrays.asList("1:11-60")); - - new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, - Arrays.asList("1:" + hg19Length1), - Arrays.asList("1:" + (hg19Length1 - 1))); - - new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:" + hg19Length1), - Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1))); - - new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, 
hg19Length1)), - Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11))); - - new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, - Arrays.asList("1:2"), - Arrays.asList("1:1", "1:3")); - - new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:21-30"), - Arrays.asList("1:1-20", "1:31-80")); - - new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, - Arrays.asList("1:" + (hg19Length1 - 1)), - Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1)); - - new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)), - Arrays.asList( - String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31), - String.format("1:%d-%d", hg19Length1 - 20, hg19Length1))); - - new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, - Arrays.asList("1:3"), - Arrays.asList("1:2", "1:4")); - - new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:101-200"), - Arrays.asList("1:51-100", "1:201-250")); - - new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, - Arrays.asList("1:" + (hg19Length1 - 3)), - Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2))); - - new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)), - Arrays.asList( - String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201), - String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51))); - - new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:101-200", "1:401-500"), - Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550")); - - new 
FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:101-200", "1:301-400"), - Arrays.asList("1:51-100", "1:201-300", "1:401-450")); - - new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:101-200", "1:271-400"), - Arrays.asList("1:51-100", "1:201-270", "1:401-450")); - - new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:101-200", "1:221-400"), - Arrays.asList("1:51-100", "1:201-220", "1:401-450")); - - new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:101-200", "1:121-400"), - Arrays.asList("1:51-100", "1:401-450")); - - new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:101-200", "2:301-400"), - Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450")); - - // Explicit testing a problematic agilent target pair - new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("2:74756257-74756411", "2:74756487-74756628"), - // wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679") - Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678")); - - return TestDataProvider.getTests(FlankingIntervalsFile.class); - } - - /* Intervals where either the original and/or the flanks cannot be written to a file. 
*/ - @DataProvider(name = "flankingIntervalsLists") - public Object[][] getFlankingIntervalsLists() { - File hg19ReferenceFile = new File(BaseTest.hg19Reference); - List empty = Collections.emptyList(); - - new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50, - empty, - empty); - - new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("unmapped"), - empty); - - new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1"), - empty); - - new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1", "2", "3"), - empty); - - new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, - Arrays.asList("1:101-200", "1:301-400", "unmapped"), - Arrays.asList("1:51-100", "1:201-300", "1:401-450")); - - return TestDataProvider.getTests(FlankingIntervalsList.class); - } - - @Test(dataProvider = "flankingIntervalsFiles") - public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception { - File originalFile = createTempFile("original.", ".intervals"); - File flankingFile = createTempFile("flanking.", ".intervals"); - try { - List lines = new ArrayList(); - for (GenomeLoc loc: data.original) - lines.add(loc.toString()); - FileUtils.writeLines(originalFile, lines); - - IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); - - List actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath()); - - String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", - data.toString(), data.original, actual, data.expected); - Assert.assertEquals(actual, data.expected, description); - } finally { - FileUtils.deleteQuietly(originalFile); - FileUtils.deleteQuietly(flankingFile); - } - } - - @Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class) 
- public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception { - File originalFile = createTempFile("original.", ".intervals"); - File flankingFile = createTempFile("flanking.", ".intervals"); - try { - List lines = new ArrayList(); - for (GenomeLoc loc: data.original) - lines.add(loc.toString()); - FileUtils.writeLines(originalFile, lines); - - // Should throw a user exception on bad input if either the original - // intervals are empty or if the flanking intervals are empty - IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); - } finally { - FileUtils.deleteQuietly(originalFile); - FileUtils.deleteQuietly(flankingFile); - } - } - - @Test(dataProvider = "flankingIntervalsLists") - public void testGetFlankingIntervals(FlankingIntervalsTestData data) { - List actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs); - String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", - data.toString(), data.original, actual, data.expected); - Assert.assertEquals(actual, data.expected, description); - } - - @Test(expectedExceptions=UserException.BadArgumentValue.class) - public void testExceptionUponLegacyIntervalSyntax() throws Exception { - GenomeAnalysisEngine toolkit = new GenomeAnalysisEngine(); - toolkit.setGenomeLocParser(new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)))); - - // Attempting to use the legacy -L "interval1;interval2" syntax should produce an exception: - IntervalBinding binding = new IntervalBinding("1;2"); - binding.getIntervals(toolkit); - } - - @DataProvider(name="invalidIntervalTestData") - public Object[][] invalidIntervalDataProvider() throws Exception { - GATKArgumentCollection argCollection = new GATKArgumentCollection(); - File fastaFile = new File(publicTestDir + "exampleFASTA.fasta"); - GenomeLocParser genomeLocParser = new GenomeLocParser(new 
IndexedFastaSequenceFile(fastaFile)); - - return new Object[][] { - new Object[] {argCollection, genomeLocParser, "chr1", 10000000, 20000000}, - new Object[] {argCollection, genomeLocParser, "chr2", 1, 2}, - new Object[] {argCollection, genomeLocParser, "chr1", -1, 50} - }; - } - - @Test(dataProvider="invalidIntervalTestData") - public void testInvalidPicardIntervalHandling(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, - String contig, int intervalStart, int intervalEnd ) throws Exception { - - SAMFileHeader picardFileHeader = new SAMFileHeader(); - picardFileHeader.addSequence(genomeLocParser.getContigInfo("chr1")); - IntervalList picardIntervals = new IntervalList(picardFileHeader); - picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname")); - - File picardIntervalFile = createTempFile("testInvalidPicardIntervalHandling", ".intervals"); - picardIntervals.write(picardIntervalFile); - - List> intervalArgs = new ArrayList>(1); - intervalArgs.add(new IntervalBinding(picardIntervalFile.getAbsolutePath())); - - IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser); - } - - @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") - public void testInvalidGATKFileIntervalHandling(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, - String contig, int intervalStart, int intervalEnd ) throws Exception { - - File gatkIntervalFile = createTempFile("testInvalidGATKFileIntervalHandling", ".intervals", - String.format("%s:%d-%d", contig, intervalStart, intervalEnd)); - - List> intervalArgs = new ArrayList>(1); - intervalArgs.add(new IntervalBinding(gatkIntervalFile.getAbsolutePath())); - - IntervalUtils.loadIntervals(intervalArgs, argCollection.intervalArguments.intervalSetRule, 
argCollection.intervalArguments.intervalMerging, argCollection.intervalArguments.intervalPadding, genomeLocParser); - } - - private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception { - File tempFile = BaseTest.createTempFile(tempFilePrefix, tempFileExtension); - FileUtils.writeLines(tempFile, Arrays.asList(lines)); - return tempFile; - } - - @DataProvider(name = "sortAndMergeIntervals") - public Object[][] getSortAndMergeIntervals() { - return new Object[][] { - new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1:3", "chr1:2"), getLocs("chr1:1", "chr1:2", "chr1:3") }, - new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1:3", "chr1:2"), getLocs("chr1:1-3") }, - new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1:3", "chr2:2"), getLocs("chr1:1", "chr1:3", "chr2:2") }, - new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1:3", "chr2:2"), getLocs("chr1:1", "chr1:3", "chr2:2") }, - new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1"), getLocs("chr1") }, - new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1"), getLocs("chr1") } - }; - } - - @Test(dataProvider = "sortAndMergeIntervals") - public void testSortAndMergeIntervals(IntervalMergingRule merge, List unsorted, List expected) { - List sorted = IntervalUtils.sortAndMergeIntervals(hg18GenomeLocParser, unsorted, merge).toList(); - Assert.assertEquals(sorted, expected); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/io/IOUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/io/IOUtilsUnitTest.java deleted file mode 100644 index 13a2e8a1d..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/io/IOUtilsUnitTest.java +++ /dev/null @@ -1,326 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any 
person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.io; - -import org.apache.commons.io.FileUtils; -import org.broadinstitute.gatk.utils.BaseTest; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.Random; - -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -public class IOUtilsUnitTest extends BaseTest { - @Test - public void testGoodTempDir() { - IOUtils.checkTempDir(new File("/tmp/queue")); - } - - @Test(expectedExceptions=UserException.BadTmpDir.class) - public void testBadTempDir() { - IOUtils.checkTempDir(new File("/tmp")); - } - - @Test - public void testAbsoluteSubDir() { - File subDir = IOUtils.absolute(new File("."), new File("/path/to/file")); - Assert.assertEquals(subDir, new File("/path/to/file")); - - subDir = IOUtils.absolute(new File("/different/path"), new File("/path/to/file")); - Assert.assertEquals(subDir, new File("/path/to/file")); - - subDir = IOUtils.absolute(new File("/different/path"), new File(".")); - Assert.assertEquals(subDir, new File("/different/path")); - } - - @Test - public void testRelativeSubDir() throws IOException { - File subDir = IOUtils.absolute(new File("."), new File("path/to/file")); - Assert.assertEquals(subDir.getCanonicalFile(), new File("path/to/file").getCanonicalFile()); - - subDir = IOUtils.absolute(new File("/different/path"), new File("path/to/file")); - Assert.assertEquals(subDir, new File("/different/path/path/to/file")); - } - - @Test - public void testDottedSubDir() throws IOException { - File subDir = IOUtils.absolute(new File("."), new File("path/../to/file")); - Assert.assertEquals(subDir.getCanonicalFile(), new 
File("path/../to/./file").getCanonicalFile()); - - subDir = IOUtils.absolute(new File("."), new File("/path/../to/file")); - Assert.assertEquals(subDir, new File("/path/../to/file")); - - subDir = IOUtils.absolute(new File("/different/../path"), new File("path/to/file")); - Assert.assertEquals(subDir, new File("/different/../path/path/to/file")); - - subDir = IOUtils.absolute(new File("/different/./path"), new File("/path/../to/file")); - Assert.assertEquals(subDir, new File("/path/../to/file")); - } - - @Test - public void testTempDir() { - File tempDir = IOUtils.tempDir("Q-Unit-Test", "", new File("queueTempDirToDelete")); - Assert.assertTrue(tempDir.exists()); - Assert.assertFalse(tempDir.isFile()); - Assert.assertTrue(tempDir.isDirectory()); - boolean deleted = IOUtils.tryDelete(tempDir); - Assert.assertTrue(deleted); - Assert.assertFalse(tempDir.exists()); - } - - @Test - public void testDirLevel() { - File dir = IOUtils.dirLevel(new File("/path/to/directory"), 1); - Assert.assertEquals(dir, new File("/path")); - - dir = IOUtils.dirLevel(new File("/path/to/directory"), 2); - Assert.assertEquals(dir, new File("/path/to")); - - dir = IOUtils.dirLevel(new File("/path/to/directory"), 3); - Assert.assertEquals(dir, new File("/path/to/directory")); - - dir = IOUtils.dirLevel(new File("/path/to/directory"), 4); - Assert.assertEquals(dir, new File("/path/to/directory")); - } - - @Test - public void testAbsolute() { - File dir = IOUtils.absolute(new File("/path/./to/./directory/.")); - Assert.assertEquals(dir, new File("/path/to/directory")); - - dir = IOUtils.absolute(new File("/")); - Assert.assertEquals(dir, new File("/")); - - dir = IOUtils.absolute(new File("/.")); - Assert.assertEquals(dir, new File("/")); - - dir = IOUtils.absolute(new File("/././.")); - Assert.assertEquals(dir, new File("/")); - - dir = IOUtils.absolute(new File("/./directory/.")); - Assert.assertEquals(dir, new File("/directory")); - - dir = IOUtils.absolute(new File("/./directory/./")); - 
Assert.assertEquals(dir, new File("/directory")); - - dir = IOUtils.absolute(new File("/./directory./")); - Assert.assertEquals(dir, new File("/directory.")); - - dir = IOUtils.absolute(new File("/./.directory/")); - Assert.assertEquals(dir, new File("/.directory")); - } - - @Test - public void testTail() throws IOException { - List lines = Arrays.asList( - "chr18_random 4262 3154410390 50 51", - "chr19_random 301858 3154414752 50 51", - "chr21_random 1679693 3154722662 50 51", - "chr22_random 257318 3156435963 50 51", - "chrX_random 1719168 3156698441 50 51"); - List tail = IOUtils.tail(new File(BaseTest.hg18Reference + ".fai"), 5); - Assert.assertEquals(tail.size(), 5); - for (int i = 0; i < 5; i++) - Assert.assertEquals(tail.get(i), lines.get(i)); - } - - @Test - public void testWriteSystemFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - IOUtils.writeResource(new Resource("testProperties.properties", null), temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testWriteSystemTempFile() throws IOException { - File temp = IOUtils.writeTempResource(new Resource("testProperties.properties", null)); - try { - Assert.assertTrue(temp.getName().startsWith("testProperties"), "File does not start with 'testProperties.': " + temp); - Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testMissingSystemFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - IOUtils.writeResource(new Resource("MissingStingText.properties", null), temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testWriteRelativeFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - IOUtils.writeResource(new 
Resource("/testProperties.properties", IOUtils.class), temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testWriteRelativeTempFile() throws IOException { - File temp = IOUtils.writeTempResource(new Resource("/testProperties.properties", IOUtils.class)); - try { - Assert.assertTrue(temp.getName().startsWith("testProperties"), "File does not start with 'testProperties.': " + temp); - Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testMissingRelativeFile() throws IOException { - File temp = createTempFile("temp.", ".properties"); - try { - // Looking for /org/broadinstitute/gatk/utils/file/GATKText.properties - IOUtils.writeResource(new Resource("GATKText.properties", IOUtils.class), temp); - } finally { - FileUtils.deleteQuietly(temp); - } - } - - @Test - public void testResourceProperties() { - Resource resource = new Resource("foo", Resource.class); - Assert.assertEquals(resource.getPath(), "foo"); - Assert.assertEquals(resource.getRelativeClass(), Resource.class); - } - - @Test - public void testIsSpecialFile() { - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/null"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/full"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stdout"))); - Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stderr"))); - Assert.assertFalse(IOUtils.isSpecialFile(null)); - Assert.assertFalse(IOUtils.isSpecialFile(new File("/home/user/my.file"))); - Assert.assertFalse(IOUtils.isSpecialFile(new File("/devfake/null"))); - } - - @DataProvider( name = "ByteArrayIOTestData") - public Object[][] byteArrayIOTestDataProvider() { - return new Object[][] { - // file size, read buffer size - { 0, 4096 }, - { 1, 4096 }, - { 2000, 
4096 }, - { 4095, 4096 }, - { 4096, 4096 }, - { 4097, 4096 }, - { 6000, 4096 }, - { 8191, 4096 }, - { 8192, 4096 }, - { 8193, 4096 }, - { 10000, 4096 } - }; - } - - @Test( dataProvider = "ByteArrayIOTestData" ) - public void testWriteThenReadFileIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { - File tempFile = createTempFile(String.format("testWriteThenReadFileIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); - - byte[] dataWritten = getDeterministicRandomData(fileSize); - IOUtils.writeByteArrayToFile(dataWritten, tempFile); - byte[] dataRead = IOUtils.readFileIntoByteArray(tempFile, readBufferSize); - - Assert.assertEquals(dataRead.length, dataWritten.length); - Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); - } - - @Test( dataProvider = "ByteArrayIOTestData" ) - public void testWriteThenReadStreamIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { - File tempFile = createTempFile(String.format("testWriteThenReadStreamIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); - - byte[] dataWritten = getDeterministicRandomData(fileSize); - IOUtils.writeByteArrayToStream(dataWritten, new FileOutputStream(tempFile)); - byte[] dataRead = IOUtils.readStreamIntoByteArray(new FileInputStream(tempFile), readBufferSize); - - Assert.assertEquals(dataRead.length, dataWritten.length); - Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); - } - - @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) - public void testReadNonExistentFileIntoByteArray() { - File nonExistentFile = new File("djfhsdkjghdfk"); - Assert.assertFalse(nonExistentFile.exists()); - - IOUtils.readFileIntoByteArray(nonExistentFile); - } - - @Test( expectedExceptions = ReviewedGATKException.class ) - public void testReadNullStreamIntoByteArray() { - IOUtils.readStreamIntoByteArray(null); - } - - @Test( expectedExceptions = ReviewedGATKException.class ) - public void testReadStreamIntoByteArrayInvalidBufferSize() throws Exception { 
- IOUtils.readStreamIntoByteArray(new FileInputStream(createTempFile("testReadStreamIntoByteArrayInvalidBufferSize", "tmp")), - -1); - } - - @Test( expectedExceptions = UserException.CouldNotCreateOutputFile.class ) - public void testWriteByteArrayToUncreatableFile() { - IOUtils.writeByteArrayToFile(new byte[]{0}, new File("/dev/foo/bar")); - } - - @Test( expectedExceptions = ReviewedGATKException.class ) - public void testWriteNullByteArrayToFile() { - IOUtils.writeByteArrayToFile(null, createTempFile("testWriteNullByteArrayToFile", "tmp")); - } - - @Test( expectedExceptions = ReviewedGATKException.class ) - public void testWriteByteArrayToNullStream() { - IOUtils.writeByteArrayToStream(new byte[]{0}, null); - } - - private byte[] getDeterministicRandomData ( int size ) { - GenomeAnalysisEngine.resetRandomGenerator(); - Random rand = GenomeAnalysisEngine.getRandomGenerator(); - - byte[] randomData = new byte[size]; - rand.nextBytes(randomData); - - return randomData; - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorBenchmark.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorBenchmark.java deleted file mode 100644 index 1f02a68e5..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorBenchmark.java +++ /dev/null @@ -1,142 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall 
be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.locusiterator; - -import com.google.caliper.Param; -import com.google.caliper.SimpleBenchmark; -import htsjdk.samtools.SAMFileHeader; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; - -import java.util.LinkedList; -import java.util.List; - -/** - * Caliper microbenchmark of fragment pileup - */ -public class LocusIteratorBenchmark extends SimpleBenchmark { - protected SAMFileHeader header; - protected GenomeLocParser genomeLocParser; - - List reads = new LinkedList(); - final int readLength = 101; - final int nReads = 10000; - final int locus = 1; - - @Param({"101M", "50M10I40M", "50M10D40M"}) - String cigar; // set automatically by framework - - @Override protected void setUp() { - header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - - for ( int j = 0; j < nReads; j++ ) { - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - final byte[] quals = new byte[readLength]; - for ( int i = 0; i < 
readLength; i++ ) - quals[i] = (byte)(i % QualityUtils.MAX_SAM_QUAL_SCORE); - read.setBaseQualities(quals); - read.setCigarString(cigar); - reads.add(read); - } - } - -// public void timeOriginalLIBS(int rep) { -// for ( int i = 0; i < rep; i++ ) { -// final org.broadinstitute.gatk.utils.locusiterator.old.LocusIteratorByState libs = -// new org.broadinstitute.gatk.utils.locusiterator.old.LocusIteratorByState( -// new LocusIteratorByStateBaseTest.FakeCloseableIterator(reads.iterator()), -// LocusIteratorByStateBaseTest.createTestReadProperties(), -// genomeLocParser, -// LocusIteratorByState.sampleListForSAMWithoutReadGroups()); -// -// while ( libs.hasNext() ) { -// AlignmentContext context = libs.next(); -// } -// } -// } -// -// public void timeLegacyLIBS(int rep) { -// for ( int i = 0; i < rep; i++ ) { -// final org.broadinstitute.gatk.utils.locusiterator.legacy.LegacyLocusIteratorByState libs = -// new org.broadinstitute.gatk.utils.locusiterator.legacy.LegacyLocusIteratorByState( -// new LocusIteratorByStateBaseTest.FakeCloseableIterator(reads.iterator()), -// LocusIteratorByStateBaseTest.createTestReadProperties(), -// genomeLocParser, -// LocusIteratorByState.sampleListForSAMWithoutReadGroups()); -// -// while ( libs.hasNext() ) { -// AlignmentContext context = libs.next(); -// } -// } -// } - - public void timeNewLIBS(int rep) { - for ( int i = 0; i < rep; i++ ) { - final org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState libs = - new org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState( - new LocusIteratorByStateBaseTest.FakeCloseableIterator(reads.iterator()), - LocusIteratorByStateBaseTest.createTestReadProperties(), - genomeLocParser, - LocusIteratorByState.sampleListForSAMWithoutReadGroups()); - - while ( libs.hasNext() ) { - AlignmentContext context = libs.next(); - } - } - } - -// public void timeOriginalLIBSStateMachine(int rep) { -// for ( int i = 0; i < rep; i++ ) { -// for ( final SAMRecord read : reads ) { -// final 
SAMRecordAlignmentState alignmentStateMachine = new SAMRecordAlignmentState(read); -// while ( alignmentStateMachine.stepForwardOnGenome() != null ) { -// alignmentStateMachine.getGenomeOffset(); -// } -// } -// } -// } - - public void timeAlignmentStateMachine(int rep) { - for ( int i = 0; i < rep; i++ ) { - for ( final GATKSAMRecord read : reads ) { - final AlignmentStateMachine alignmentStateMachine = new AlignmentStateMachine(read); - while ( alignmentStateMachine.stepForwardOnGenome() != null ) { - ; - } - } - } - } - - public static void main(String[] args) { - com.google.caliper.Runner.main(LocusIteratorBenchmark.class, args); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateBaseTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateBaseTest.java deleted file mode 100644 index 286c7120c..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateBaseTest.java +++ /dev/null @@ -1,252 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.locusiterator; - -import htsjdk.samtools.*; -import htsjdk.samtools.util.CloseableIterator; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.arguments.ValidationExclusion; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.engine.filters.ReadFilter; -import org.broadinstitute.gatk.engine.iterators.ReadTransformer; -import org.broadinstitute.gatk.utils.GenomeLocParser; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; - -import java.util.*; - -/** - * testing of the new (non-legacy) version of LocusIteratorByState - */ -public class LocusIteratorByStateBaseTest extends BaseTest { - protected static SAMFileHeader header; - protected GenomeLocParser genomeLocParser; - - @BeforeClass - public void beforeClass() { - header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - } - - protected LocusIteratorByState makeLTBS(List reads, - ReadProperties readAttributes) { - return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), - readAttributes, - genomeLocParser, - LocusIteratorByState.sampleListForSAMWithoutReadGroups()); - } - - public static ReadProperties createTestReadProperties() { - return 
createTestReadProperties(null, false); - } - - public static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod, final boolean keepReads ) { - return new ReadProperties( - Collections.emptyList(), - new SAMFileHeader(), - SAMFileHeader.SortOrder.coordinate, - false, - ValidationStringency.STRICT, - downsamplingMethod, - new ValidationExclusion(), - Collections.emptyList(), - Collections.emptyList(), - true, - (byte) -1, - keepReads); - } - - public static class FakeCloseableIterator implements CloseableIterator { - Iterator iterator; - - public FakeCloseableIterator(Iterator it) { - iterator = it; - } - - @Override - public void close() {} - - @Override - public boolean hasNext() { - return iterator.hasNext(); - } - - @Override - public T next() { - return iterator.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException("Don't remove!"); - } - } - - protected static class LIBSTest { - public static final int locus = 44367788; - final String cigarString; - final int readLength; - final private List elements; - - public LIBSTest(final String cigarString) { - final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); - this.cigarString = cigarString; - this.elements = cigar.getCigarElements(); - this.readLength = cigar.getReadLength(); - } - - @Override - public String toString() { - return "LIBSTest{" + - "cigar='" + cigarString + '\'' + - ", readLength=" + readLength + - '}'; - } - - public List getElements() { - return elements; - } - - public GATKSAMRecord makeRead() { - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - final byte[] quals = new byte[readLength]; - for ( int i = 0; i < readLength; i++ ) - quals[i] = (byte)(i % QualityUtils.MAX_SAM_QUAL_SCORE); - read.setBaseQualities(quals); - read.setCigarString(cigarString); - return read; - } - } - - private boolean 
isIndel(final CigarElement ce) { - return ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I; - } - - private boolean startsWithDeletion(final List elements) { - for ( final CigarElement element : elements ) { - switch ( element.getOperator() ) { - case M: - case I: - case EQ: - case X: - return false; - case D: - return true; - default: - // keep looking - } - } - - return false; - } - - private LIBSTest makePermutationTest(final List elements) { - CigarElement last = null; - boolean hasMatch = false; - - // starts with D => bad - if ( startsWithDeletion(elements) ) - return null; - - // ends with D => bad - if ( elements.get(elements.size()-1).getOperator() == CigarOperator.D ) - return null; - - // make sure it's valid - String cigar = ""; - int len = 0; - for ( final CigarElement ce : elements ) { - if ( ce.getOperator() == CigarOperator.N ) - return null; // TODO -- don't support N - - // abort on a bad cigar - if ( last != null ) { - if ( ce.getOperator() == last.getOperator() ) - return null; - if ( isIndel(ce) && isIndel(last) ) - return null; - } - - cigar += ce.getLength() + ce.getOperator().toString(); - len += ce.getLength(); - last = ce; - hasMatch = hasMatch || ce.getOperator() == CigarOperator.M; - } - - if ( ! hasMatch && elements.size() == 1 && - ! 
(last.getOperator() == CigarOperator.I || last.getOperator() == CigarOperator.S)) - return null; - - return new LIBSTest(cigar); - } - - @DataProvider(name = "LIBSTest") - public Object[][] createLIBSTests(final List cigarLengths, final List combinations) { - final List tests = new LinkedList(); - - final List allOps = Arrays.asList(CigarOperator.values()); - - final List singleCigars = new LinkedList(); - for ( final int len : cigarLengths ) - for ( final CigarOperator op : allOps ) - singleCigars.add(new CigarElement(len, op)); - - for ( final int complexity : combinations ) { - for ( final List elements : Utils.makePermutations(singleCigars, complexity, true) ) { - final LIBSTest test = makePermutationTest(elements); - if ( test != null ) tests.add(new Object[]{test}); - } - } - - return tests.toArray(new Object[][]{}); - } - - /** - * Work around inadequate tests that aren't worth fixing. - * - * Look at the CIGAR 2M2P2D2P2M. Both M states border a deletion, separated by P (padding elements). So - * the right answer for deletions here is true for isBeforeDeletion() and isAfterDeletion() for the first - * and second M. But the LIBS_position doesn't say so. 
- * - * @param elements - * @return - */ - protected static boolean hasNeighboringPaddedOps(final List elements, final int elementI) { - return (elementI - 1 >= 0 && isPadding(elements.get(elementI-1))) || - (elementI + 1 < elements.size() && isPadding(elements.get(elementI+1))); - } - - private static boolean isPadding(final CigarElement elt) { - return elt.getOperator() == CigarOperator.P || elt.getOperator() == CigarOperator.H || elt.getOperator() == CigarOperator.S; - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateUnitTest.java deleted file mode 100644 index 08cbecaf8..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateUnitTest.java +++ /dev/null @@ -1,753 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.locusiterator; - -import htsjdk.samtools.CigarOperator; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMReadGroupRecord; -import org.broadinstitute.gatk.engine.ReadProperties; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.downsampling.DownsampleType; -import org.broadinstitute.gatk.engine.downsampling.DownsamplingMethod; -import org.broadinstitute.gatk.utils.NGSPlatform; -import org.broadinstitute.gatk.utils.QualityUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.pileup.PileupElement; -import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; -import org.broadinstitute.gatk.utils.sam.ArtificialBAMBuilder; -import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.*; - -/** - * testing of the new (non-legacy) version of LocusIteratorByState - */ -public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { - private static final boolean DEBUG = false; - protected LocusIteratorByState li; - - @Test(enabled = !DEBUG) - public void testUnmappedAndAllIReadsPassThrough() { - final int readLength = 10; - GATKSAMRecord mapped1 = ArtificialSAMUtils.createArtificialRead(header,"mapped1",0,1,readLength); - GATKSAMRecord mapped2 = ArtificialSAMUtils.createArtificialRead(header,"mapped2",0,1,readLength); - GATKSAMRecord unmapped = 
ArtificialSAMUtils.createArtificialRead(header,"unmapped",0,1,readLength); - GATKSAMRecord allI = ArtificialSAMUtils.createArtificialRead(header,"allI",0,1,readLength); - - unmapped.setReadUnmappedFlag(true); - unmapped.setCigarString("*"); - allI.setCigarString(readLength + "I"); - - List reads = Arrays.asList(mapped1, unmapped, allI, mapped2); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads,createTestReadProperties(DownsamplingMethod.NONE, true)); - - Assert.assertTrue(li.hasNext()); - AlignmentContext context = li.next(); - ReadBackedPileup pileup = context.getBasePileup(); - Assert.assertEquals(pileup.depthOfCoverage(), 2, "Should see only 2 reads in pileup, even with unmapped and all I reads"); - - final List rawReads = li.transferReadsFromAllPreviousPileups(); - Assert.assertEquals(rawReads, reads, "Input and transferred read lists should be the same, and include the unmapped and all I reads"); - } - - @Test(enabled = true && ! DEBUG) - public void testXandEQOperators() { - final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; - final byte[] bases2 = new byte[] {'A','A','A','C','A','A','A','A','A','C'}; - - // create a test version of the Reads object - ReadProperties readAttributes = createTestReadProperties(); - - GATKSAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10); - r1.setReadBases(bases1); - r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); - r1.setCigarString("10M"); - - GATKSAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10); - r2.setReadBases(bases2); - r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); - r2.setCigarString("3=1X5=1X"); - - GATKSAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10); - r3.setReadBases(bases2); - r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); - r3.setCigarString("3=1X5M1X"); - - GATKSAMRecord r4 = 
ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10); - r4.setReadBases(bases2); - r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); - r4.setCigarString("10M"); - - List reads = Arrays.asList(r1, r2, r3, r4); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads,readAttributes); - - while (li.hasNext()) { - AlignmentContext context = li.next(); - ReadBackedPileup pileup = context.getBasePileup(); - Assert.assertEquals(pileup.depthOfCoverage(), 4); - } - } - - @Test(enabled = true && ! DEBUG) - public void testIndelsInRegularPileup() { - final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; - final byte[] indelBases = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'}; - - // create a test version of the Reads object - ReadProperties readAttributes = createTestReadProperties(); - - GATKSAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); - before.setReadBases(bases); - before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); - before.setCigarString("10M"); - - GATKSAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10); - during.setReadBases(indelBases); - during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); - during.setCigarString("4M2I6M"); - - GATKSAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10); - after.setReadBases(bases); - after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); - after.setCigarString("10M"); - - List reads = Arrays.asList(before, during, after); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads,readAttributes); - - boolean foundIndel = false; - while (li.hasNext()) { - AlignmentContext context = li.next(); - ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10); - for (PileupElement p : pileup) { - if (p.isBeforeInsertion()) { - foundIndel = true; - 
Assert.assertEquals(p.getLengthOfImmediatelyFollowingIndel(), 2, "Wrong event length"); - Assert.assertEquals(p.getBasesOfImmediatelyFollowingInsertion(), "CT", "Inserted bases are incorrect"); - break; - } - } - - } - - Assert.assertTrue(foundIndel,"Indel in pileup not found"); - } - - @Test(enabled = false && ! DEBUG) - public void testWholeIndelReadInIsolation() { - final int firstLocus = 44367789; - - // create a test version of the Reads object - ReadProperties readAttributes = createTestReadProperties(); - - GATKSAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76); - indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); - indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76)); - indelOnlyRead.setCigarString("76I"); - - List reads = Arrays.asList(indelOnlyRead); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads, readAttributes); - - // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read - // and considers it to be an indel-containing read. - Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled"); - AlignmentContext alignmentContext = li.next(); - Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location."); - ReadBackedPileup basePileup = alignmentContext.getBasePileup(); - Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); - Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect"); - } - - /** - * Test to make sure that reads supporting only an indel (example cigar string: 76I) do - * not negatively influence the ordering of the pileup. - */ - @Test(enabled = true && ! 
DEBUG) - public void testWholeIndelRead() { - final int firstLocus = 44367788, secondLocus = firstLocus + 1; - - GATKSAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76); - leadingRead.setReadBases(Utils.dupBytes((byte)'A',76)); - leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); - leadingRead.setCigarString("1M75I"); - - GATKSAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); - indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76)); - indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); - indelOnlyRead.setCigarString("76I"); - - GATKSAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76); - fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76)); - fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76)); - fullMatchAfterIndel.setCigarString("75I1M"); - - List reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads, createTestReadProperties()); - int currentLocus = firstLocus; - int numAlignmentContextsFound = 0; - - while(li.hasNext()) { - AlignmentContext alignmentContext = li.next(); - Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect"); - - if(currentLocus == firstLocus) { - List readsAtLocus = alignmentContext.getBasePileup().getReads(); - Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus); - Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus); - } - else if(currentLocus == secondLocus) { - List readsAtLocus = alignmentContext.getBasePileup().getReads(); - Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus); - 
Assert.assertSame(readsAtLocus.get(0),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus); - } - - currentLocus++; - numAlignmentContextsFound++; - } - - Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts"); - } - - /** - * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly - */ - @Test(enabled = false && ! DEBUG) - public void testWholeIndelReadRepresentedTest() { - final int firstLocus = 44367788, secondLocus = firstLocus + 1; - - GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1); - read1.setReadBases(Utils.dupBytes((byte) 'A', 1)); - read1.setBaseQualities(Utils.dupBytes((byte) '@', 1)); - read1.setCigarString("1I"); - - List reads = Arrays.asList(read1); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads, createTestReadProperties()); - - while(li.hasNext()) { - AlignmentContext alignmentContext = li.next(); - ReadBackedPileup p = alignmentContext.getBasePileup(); - Assert.assertTrue(p.getNumberOfElements() == 1); - // TODO -- fix tests -// PileupElement pe = p.iterator().next(); -// Assert.assertTrue(pe.isBeforeInsertion()); -// Assert.assertFalse(pe.isAfterInsertion()); -// Assert.assertEquals(pe.getBasesOfImmediatelyFollowingInsertion(), "A"); - } - - GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10); - read2.setReadBases(Utils.dupBytes((byte) 'A', 10)); - read2.setBaseQualities(Utils.dupBytes((byte) '@', 10)); - read2.setCigarString("10I"); - - reads = Arrays.asList(read2); - - // create the iterator by state with the fake reads and fake records - li = makeLTBS(reads, createTestReadProperties()); - - while(li.hasNext()) { - AlignmentContext alignmentContext = li.next(); - ReadBackedPileup p = alignmentContext.getBasePileup(); - Assert.assertTrue(p.getNumberOfElements() == 1); - // TODO -- fix 
tests -// PileupElement pe = p.iterator().next(); -// Assert.assertTrue(pe.isBeforeInsertion()); -// Assert.assertFalse(pe.isAfterInsertion()); -// Assert.assertEquals(pe.getBasesOfImmediatelyFollowingInsertion(), "AAAAAAAAAA"); - } - } - - - ///////////////////////////////////////////// - // get event length and bases calculations // - ///////////////////////////////////////////// - - @DataProvider(name = "IndelLengthAndBasesTest") - public Object[][] makeIndelLengthAndBasesTest() { - final String EVENT_BASES = "ACGTACGTACGT"; - final List tests = new LinkedList(); - - for ( int eventSize = 1; eventSize < 10; eventSize++ ) { - for ( final CigarOperator indel : Arrays.asList(CigarOperator.D, CigarOperator.I) ) { - final String cigar = String.format("2M%d%s1M", eventSize, indel.toString()); - final String eventBases = indel == CigarOperator.D ? "" : EVENT_BASES.substring(0, eventSize); - final int readLength = 3 + eventBases.length(); - - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, readLength); - read.setReadBases(("TT" + eventBases + "A").getBytes()); - final byte[] quals = new byte[readLength]; - for ( int i = 0; i < readLength; i++ ) - quals[i] = (byte)(i % QualityUtils.MAX_SAM_QUAL_SCORE); - read.setBaseQualities(quals); - read.setCigarString(cigar); - - tests.add(new Object[]{read, indel, eventSize, eventBases.equals("") ? null : eventBases}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true && ! 
DEBUG, dataProvider = "IndelLengthAndBasesTest") - public void testIndelLengthAndBasesTest(GATKSAMRecord read, final CigarOperator op, final int eventSize, final String eventBases) { - // create the iterator by state with the fake reads and fake records - li = makeLTBS(Arrays.asList((GATKSAMRecord)read), createTestReadProperties()); - - Assert.assertTrue(li.hasNext()); - - final PileupElement firstMatch = getFirstPileupElement(li.next()); - - Assert.assertEquals(firstMatch.getLengthOfImmediatelyFollowingIndel(), 0, "Length != 0 for site not adjacent to indel"); - Assert.assertEquals(firstMatch.getBasesOfImmediatelyFollowingInsertion(), null, "Getbases of following event should be null at non-adajenct event"); - - Assert.assertTrue(li.hasNext()); - - final PileupElement pe = getFirstPileupElement(li.next()); - - if ( op == CigarOperator.D ) - Assert.assertTrue(pe.isBeforeDeletionStart()); - else - Assert.assertTrue(pe.isBeforeInsertion()); - - Assert.assertEquals(pe.getLengthOfImmediatelyFollowingIndel(), eventSize, "Length of event failed"); - Assert.assertEquals(pe.getBasesOfImmediatelyFollowingInsertion(), eventBases, "Getbases of following event failed"); - } - - private PileupElement getFirstPileupElement(final AlignmentContext context) { - final ReadBackedPileup p = context.getBasePileup(); - Assert.assertEquals(p.getNumberOfElements(), 1); - return p.iterator().next(); - } - - //////////////////////////////////////////// - // comprehensive LIBS/PileupElement tests // - //////////////////////////////////////////// - - @DataProvider(name = "MyLIBSTest") - public Object[][] makeLIBSTest() { - final List tests = new LinkedList(); - -// tests.add(new Object[]{new LIBSTest("2=2D2=2X", 1)}); -// return tests.toArray(new Object[][]{}); - - return createLIBSTests( - Arrays.asList(1, 2), - Arrays.asList(1, 2, 3, 4)); - -// return createLIBSTests( -// Arrays.asList(2), -// Arrays.asList(3)); - } - - @Test(enabled = ! 
DEBUG, dataProvider = "MyLIBSTest") - public void testLIBS(LIBSTest params) { - // create the iterator by state with the fake reads and fake records - final GATKSAMRecord read = params.makeRead(); - li = makeLTBS(Arrays.asList((GATKSAMRecord)read), createTestReadProperties()); - final LIBS_position tester = new LIBS_position(read); - - int bpVisited = 0; - int lastOffset = 0; - while ( li.hasNext() ) { - bpVisited++; - - AlignmentContext alignmentContext = li.next(); - ReadBackedPileup p = alignmentContext.getBasePileup(); - Assert.assertEquals(p.getNumberOfElements(), 1); - PileupElement pe = p.iterator().next(); - - Assert.assertEquals(p.getNumberOfDeletions(), pe.isDeletion() ? 1 : 0, "wrong number of deletions in the pileup"); - Assert.assertEquals(p.getNumberOfMappingQualityZeroReads(), pe.getRead().getMappingQuality() == 0 ? 1 : 0, "wront number of mapq reads in the pileup"); - - tester.stepForwardOnGenome(); - - if ( ! hasNeighboringPaddedOps(params.getElements(), pe.getCurrentCigarOffset()) ) { - Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart, "before deletion start failure"); - Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd, "after deletion end failure"); - } - - Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion, "before insertion failure"); - Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion, "after insertion failure"); - Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip, "next to soft clip failure"); - - Assert.assertTrue(pe.getOffset() >= lastOffset, "Somehow read offsets are decreasing: lastOffset " + lastOffset + " current " + pe.getOffset()); - Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset(), "Read offsets are wrong at " + bpVisited); - - Assert.assertEquals(pe.getCurrentCigarElement(), read.getCigar().getCigarElement(tester.currentOperatorIndex), "CigarElement index failure"); - 
Assert.assertEquals(pe.getOffsetInCurrentCigar(), tester.getCurrentPositionOnOperatorBase0(), "CigarElement index failure"); - - Assert.assertEquals(read.getCigar().getCigarElement(pe.getCurrentCigarOffset()), pe.getCurrentCigarElement(), "Current cigar element isn't what we'd get from the read itself"); - - Assert.assertTrue(pe.getOffsetInCurrentCigar() >= 0, "Offset into current cigar too small"); - Assert.assertTrue(pe.getOffsetInCurrentCigar() < pe.getCurrentCigarElement().getLength(), "Offset into current cigar too big"); - - Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset(), "Read offset failure"); - lastOffset = pe.getOffset(); - } - - final int expectedBpToVisit = read.getAlignmentEnd() - read.getAlignmentStart() + 1; - Assert.assertEquals(bpVisited, expectedBpToVisit, "Didn't visit the expected number of bp"); - } - - // ------------------------------------------------------------ - // - // Tests for keeping reads - // - // ------------------------------------------------------------ - - @DataProvider(name = "LIBS_ComplexPileupTests") - public Object[][] makeLIBS_ComplexPileupTests() { - final List tests = new LinkedList(); - - for ( final int downsampleTo : Arrays.asList(-1, 1, 2, 5, 10, 30)) { - for ( final int nReadsPerLocus : Arrays.asList(1, 10, 60) ) { - for ( final int nLoci : Arrays.asList(1, 10, 25) ) { - for ( final int nSamples : Arrays.asList(1, 2, 10) ) { - for ( final boolean keepReads : Arrays.asList(true, false) ) { - for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true, false) ) { -// for ( final int downsampleTo : Arrays.asList(1)) { -// for ( final int nReadsPerLocus : Arrays.asList(1) ) { -// for ( final int nLoci : Arrays.asList(1) ) { -// for ( final int nSamples : Arrays.asList(1) ) { -// for ( final boolean keepReads : Arrays.asList(true) ) { -// for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true) ) { - tests.add(new Object[]{nReadsPerLocus, nLoci, nSamples, - keepReads, 
grabReadsAfterEachCycle, - downsampleTo}); - } - } - } - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true && ! DEBUG, dataProvider = "LIBS_ComplexPileupTests") - public void testLIBS_ComplexPileupTests(final int nReadsPerLocus, - final int nLoci, - final int nSamples, - final boolean keepReads, - final boolean grabReadsAfterEachCycle, - final int downsampleTo) { - //logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample)); - final int readLength = 10; - - final boolean downsample = downsampleTo != -1; - final DownsamplingMethod downsampler = downsample - ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null) - : new DownsamplingMethod(DownsampleType.NONE, null, null); - - final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(header.getSequenceDictionary(), nReadsPerLocus, nLoci); - bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1); - - final List reads = bamBuilder.makeReads(); - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), - createTestReadProperties(downsampler, keepReads), - genomeLocParser, - bamBuilder.getSamples()); - - final Set seenSoFar = new HashSet(); - final Set keptReads = new HashSet(); - int bpVisited = 0; - while ( li.hasNext() ) { - bpVisited++; - final AlignmentContext alignmentContext = li.next(); - final ReadBackedPileup p = alignmentContext.getBasePileup(); - - AssertWellOrderedPileup(p); - - if ( downsample ) { - // just not a safe test - //Assert.assertTrue(p.getNumberOfElements() <= maxDownsampledCoverage * nSamples, "Too many reads at locus after downsampling"); - } else { - final int minPileupSize = nReadsPerLocus * nSamples; - Assert.assertTrue(p.getNumberOfElements() >= minPileupSize); - } - - // the number of reads starting here - int nReadsStartingHere = 0; - for ( final GATKSAMRecord read : p.getReads() ) - if ( 
read.getAlignmentStart() == alignmentContext.getPosition() ) - nReadsStartingHere++; - - // we can have no more than maxDownsampledCoverage per sample - final int maxCoveragePerLocus = downsample ? downsampleTo : nReadsPerLocus; - Assert.assertTrue(nReadsStartingHere <= maxCoveragePerLocus * nSamples); - - seenSoFar.addAll(p.getReads()); - if ( keepReads && grabReadsAfterEachCycle ) { - final List locusReads = li.transferReadsFromAllPreviousPileups(); - - - if ( downsample ) { - // with downsampling we might have some reads here that were downsampled away - // in the pileup. We want to ensure that no more than the max coverage per sample is added - Assert.assertTrue(locusReads.size() >= nReadsStartingHere); - Assert.assertTrue(locusReads.size() <= maxCoveragePerLocus * nSamples); - } else { - Assert.assertEquals(locusReads.size(), nReadsStartingHere); - } - keptReads.addAll(locusReads); - - // check that all reads we've seen so far are in our keptReads - for ( final GATKSAMRecord read : seenSoFar ) { - Assert.assertTrue(keptReads.contains(read), "A read that appeared in a pileup wasn't found in the kept reads: " + read); - } - } - - if ( ! keepReads ) - Assert.assertTrue(li.getReadsFromAllPreviousPileups().isEmpty(), "Not keeping reads but the underlying list of reads isn't empty"); - } - - if ( keepReads && ! grabReadsAfterEachCycle ) - keptReads.addAll(li.transferReadsFromAllPreviousPileups()); - - if ( ! downsample ) { // downsampling may drop loci - final int expectedBpToVisit = nLoci + readLength - 1; - Assert.assertEquals(bpVisited, expectedBpToVisit, "Didn't visit the expected number of bp"); - } - - if ( keepReads ) { - // check we have the right number of reads - final int totalReads = nLoci * nReadsPerLocus * nSamples; - if ( ! 
downsample ) { // downsampling may drop reads - Assert.assertEquals(keptReads.size(), totalReads, "LIBS didn't keep the right number of reads during the traversal"); - - // check that the order of reads is the same as in our read list - for ( int i = 0; i < reads.size(); i++ ) { - final GATKSAMRecord inputRead = reads.get(i); - final GATKSAMRecord keptRead = reads.get(i); - Assert.assertSame(keptRead, inputRead, "Input reads and kept reads differ at position " + i); - } - } else { - Assert.assertTrue(keptReads.size() <= totalReads, "LIBS didn't keep the right number of reads during the traversal"); - } - - // check uniqueness - final Set readNames = new HashSet(); - for ( final GATKSAMRecord read : keptReads ) { - Assert.assertFalse(readNames.contains(read.getReadName()), "Found duplicate reads in the kept reads"); - readNames.add(read.getReadName()); - } - - // check that all reads we've seen are in our keptReads - for ( final GATKSAMRecord read : seenSoFar ) { - Assert.assertTrue(keptReads.contains(read), "A read that appeared in a pileup wasn't found in the kept reads: " + read); - } - - if ( ! downsample ) { - // check that every read in the list of keep reads occurred at least once in one of the pileups - for ( final GATKSAMRecord keptRead : keptReads ) { - Assert.assertTrue(seenSoFar.contains(keptRead), "There's a read " + keptRead + " in our keptReads list that never appeared in any pileup"); - } - } - } - } - - private void AssertWellOrderedPileup(final ReadBackedPileup pileup) { - if ( ! 
pileup.isEmpty() ) { - int leftMostPos = -1; - - for ( final PileupElement pe : pileup ) { - Assert.assertTrue(pileup.getLocation().getContig().equals(pe.getRead().getReferenceName()), "ReadBackedPileup contains an element " + pe + " that's on a different contig than the pileup itself"); - Assert.assertTrue(pe.getRead().getAlignmentStart() >= leftMostPos, - "ReadBackedPileup contains an element " + pe + " whose read's alignment start " + pe.getRead().getAlignmentStart() - + " occurs before the leftmost position we've seen previously " + leftMostPos); - } - } - } - - // --------------------------------------------------------------------------- - // make sure that downsampling isn't holding onto a bazillion reads - // - @DataProvider(name = "LIBS_NotHoldingTooManyReads") - public Object[][] makeLIBS_NotHoldingTooManyReads() { - final List tests = new LinkedList(); - - for ( final int downsampleTo : Arrays.asList(1, 10)) { - for ( final int nReadsPerLocus : Arrays.asList(100, 1000, 10000, 100000) ) { - for ( final int payloadInBytes : Arrays.asList(0, 1024, 1024*1024) ) { - tests.add(new Object[]{nReadsPerLocus, downsampleTo, payloadInBytes}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true && ! 
DEBUG, dataProvider = "LIBS_NotHoldingTooManyReads") -// @Test(enabled = true, dataProvider = "LIBS_NotHoldingTooManyReads", timeOut = 100000) - public void testLIBS_NotHoldingTooManyReads(final int nReadsPerLocus, final int downsampleTo, final int payloadInBytes) { - logger.warn(String.format("testLIBS_NotHoldingTooManyReads %d %d %d", nReadsPerLocus, downsampleTo, payloadInBytes)); - final int readLength = 10; - - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); - final int nSamples = 1; - final List samples = new ArrayList(nSamples); - for ( int i = 0; i < nSamples; i++ ) { - final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); - final String sample = "sample" + i; - samples.add(sample); - rg.setSample(sample); - rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); - header.addReadGroup(rg); - } - - final boolean downsample = downsampleTo != -1; - final DownsamplingMethod downsampler = downsample - ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null) - : new DownsamplingMethod(DownsampleType.NONE, null, null); - - // final List reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength); - - final WeakReadTrackingIterator iterator = new WeakReadTrackingIterator(nReadsPerLocus, readLength, payloadInBytes, header); - - li = new LocusIteratorByState(iterator, - createTestReadProperties(downsampler, false), - genomeLocParser, - samples); - - while ( li.hasNext() ) { - final AlignmentContext next = li.next(); - Assert.assertTrue(next.getBasePileup().getNumberOfElements() <= downsampleTo, "Too many elements in pileup " + next); - // TODO -- assert that there are <= X reads in memory after GC for some X - } - } - - private static class WeakReadTrackingIterator implements Iterator { - final int nReads, readLength, payloadInBytes; - int readI = 0; - final SAMFileHeader header; - - private WeakReadTrackingIterator(int nReads, int readLength, final int 
payloadInBytes, final SAMFileHeader header) { - this.nReads = nReads; - this.readLength = readLength; - this.header = header; - this.payloadInBytes = payloadInBytes; - } - - @Override public boolean hasNext() { return readI < nReads; } - @Override public void remove() { throw new UnsupportedOperationException("no remove"); } - - @Override - public GATKSAMRecord next() { - readI++; - return makeRead(); - } - - private GATKSAMRecord makeRead() { - final SAMReadGroupRecord rg = header.getReadGroups().get(0); - final String readName = String.format("%s.%d.%s", "read", readI, rg.getId()); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, 1, readLength); - read.setReadGroup(new GATKSAMReadGroupRecord(rg)); - if ( payloadInBytes > 0 ) - // add a payload byte array to push memory use per read even higher - read.setAttribute("PL", new byte[payloadInBytes]); - return read; - } - } - - // --------------------------------------------------------------------------- - // - // make sure that adapter clipping is working properly in LIBS - // - // --------------------------------------------------------------------------- - @DataProvider(name = "AdapterClippingTest") - public Object[][] makeAdapterClippingTest() { - final List tests = new LinkedList(); - - final int start = 10; - for ( final int goodBases : Arrays.asList(10, 20, 30) ) { - for ( final int nClips : Arrays.asList(0, 1, 2, 10)) { - for ( final boolean onLeft : Arrays.asList(true, false) ) { - final int readLength = nClips + goodBases; - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1" , 0, start, readLength); - read.setProperPairFlag(true); - read.setReadPairedFlag(true); - read.setReadUnmappedFlag(false); - read.setMateUnmappedFlag(false); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - read.setBaseQualities(Utils.dupBytes((byte) '@', readLength)); - read.setCigarString(readLength + "M"); - - if ( onLeft ) { - 
read.setReadNegativeStrandFlag(true); - read.setMateNegativeStrandFlag(false); - read.setMateAlignmentStart(start + nClips); - read.setInferredInsertSize(readLength); - tests.add(new Object[]{nClips, goodBases, 0, read}); - } else { - read.setReadNegativeStrandFlag(false); - read.setMateNegativeStrandFlag(true); - read.setMateAlignmentStart(start - 1); - read.setInferredInsertSize(goodBases - 1); - tests.add(new Object[]{0, goodBases, nClips, read}); - } - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = true, dataProvider = "AdapterClippingTest") - public void testAdapterClipping(final int nClipsOnLeft, final int nReadContainingPileups, final int nClipsOnRight, final GATKSAMRecord read) { - - li = new LocusIteratorByState(new FakeCloseableIterator<>(Collections.singletonList(read).iterator()), - createTestReadProperties(DownsamplingMethod.NONE, false), - genomeLocParser, - LocusIteratorByState.sampleListForSAMWithoutReadGroups()); - - int expectedPos = read.getAlignmentStart() + nClipsOnLeft; - int nPileups = 0; - while ( li.hasNext() ) { - final AlignmentContext next = li.next(); - Assert.assertEquals(next.getLocation().getStart(), expectedPos); - nPileups++; - expectedPos++; - } - - final int nExpectedPileups = nReadContainingPileups; - Assert.assertEquals(nPileups, nExpectedPileups, "Wrong number of pileups seen"); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtilsUnitTest.java deleted file mode 100644 index 48ad2129d..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtilsUnitTest.java +++ /dev/null @@ -1,108 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the 
"Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.sam; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.iterators.GATKSAMIterator; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertTrue; -import static org.testng.Assert.fail; -import org.testng.annotations.Test; -import htsjdk.samtools.SAMRecord; - -/** - * Created by IntelliJ IDEA. - * User: aaronmckenna - * Date: Jun 3, 2009 - * Time: 3:09:34 AM - * To change this template use File | Settings | File Templates. 
- */ -public class ArtificialSAMUtilsUnitTest extends BaseTest { - - - @Test - public void basicReadIteratorTest() { - GATKSAMIterator iter = ArtificialSAMUtils.mappedReadIterator(1, 100, 100); - int count = 0; - while (iter.hasNext()) { - SAMRecord rec = iter.next(); - count++; - } - assertEquals(count, 100 * 100); - } - - @Test - public void tenPerChromosome() { - GATKSAMIterator iter = ArtificialSAMUtils.mappedReadIterator(1, 100, 10); - int count = 0; - while (iter.hasNext()) { - SAMRecord rec = iter.next(); - - assertEquals(Integer.valueOf(Math.round(count / 10)), rec.getReferenceIndex()); - count++; - } - assertEquals(count, 100 * 10); - } - - @Test - public void onePerChromosome() { - GATKSAMIterator iter = ArtificialSAMUtils.mappedReadIterator(1, 100, 1); - int count = 0; - while (iter.hasNext()) { - SAMRecord rec = iter.next(); - - assertEquals(Integer.valueOf(count), rec.getReferenceIndex()); - count++; - } - assertEquals(count, 100 * 1); - } - - @Test - public void basicUnmappedIteratorTest() { - GATKSAMIterator iter = ArtificialSAMUtils.mappedAndUnmappedReadIterator(1, 100, 100, 1000); - int count = 0; - for (int x = 0; x < (100* 100); x++ ) { - if (!iter.hasNext()) { - fail ("we didn't get the expected number of reads"); - } - SAMRecord rec = iter.next(); - assertTrue(rec.getReferenceIndex() >= 0); - count++; - } - assertEquals(100 * 100, count); - - // now we should have 1000 unmapped reads - count = 0; - while (iter.hasNext()) { - SAMRecord rec = iter.next(); - assertTrue(rec.getReferenceIndex() < 0); - count++; - } - assertEquals(count, 1000); - } - - -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java deleted file mode 100644 index 271a75ad9..000000000 --- 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java +++ /dev/null @@ -1,186 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.sam; - -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMReadGroupRecord; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; -import org.testng.annotations.Test; -import org.testng.annotations.DataProvider; - -import org.broadinstitute.gatk.utils.BaseTest; - -public class ArtificialSingleSampleReadStreamUnitTest extends BaseTest { - - private static class ArtificialSingleSampleReadStreamTest extends TestDataProvider { - private ArtificialSingleSampleReadStream stream; - private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; - - public ArtificialSingleSampleReadStreamTest( ArtificialSingleSampleReadStream stream ) { - super(ArtificialSingleSampleReadStreamTest.class); - - this.stream = stream; - - setName(String.format("%s: numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", - getClass().getSimpleName(), - stream.getNumContigs(), - stream.getNumStacksPerContig(), - stream.getMinReadsPerStack(), - stream.getMaxReadsPerStack(), - stream.getMinDistanceBetweenStacks(), - stream.getMaxDistanceBetweenStacks(), - stream.getMinReadLength(), - stream.getMaxReadLength(), - stream.getNumUnmappedReads())); - } - - public void run() { - streamAnalyzer= new ArtificialSingleSampleReadStreamAnalyzer(stream); - - streamAnalyzer.analyze(stream); - - // Check whether the observed properties of the stream match its nominal properties - streamAnalyzer.validate(); - } - } - - @DataProvider(name = "ArtificialSingleSampleReadStreamTestDataProvider") - public Object[][] createArtificialSingleSampleReadStreamTests() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); - String readGroupID = "testReadGroup"; - SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); - readGroup.setSample("testSample"); - header.addReadGroup(readGroup); 
- - GenomeAnalysisEngine.resetRandomGenerator(); - - // brute force testing! - for ( int numContigs = 0; numContigs <= 2; numContigs++ ) { - for ( int stacksPerContig = 0; stacksPerContig <= 2; stacksPerContig++ ) { - for ( int minReadsPerStack = 1; minReadsPerStack <= 2; minReadsPerStack++ ) { - for ( int maxReadsPerStack = 1; maxReadsPerStack <= 3; maxReadsPerStack++ ) { - for ( int minDistanceBetweenStacks = 1; minDistanceBetweenStacks <= 2; minDistanceBetweenStacks++ ) { - for ( int maxDistanceBetweenStacks = 1; maxDistanceBetweenStacks <= 3; maxDistanceBetweenStacks++ ) { - for ( int minReadLength = 1; minReadLength <= 2; minReadLength++ ) { - for ( int maxReadLength = 1; maxReadLength <= 3; maxReadLength++ ) { - for ( int numUnmappedReads = 0; numUnmappedReads <= 2; numUnmappedReads++ ) { - // Only test sane combinations here - if ( minReadsPerStack <= maxReadsPerStack && - minDistanceBetweenStacks <= maxDistanceBetweenStacks && - minReadLength <= maxReadLength && - ((numContigs > 0 && stacksPerContig > 0) || (numContigs == 0 && stacksPerContig == 0)) ) { - - new ArtificialSingleSampleReadStreamTest(new ArtificialSingleSampleReadStream(header, - readGroupID, - numContigs, - stacksPerContig, - minReadsPerStack, - maxReadsPerStack, - minDistanceBetweenStacks, - maxDistanceBetweenStacks, - minReadLength, - maxReadLength, - numUnmappedReads)); - } - } - } - } - } - } - } - } - } - } - - return ArtificialSingleSampleReadStreamTest.getTests(ArtificialSingleSampleReadStreamTest.class); - } - - @Test(dataProvider = "ArtificialSingleSampleReadStreamTestDataProvider") - public void testArtificialSingleSampleReadStream( ArtificialSingleSampleReadStreamTest test ) { - logger.warn("Running test: " + test); - - GenomeAnalysisEngine.resetRandomGenerator(); - test.run(); - } - - @DataProvider(name = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider") - public Object[][] createInvalidArgumentsTests() { - SAMFileHeader header = 
ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); - String readGroupID = "testReadGroup"; - header.addReadGroup(new SAMReadGroupRecord(readGroupID)); - - return new Object[][] { - {"testNullHeader", null, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, 0}, - {"testNullReadGroup", header, null, 1, 1, 1, 2, 1, 2, 1, 2, 0}, - {"testInvalidReadGroup", header, "foo", 1, 1, 1, 2, 1, 2, 1, 2, 0}, - {"testInvalidNumContigs", header, readGroupID, -1, 1, 1, 2, 1, 2, 1, 2, 0}, - {"testInvalidNumStacksPerContig", header, readGroupID, 1, -1, 1, 2, 1, 2, 1, 2, 0}, - {"test0ContigsNon0StacksPerContig", header, readGroupID, 0, 1, 1, 2, 1, 2, 1, 2, 0}, - {"testNon0Contigs0StacksPerContig", header, readGroupID, 1, 0, 1, 2, 1, 2, 1, 2, 0}, - {"testInvalidMinReadsPerStack", header, readGroupID, 1, 1, -1, 2, 1, 2, 1, 2, 0}, - {"testInvalidMaxReadsPerStack", header, readGroupID, 1, 1, 1, -2, 1, 2, 1, 2, 0}, - {"testInvalidMinDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, -1, 2, 1, 2, 0}, - {"testInvalidMaxDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, 1, -2, 1, 2, 0}, - {"testInvalidMinReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, -1, 2, 0}, - {"testInvalidMaxReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, -2, 0}, - {"testInvalidReadsPerStackRange", header, readGroupID, 1, 1, 2, 1, 1, 2, 1, 2, 0}, - {"testInvalidDistanceBetweenStacksRange", header, readGroupID, 1, 1, 1, 2, 2, 1, 1, 2, 0}, - {"testInvalidReadLengthRange", header, readGroupID, 1, 1, 1, 2, 1, 2, 2, 1, 0}, - {"testInvalidNumUnmappedReads", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, -1}, - }; - } - - @Test(dataProvider = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider", - expectedExceptions = ReviewedGATKException.class) - public void testInvalidArguments( String testName, - SAMFileHeader header, - String readGroupID, - int numContigs, - int numStacksPerContig, - int minReadsPerStack, - int maxReadsPerStack, - int minDistanceBetweenStacks, - int maxDistanceBetweenStacks, - 
int minReadLength, - int maxReadLength, - int numUnmappedReads ) { - - logger.warn("Running test: " + testName); - - ArtificialSingleSampleReadStream stream = new ArtificialSingleSampleReadStream(header, - readGroupID, - numContigs, - numStacksPerContig, - minReadsPerStack, - maxReadsPerStack, - minDistanceBetweenStacks, - maxDistanceBetweenStacks, - minReadLength, - maxReadLength, - numUnmappedReads); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/MisencodedBaseQualityUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/MisencodedBaseQualityUnitTest.java deleted file mode 100644 index 207e01ab9..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/MisencodedBaseQualityUnitTest.java +++ /dev/null @@ -1,96 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.sam; - - -import htsjdk.samtools.SAMFileHeader; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.exceptions.UserException; -import org.testng.Assert; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * Basic unit test for misencoded quals - */ -public class MisencodedBaseQualityUnitTest extends BaseTest { - - private static final String readBases = "AAAAAAAAAA"; - private static final byte[] badQuals = { 59, 60, 62, 63, 64, 61, 62, 58, 57, 56 }; - private static final byte[] goodQuals = { 60, 60, 60, 60, 60, 60, 60, 60, 60, 60 }; - private static final byte[] fixedQuals = { 28, 29, 31, 32, 33, 30, 31, 27, 26, 25 }; - private SAMFileHeader header; - - @BeforeMethod - public void before() { - // reset the read counter so that we are deterministic - MisencodedBaseQualityReadTransformer.currentReadCounter = 0; - header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - } - - private GATKSAMRecord createRead(final boolean useGoodBases) { - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, readBases.getBytes(), - useGoodBases ? 
Arrays.copyOf(goodQuals, goodQuals.length) : - Arrays.copyOf(badQuals, badQuals.length)); - read.setCigarString("10M"); - return read; - } - - @Test(enabled = true) - public void testGoodQuals() { - final List reads = new ArrayList(10000); - for ( int i = 0; i < 10000; i++ ) - reads.add(createRead(true)); - - testEncoding(reads); - } - - @Test(enabled = true, expectedExceptions = {UserException.class}) - public void testBadQualsThrowsError() { - final List reads = new ArrayList(10000); - for ( int i = 0; i < 10000; i++ ) - reads.add(createRead(false)); - - testEncoding(reads); - } - - @Test(enabled = true) - public void testFixBadQuals() { - final GATKSAMRecord read = createRead(false); - final GATKSAMRecord fixedRead = MisencodedBaseQualityReadTransformer.fixMisencodedQuals(read); - for ( int i = 0; i < fixedQuals.length; i++ ) - Assert.assertEquals(fixedQuals[i], fixedRead.getBaseQualities()[i]); - } - - private void testEncoding(final List reads) { - for ( final GATKSAMRecord read : reads ) - MisencodedBaseQualityReadTransformer.checkForMisencodedQuals(read); - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java deleted file mode 100644 index c7ceea10a..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java +++ /dev/null @@ -1,340 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following 
-* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.sam; - -import htsjdk.samtools.reference.IndexedFastaSequenceFile; -import htsjdk.samtools.SAMFileHeader; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.BaseUtils; -import org.broadinstitute.gatk.utils.Utils; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.*; - - -public class ReadUtilsUnitTest extends BaseTest { - private interface GetAdaptorFunc { - public int getAdaptor(final GATKSAMRecord record); - } - - @DataProvider(name = "AdaptorGetter") - public Object[][] makeActiveRegionCutTests() { - final List tests = new LinkedList(); - - tests.add( new Object[]{ new GetAdaptorFunc() { - @Override public int getAdaptor(final GATKSAMRecord record) { return ReadUtils.getAdaptorBoundary(record); } - }}); - - tests.add( new Object[]{ new GetAdaptorFunc() { - @Override public int getAdaptor(final GATKSAMRecord record) { return record.getAdaptorBoundary(); } - }}); - - return tests.toArray(new Object[][]{}); - } - - private GATKSAMRecord makeRead(final int fragmentSize, final int 
mateStart) { - final byte[] bases = {'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'}; - final byte[] quals = {30, 30, 30, 30, 30, 30, 30, 30}; - final String cigar = "8M"; - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, cigar); - read.setProperPairFlag(true); - read.setReadPairedFlag(true); - read.setMateAlignmentStart(mateStart); - read.setInferredInsertSize(fragmentSize); - return read; - } - - @Test(dataProvider = "AdaptorGetter") - public void testGetAdaptorBoundary(final GetAdaptorFunc get) { - final int fragmentSize = 10; - final int mateStart = 1000; - final int BEFORE = mateStart - 2; - final int AFTER = mateStart + 2; - int myStart, boundary; - GATKSAMRecord read; - - // Test case 1: positive strand, first read - read = makeRead(fragmentSize, mateStart); - myStart = BEFORE; - read.setAlignmentStart(myStart); - read.setReadNegativeStrandFlag(false); - read.setMateNegativeStrandFlag(true); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, myStart + fragmentSize + 1); - - // Test case 2: positive strand, second read - read = makeRead(fragmentSize, mateStart); - myStart = AFTER; - read.setAlignmentStart(myStart); - read.setReadNegativeStrandFlag(false); - read.setMateNegativeStrandFlag(true); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, myStart + fragmentSize + 1); - - // Test case 3: negative strand, second read - read = makeRead(fragmentSize, mateStart); - myStart = AFTER; - read.setAlignmentStart(myStart); - read.setReadNegativeStrandFlag(true); - read.setMateNegativeStrandFlag(false); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, mateStart - 1); - - // Test case 4: negative strand, first read - read = makeRead(fragmentSize, mateStart); - myStart = BEFORE; - read.setAlignmentStart(myStart); - read.setReadNegativeStrandFlag(true); - read.setMateNegativeStrandFlag(false); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, mateStart - 1); - - // Test case 5: mate 
is mapped to another chromosome (test both strands) - read = makeRead(fragmentSize, mateStart); - read.setInferredInsertSize(0); - read.setReadNegativeStrandFlag(true); - read.setMateNegativeStrandFlag(false); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - read.setReadNegativeStrandFlag(false); - read.setMateNegativeStrandFlag(true); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - read.setInferredInsertSize(10); - - // Test case 6: read is unmapped - read = makeRead(fragmentSize, mateStart); - read.setReadUnmappedFlag(true); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - read.setReadUnmappedFlag(false); - - // Test case 7: reads don't overlap and look like this: - // <--------| - // |------> - // first read: - read = makeRead(fragmentSize, mateStart); - myStart = 980; - read.setAlignmentStart(myStart); - read.setInferredInsertSize(20); - read.setReadNegativeStrandFlag(true); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - - // second read: - read = makeRead(fragmentSize, mateStart); - myStart = 1000; - read.setAlignmentStart(myStart); - read.setInferredInsertSize(20); - read.setMateAlignmentStart(980); - read.setReadNegativeStrandFlag(false); - boundary = get.getAdaptor(read); - Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - - // Test case 8: read doesn't have proper pair flag set - read = makeRead(fragmentSize, mateStart); - read.setReadPairedFlag(true); - read.setProperPairFlag(false); - Assert.assertEquals(get.getAdaptor(read), ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); - - // Test case 9: read and mate have same negative flag setting - for ( final boolean negFlag: Arrays.asList(true, false) ) { - read = makeRead(fragmentSize, mateStart); - read.setAlignmentStart(BEFORE); - 
read.setReadPairedFlag(true); - read.setProperPairFlag(true); - read.setReadNegativeStrandFlag(negFlag); - read.setMateNegativeStrandFlag(!negFlag); - Assert.assertTrue(get.getAdaptor(read) != ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY, "Get adaptor should have succeeded"); - - read = makeRead(fragmentSize, mateStart); - read.setAlignmentStart(BEFORE); - read.setReadPairedFlag(true); - read.setProperPairFlag(true); - read.setReadNegativeStrandFlag(negFlag); - read.setMateNegativeStrandFlag(negFlag); - Assert.assertEquals(get.getAdaptor(read), ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY, "Get adaptor should have failed for reads with bad alignment orientation"); - } - } - - @Test (enabled = true) - public void testGetBasesReverseComplement() { - int iterations = 1000; - Random random = GenomeAnalysisEngine.getRandomGenerator(); - while(iterations-- > 0) { - final int l = random.nextInt(1000); - GATKSAMRecord read = GATKSAMRecord.createRandomRead(l); - byte [] original = read.getReadBases(); - byte [] reconverted = new byte[l]; - String revComp = ReadUtils.getBasesReverseComplement(read); - for (int i=0; i reads = new ArrayList(); - for( int readLength = minLength; readLength <= maxLength; readLength++ ) { - reads.add( ReadUtils.createRandomRead( readLength ) ); - } - Assert.assertEquals(ReadUtils.getMaxReadLength(reads), maxLength, "max length does not match"); - } - } - - final List reads = new LinkedList(); - Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero"); - } - - @Test (enabled = true) - public void testReadWithNsRefIndexInDeletion() throws FileNotFoundException { - - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); - final int readLength = 76; - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength); - 
read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setCigarString("3M414N1D73M"); - - final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL); - Assert.assertEquals(result, 2); - } - - @Test (enabled = true) - public void testReadWithNsRefAfterDeletion() throws FileNotFoundException { - - final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); - final int readLength = 76; - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setCigarString("3M414N1D73M"); - - final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9393, ReadUtils.ClippingTail.LEFT_TAIL); - Assert.assertEquals(result, 3); - } - - @DataProvider(name = "HasWellDefinedFragmentSizeData") - public Object[][] makeHasWellDefinedFragmentSizeData() throws Exception { - final List tests = new LinkedList(); - - // setup a basic read that will work - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10); - read.setReadPairedFlag(true); - read.setProperPairFlag(true); - read.setReadUnmappedFlag(false); - read.setMateUnmappedFlag(false); - read.setAlignmentStart(100); - read.setCigarString("50M"); - read.setMateAlignmentStart(130); - read.setInferredInsertSize(80); - read.setFirstOfPairFlag(true); - read.setReadNegativeStrandFlag(false); - read.setMateNegativeStrandFlag(true); - - tests.add( new Object[]{ "basic case", read.clone(), true }); - - { - final GATKSAMRecord bad1 = 
(GATKSAMRecord)read.clone(); - bad1.setReadPairedFlag(false); - tests.add( new Object[]{ "not paired", bad1, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setProperPairFlag(false); - // we currently don't require the proper pair flag to be set - tests.add( new Object[]{ "not proper pair", bad, true }); -// tests.add( new Object[]{ "not proper pair", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setReadUnmappedFlag(true); - tests.add( new Object[]{ "read is unmapped", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setMateUnmappedFlag(true); - tests.add( new Object[]{ "mate is unmapped", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setMateNegativeStrandFlag(false); - tests.add( new Object[]{ "read and mate both on positive strand", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setReadNegativeStrandFlag(true); - tests.add( new Object[]{ "read and mate both on negative strand", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setInferredInsertSize(0); - tests.add( new Object[]{ "insert size is 0", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setAlignmentStart(1000); - tests.add( new Object[]{ "positve read starts after mate end", bad, false }); - } - - { - final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); - bad.setReadNegativeStrandFlag(true); - bad.setMateNegativeStrandFlag(false); - bad.setMateAlignmentStart(1000); - tests.add( new Object[]{ "negative strand read ends before mate starts", bad, false }); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "HasWellDefinedFragmentSizeData") - private void testHasWellDefinedFragmentSize(final String name, final GATKSAMRecord read, final boolean expected) { - 
Assert.assertEquals(ReadUtils.hasWellDefinedFragmentSize(read), expected); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/text/ListFileUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/text/ListFileUtilsUnitTest.java deleted file mode 100644 index 086cefed7..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/text/ListFileUtilsUnitTest.java +++ /dev/null @@ -1,159 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.text; - -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.utils.commandline.ParsingEngine; -import org.broadinstitute.gatk.utils.commandline.Tags; -import org.broadinstitute.gatk.engine.CommandLineGATK; -import org.broadinstitute.gatk.engine.datasources.reads.SAMReaderID; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.PrintWriter; -import java.util.*; - -/** - * Tests selected functionality in the CommandLineExecutable class - */ -public class ListFileUtilsUnitTest extends BaseTest { - - @Test - public void testIgnoreBlankLinesInBAMListFiles() throws Exception { - File tempListFile = createTempListFile("testIgnoreBlankLines", - "", - publicTestDir + "exampleBAM.bam", - " " - ); - - List expectedBAMFileListAfterUnpacking = new ArrayList(); - expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); - - performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking); - } - - @Test - public void testCommentSupportInBAMListFiles() throws Exception { - File tempListFile = createTempListFile("testCommentSupport", - "#", - publicTestDir + "exampleBAM.bam", - "#" + publicTestDir + "foo.bam", - " # " + publicTestDir + "bar.bam" - ); - - List expectedBAMFileListAfterUnpacking = new ArrayList(); - expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); - - performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking); - } - - @Test - public void testUnpackSet() throws Exception { - Set expected = new HashSet(Arrays.asList(publicTestDir + "exampleBAM.bam")); - Set actual; - - actual = ListFileUtils.unpackSet(Arrays.asList(publicTestDir + "exampleBAM.bam")); - Assert.assertEquals(actual, expected); - - File tempListFile = createTempListFile("testUnpackSet", - 
"#", - publicTestDir + "exampleBAM.bam", - "#" + publicTestDir + "foo.bam", - " # " + publicTestDir + "bar.bam" - ); - actual = ListFileUtils.unpackSet(Arrays.asList(tempListFile.getAbsolutePath())); - Assert.assertEquals(actual, expected); - } - - @DataProvider(name="includeMatchingTests") - public Object[][] getIncludeMatchingTests() { - return new Object[][] { - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), true, asSet("a") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), false, asSet("a", "ab", "abc") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, Collections.EMPTY_SET }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("ab", "abc") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("a") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, asSet("a", "ab", "abc") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("a", "ab") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, asSet("a", "ab", "abc") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, Collections.EMPTY_SET }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("ab", "abc") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, Collections.EMPTY_SET }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, asSet("a", "ab", "abc") } - }; - } - - @Test(dataProvider = "includeMatchingTests") - public void testIncludeMatching(Set values, Collection filters, boolean exactMatch, Set expected) { - Set actual = ListFileUtils.includeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch); - Assert.assertEquals(actual, expected); - } - - @DataProvider(name="excludeMatchingTests") - public Object[][] getExcludeMatchingTests() { - return new Object[][] { - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), 
true, asSet("ab", "abc") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), false, Collections.EMPTY_SET }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, asSet("a", "ab", "abc") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("a") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("ab", "abc") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, Collections.EMPTY_SET }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("abc") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, Collections.EMPTY_SET }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, asSet("a", "ab", "abc") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("a") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, asSet("a", "ab", "abc") }, - new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, Collections.EMPTY_SET } - }; - } - - @Test(dataProvider = "excludeMatchingTests") - public void testExcludeMatching(Set values, Collection filters, boolean exactMatch, Set expected) { - Set actual = ListFileUtils.excludeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch); - Assert.assertEquals(actual, expected); - } - - private static Set asSet(T... 
args){ - return new HashSet(Arrays.asList(args)); - } - - private void performBAMListFileUnpackingTest( File tempListFile, List expectedUnpackedFileList ) throws Exception { - List bamFiles = new ArrayList(); - bamFiles.add(tempListFile.getAbsolutePath()); - - CommandLineGATK testInstance = new CommandLineGATK(); - testInstance.setParser(new ParsingEngine(testInstance)); - - List unpackedBAMFileList = ListFileUtils.unpackBAMFileList(bamFiles,new ParsingEngine(testInstance)); - - Assert.assertEquals(unpackedBAMFileList.size(), expectedUnpackedFileList.size(), - "Unpacked BAM file list contains extraneous lines"); - Assert.assertEquals(unpackedBAMFileList, expectedUnpackedFileList, - "Unpacked BAM file list does not contain correct BAM file names"); - } -} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVCFUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVCFUtilsUnitTest.java deleted file mode 100644 index ab547b7c1..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVCFUtilsUnitTest.java +++ /dev/null @@ -1,138 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.variant; - -import htsjdk.tribble.index.DynamicIndexCreator; -import htsjdk.tribble.index.IndexCreator; -import htsjdk.tribble.index.interval.IntervalIndexCreator; -import htsjdk.tribble.index.linear.LinearIndexCreator; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.engine.contexts.AlignmentContext; -import org.broadinstitute.gatk.engine.contexts.ReferenceContext; -import org.broadinstitute.gatk.engine.refdata.RefMetaDataTracker; -import org.broadinstitute.gatk.engine.walkers.RodWalker; -import org.broadinstitute.gatk.engine.walkers.Walker; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.lang.reflect.Method; -import java.util.Arrays; -import java.util.Collections; -import java.util.Set; - -public class GATKVCFUtilsUnitTest extends BaseTest { - public static class VCFHeaderTestWalker extends RodWalker { - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return null; } - public Integer reduceInit() { return 0; } - public Integer reduce(Integer value, Integer sum) { return value + sum; } - } - - public static class VCFHeaderTest2Walker extends VCFHeaderTestWalker {} - - @Test - public void testAddingVCFHeaderInfo() { - final 
VCFHeader header = new VCFHeader(); - - final Walker walker1 = new VCFHeaderTestWalker(); - final Walker walker2 = new VCFHeaderTest2Walker(); - - final GenomeAnalysisEngine testEngine1 = new GenomeAnalysisEngine(); - testEngine1.setWalker(walker1); - - final GenomeAnalysisEngine testEngine2 = new GenomeAnalysisEngine(); - testEngine2.setWalker(walker2); - - final VCFHeaderLine line1 = GATKVCFUtils.getCommandLineArgumentHeaderLine(testEngine1, Collections.EMPTY_LIST); - logger.warn(line1); - Assert.assertNotNull(line1); - Assert.assertEquals(line1.getKey(), GATKVCFUtils.GATK_COMMAND_LINE_KEY); - for ( final String field : Arrays.asList("Version", "ID", "Date", "CommandLineOptions")) - Assert.assertTrue(line1.toString().contains(field), "Couldn't find field " + field + " in " + line1.getValue()); - Assert.assertTrue(line1.toString().contains("ID=" + testEngine1.getWalkerName())); - - final VCFHeaderLine line2 = GATKVCFUtils.getCommandLineArgumentHeaderLine(testEngine2, Collections.EMPTY_LIST); - logger.warn(line2); - - header.addMetaDataLine(line1); - final Set lines1 = header.getMetaDataInInputOrder(); - Assert.assertTrue(lines1.contains(line1)); - - header.addMetaDataLine(line2); - final Set lines2 = header.getMetaDataInInputOrder(); - Assert.assertTrue(lines2.contains(line1)); - Assert.assertTrue(lines2.contains(line2)); - } - - private class IndexCreatorTest extends TestDataProvider { - private final GATKVCFIndexType type; - private final int parameter; - private final Class expectedClass; - private final Integer expectedDimension; - private final Method dimensionGetter; - - private IndexCreatorTest(GATKVCFIndexType type, int parameter, Class expectedClass, Integer expectedDimension, - String dimensionGetterName) { - super(IndexCreatorTest.class); - - this.type = type; - this.parameter = parameter; - this.expectedClass = expectedClass; - this.expectedDimension = expectedDimension; - try { - // Conditional matches testGetIndexCreator's if-statement - 
this.dimensionGetter = this.expectedDimension == null ? null : expectedClass.getDeclaredMethod(dimensionGetterName); - } catch (NoSuchMethodException e) { - throw new RuntimeException(e); - } - } - } - - @DataProvider(name = "indexCreator") - public Object[][] indexCreatorData() { - new IndexCreatorTest(GATKVCFIndexType.DYNAMIC_SEEK, 0, DynamicIndexCreator.class, null, null); - new IndexCreatorTest(GATKVCFIndexType.DYNAMIC_SIZE, 0, DynamicIndexCreator.class, null, null); - new IndexCreatorTest(GATKVCFIndexType.LINEAR, 100, LinearIndexCreator.class, 100, "getBinSize"); - new IndexCreatorTest(GATKVCFIndexType.INTERVAL, 200, IntervalIndexCreator.class, 200, "getFeaturesPerInterval"); - - return IndexCreatorTest.getTests(IndexCreatorTest.class); - } - - @Test(dataProvider = "indexCreator") - public void testGetIndexCreator(IndexCreatorTest spec) throws Exception{ - File dummy = new File(""); - IndexCreator ic = GATKVCFUtils.getIndexCreator(spec.type, spec.parameter, dummy); - Assert.assertEquals(ic.getClass(), spec.expectedClass, "Wrong IndexCreator type"); - if (spec.expectedDimension != null) { - Integer dimension = (int)spec.dimensionGetter.invoke(ic); - Assert.assertEquals(dimension, spec.expectedDimension, "Wrong dimension"); - } - } -} \ No newline at end of file diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java deleted file mode 100644 index feb10a7d4..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java +++ /dev/null @@ -1,1612 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including 
without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package org.broadinstitute.gatk.utils.variant; - -import htsjdk.variant.variantcontext.*; -import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; -import org.broadinstitute.gatk.utils.*; -import org.broadinstitute.gatk.utils.collections.Pair; -import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; -import java.util.*; - -public class GATKVariantContextUtilsUnitTest extends BaseTest { - private final static boolean DEBUG = false; - - Allele Aref, T, C, G, Cref, ATC, ATCATC; - Allele ATCATCT; - Allele ATref; - Allele Anoref; - Allele GT; - - private GenomeLocParser genomeLocParser; - - @BeforeSuite - public void setup() throws IOException { - // alleles - Aref = Allele.create("A", true); - Cref = Allele.create("C", true); - T = Allele.create("T"); - C = Allele.create("C"); - G = Allele.create("G"); - ATC = Allele.create("ATC"); - ATCATC = Allele.create("ATCATC"); - ATCATCT = 
Allele.create("ATCATCT"); - ATref = Allele.create("AT",true); - Anoref = Allele.create("A",false); - GT = Allele.create("GT",false); - genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference))); - } - - private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, int... pls) { - return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).PL(pls).make(); - } - - - private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError) { - return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).make(); - } - - private VariantContext makeVC(String source, List alleles) { - return makeVC(source, alleles, null, null); - } - - private VariantContext makeVC(String source, List alleles, Genotype... g1) { - return makeVC(source, alleles, Arrays.asList(g1)); - } - - private VariantContext makeVC(String source, List alleles, String filter) { - return makeVC(source, alleles, filter.equals(".") ? null : new HashSet(Arrays.asList(filter))); - } - - private VariantContext makeVC(String source, List alleles, Set filters) { - return makeVC(source, alleles, null, filters); - } - - private VariantContext makeVC(String source, List alleles, Collection genotypes) { - return makeVC(source, alleles, genotypes, null); - } - - private VariantContext makeVC(String source, List alleles, Collection genotypes, Set filters) { - int start = 10; - int stop = start + alleles.get(0).length() - 1; // alleles.contains(ATC) ? 
start + 3 : start; - return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(genotypes).filters(filters).make(); - } - - // -------------------------------------------------------------------------------- - // - // Test allele merging - // - // -------------------------------------------------------------------------------- - - private class MergeAllelesTest extends TestDataProvider { - List> inputs; - List expected; - - private MergeAllelesTest(List... arg) { - super(MergeAllelesTest.class); - LinkedList> all = new LinkedList<>(Arrays.asList(arg)); - expected = all.pollLast(); - inputs = all; - } - - public String toString() { - return String.format("MergeAllelesTest input=%s expected=%s", inputs, expected); - } - } - @DataProvider(name = "mergeAlleles") - public Object[][] mergeAllelesData() { - // first, do no harm - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref)); - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref), - Arrays.asList(Aref)); - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref, T), - Arrays.asList(Aref, T)); - - new MergeAllelesTest(Arrays.asList(Aref, C), - Arrays.asList(Aref, T), - Arrays.asList(Aref, C, T)); - - new MergeAllelesTest(Arrays.asList(Aref, T), - Arrays.asList(Aref, C), - Arrays.asList(Aref, T, C)); // in order of appearence - - new MergeAllelesTest(Arrays.asList(Aref, C, T), - Arrays.asList(Aref, C), - Arrays.asList(Aref, C, T)); - - new MergeAllelesTest(Arrays.asList(Aref, C, T), Arrays.asList(Aref, C, T)); - - new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, T, C)); - - new MergeAllelesTest(Arrays.asList(Aref, T, C), - Arrays.asList(Aref, C), - Arrays.asList(Aref, T, C)); // in order of appearence - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref, ATC), - Arrays.asList(Aref, ATC)); - - new MergeAllelesTest(Arrays.asList(Aref), - Arrays.asList(Aref, ATC, ATCATC), - Arrays.asList(Aref, ATC, ATCATC)); - - // alleles in the order 
we see them - new MergeAllelesTest(Arrays.asList(Aref, ATCATC), - Arrays.asList(Aref, ATC, ATCATC), - Arrays.asList(Aref, ATCATC, ATC)); - - // same - new MergeAllelesTest(Arrays.asList(Aref, ATC), - Arrays.asList(Aref, ATCATC), - Arrays.asList(Aref, ATC, ATCATC)); - - new MergeAllelesTest(Arrays.asList(ATref, ATC, Anoref, G), - Arrays.asList(Aref, ATCATC, G), - Arrays.asList(ATref, ATC, Anoref, G, ATCATCT, GT)); - - return MergeAllelesTest.getTests(MergeAllelesTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "mergeAlleles") - public void testMergeAlleles(MergeAllelesTest cfg) { - final List inputs = new ArrayList(); - - int i = 0; - for ( final List alleles : cfg.inputs ) { - final String name = "vcf" + ++i; - inputs.add(makeVC(name, alleles)); - } - - final List priority = vcs2priority(inputs); - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - inputs, priority, - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); - - Assert.assertEquals(merged.getAlleles().size(),cfg.expected.size()); - Assert.assertEquals(merged.getAlleles(), cfg.expected); - } - - // -------------------------------------------------------------------------------- - // - // Test rsID merging - // - // -------------------------------------------------------------------------------- - - private class SimpleMergeRSIDTest extends TestDataProvider { - List inputs; - String expected; - - private SimpleMergeRSIDTest(String... 
arg) { - super(SimpleMergeRSIDTest.class); - LinkedList allStrings = new LinkedList(Arrays.asList(arg)); - expected = allStrings.pollLast(); - inputs = allStrings; - } - - public String toString() { - return String.format("SimpleMergeRSIDTest vc=%s expected=%s", inputs, expected); - } - } - - @DataProvider(name = "simplemergersiddata") - public Object[][] createSimpleMergeRSIDData() { - new SimpleMergeRSIDTest(".", "."); - new SimpleMergeRSIDTest(".", ".", "."); - new SimpleMergeRSIDTest("rs1", "rs1"); - new SimpleMergeRSIDTest("rs1", "rs1", "rs1"); - new SimpleMergeRSIDTest(".", "rs1", "rs1"); - new SimpleMergeRSIDTest("rs1", ".", "rs1"); - new SimpleMergeRSIDTest("rs1", "rs2", "rs1,rs2"); - new SimpleMergeRSIDTest("rs1", "rs2", "rs1", "rs1,rs2"); // duplicates - new SimpleMergeRSIDTest("rs2", "rs1", "rs2,rs1"); - new SimpleMergeRSIDTest("rs2", "rs1", ".", "rs2,rs1"); - new SimpleMergeRSIDTest("rs2", ".", "rs1", "rs2,rs1"); - new SimpleMergeRSIDTest("rs1", ".", ".", "rs1"); - new SimpleMergeRSIDTest("rs1", "rs2", "rs3", "rs1,rs2,rs3"); - - return SimpleMergeRSIDTest.getTests(SimpleMergeRSIDTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "simplemergersiddata") - public void testRSIDMerge(SimpleMergeRSIDTest cfg) { - VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); - final List inputs = new ArrayList(); - - for ( final String id : cfg.inputs ) { - inputs.add(new VariantContextBuilder(snpVC1).id(id).make()); - } - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - inputs, null, - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); - Assert.assertEquals(merged.getID(), cfg.expected); - } - - // -------------------------------------------------------------------------------- - // - // Test filtered merging - // - // -------------------------------------------------------------------------------- - - private class 
MergeFilteredTest extends TestDataProvider { - List inputs; - VariantContext expected; - String setExpected; - GATKVariantContextUtils.FilteredRecordMergeType type; - - - private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, String setExpected) { - this(name, input1, input2, expected, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, setExpected); - } - - private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, GATKVariantContextUtils.FilteredRecordMergeType type, String setExpected) { - super(MergeFilteredTest.class, name); - LinkedList all = new LinkedList(Arrays.asList(input1, input2)); - this.expected = expected; - this.type = type; - inputs = all; - this.setExpected = setExpected; - } - - public String toString() { - return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); - } - } - - @DataProvider(name = "mergeFiltered") - public Object[][] mergeFilteredData() { - new MergeFilteredTest("AllPass", - makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - GATKVariantContextUtils.MERGE_INTERSECTION); - - new MergeFilteredTest("noFilters", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), "."), - makeVC("3", Arrays.asList(Aref, T), "."), - GATKVariantContextUtils.MERGE_INTERSECTION); - - new MergeFilteredTest("oneFiltered", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), "."), - String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - new MergeFilteredTest("onePassOneFail", - makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), 
VariantContext.PASSES_FILTERS), - String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - new MergeFilteredTest("AllFiltered", - makeVC("1", Arrays.asList(Aref, T), "FAIL"), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), "FAIL"), - GATKVariantContextUtils.MERGE_FILTER_IN_ALL); - - // test ALL vs. ANY - new MergeFilteredTest("FailOneUnfiltered", - makeVC("1", Arrays.asList(Aref, T), "FAIL"), - makeVC("2", Arrays.asList(Aref, T), "."), - makeVC("3", Arrays.asList(Aref, T), "."), - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - new MergeFilteredTest("OneFailAllUnfilteredArg", - makeVC("1", Arrays.asList(Aref, T), "FAIL"), - makeVC("2", Arrays.asList(Aref, T), "."), - makeVC("3", Arrays.asList(Aref, T), "FAIL"), - GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ALL_UNFILTERED, - String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - // test excluding allele in filtered record - new MergeFilteredTest("DontIncludeAlleleOfFilteredRecords", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), "FAIL"), - makeVC("3", Arrays.asList(Aref, T), "."), - String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); - - // promotion of site from unfiltered to PASSES - new MergeFilteredTest("UnfilteredPlusPassIsPass", - makeVC("1", Arrays.asList(Aref, T), "."), - makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - GATKVariantContextUtils.MERGE_INTERSECTION); - - new MergeFilteredTest("RefInAll", - makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - GATKVariantContextUtils.MERGE_REF_IN_ALL); - - new MergeFilteredTest("RefInOne", - 
makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), - makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), - "2"); - - return MergeFilteredTest.getTests(MergeFilteredTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "mergeFiltered") - public void testMergeFiltered(MergeFilteredTest cfg) { - final List priority = vcs2priority(cfg.inputs); - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); - - // test alleles are equal - Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); - - // test set field - Assert.assertEquals(merged.getAttribute("set"), cfg.setExpected); - - // test filter field - Assert.assertEquals(merged.getFilters(), cfg.expected.getFilters()); - } - - // -------------------------------------------------------------------------------- - // - // Test genotype merging - // - // -------------------------------------------------------------------------------- - - private class MergeGenotypesTest extends TestDataProvider { - List inputs; - VariantContext expected; - List priority; - - private MergeGenotypesTest(String name, String priority, VariantContext... 
arg) { - super(MergeGenotypesTest.class, name); - LinkedList all = new LinkedList(Arrays.asList(arg)); - this.expected = all.pollLast(); - inputs = all; - this.priority = Arrays.asList(priority.split(",")); - } - - public String toString() { - return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); - } - } - - @DataProvider(name = "mergeGenotypes") - public Object[][] mergeGenotypesData() { - new MergeGenotypesTest("TakeGenotypeByPriority-1,2", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1))); - - new MergeGenotypesTest("TakeGenotypeByPriority-1,2-nocall", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1))); - - new MergeGenotypesTest("TakeGenotypeByPriority-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2))); - - new MergeGenotypesTest("NonOverlappingGenotypes", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s2", Aref, T, -2))); - - new MergeGenotypesTest("PreserveNoCall", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1), makeG("s2", Aref, T, -2))); - - new MergeGenotypesTest("PerserveAlleles", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", 
Arrays.asList(Aref, C), makeG("s2", Aref, C, -2)), - makeVC("3", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1), makeG("s2", Aref, C, -2))); - - new MergeGenotypesTest("TakeGenotypePartialOverlap-1,2", "1,2", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s3", Aref, T, -3))); - - new MergeGenotypesTest("TakeGenotypePartialOverlap-2,1", "2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3))); - - // - // merging genothpes with PLs - // - - // first, do no harm - new MergeGenotypesTest("OrderedPLs", "1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3)), - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3))); - - // first, do no harm - new MergeGenotypesTest("OrderedPLs-3Alleles", "1", - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); - - // first, do no harm - new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); - - // first, do no harm - new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6)), - makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6))); - - new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-2,1", 
"2,1", - makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1,5,0,3)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), - makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2))); - - new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-1,2", "1,2", - makeVC("1", Arrays.asList(Aref,ATC), makeG("s1", Aref, ATC, -1,5,0,3)), - makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), - // no likelihoods on result since type changes to mixed multiallelic - makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, -1), makeG("s3", Aref, T, -3))); - - new MergeGenotypesTest("MultipleSamplePLsDifferentOrder", "1,2", - makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1, 1, 2, 3, 4, 5, 6)), - makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, -2, 6, 5, 4, 3, 2, 1)), - // no likelihoods on result since type changes to mixed multiallelic - makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1), makeG("s2", Aref, T, -2))); - - return MergeGenotypesTest.getTests(MergeGenotypesTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "mergeGenotypes") - public void testMergeGenotypes(MergeGenotypesTest cfg) { - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); - - // test alleles are equal - Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); - - // test genotypes - assertGenotypesAreMostlyEqual(merged.getGenotypes(), cfg.expected.getGenotypes()); - } - - // necessary to not overload equals for genotypes - private void assertGenotypesAreMostlyEqual(GenotypesContext actual, GenotypesContext expected) { - if (actual == expected) { - return; - } - - if (actual == null || 
expected == null) { - Assert.fail("Maps not equal: expected: " + expected + " and actual: " + actual); - } - - if (actual.size() != expected.size()) { - Assert.fail("Maps do not have the same size:" + actual.size() + " != " + expected.size()); - } - - for (Genotype value : actual) { - Genotype expectedValue = expected.get(value.getSampleName()); - - Assert.assertEquals(value.getAlleles(), expectedValue.getAlleles(), "Alleles in Genotype aren't equal"); - Assert.assertEquals(value.getGQ(), expectedValue.getGQ(), "GQ values aren't equal"); - Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods(), "Either both have likelihoods or both not"); - if ( value.hasLikelihoods() ) - Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector(), "Genotype likelihoods aren't equal"); - } - } - - @Test(enabled = !DEBUG) - public void testMergeGenotypesUniquify() { - final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); - final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); - - // test genotypes - Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2"))); - } - -// TODO: remove after testing -// @Test(expectedExceptions = IllegalStateException.class) -// public void testMergeGenotypesRequireUnique() { -// final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); -// final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); -// -// final VariantContext merged = VariantContextUtils.simpleMerge( -// Arrays.asList(vc1, vc2), null, 
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, -// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false, false); -// } - - // -------------------------------------------------------------------------------- - // - // Misc. tests - // - // -------------------------------------------------------------------------------- - - @Test(enabled = !DEBUG) - public void testAnnotationSet() { - for ( final boolean annotate : Arrays.asList(true, false)) { - for ( final String set : Arrays.asList("set", "combine", "x")) { - final List priority = Arrays.asList("1", "2"); - VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); - VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); - - final VariantContext merged = GATKVariantContextUtils.simpleMerge( - Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); - - if ( annotate ) - Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); - else - Assert.assertFalse(merged.hasAttribute(set)); - } - } - } - - private static final List vcs2priority(final Collection vcs) { - final List priority = new ArrayList<>(); - - for ( final VariantContext vc : vcs ) { - priority.add(vc.getSource()); - } - - return priority; - } - - // -------------------------------------------------------------------------------- - // - // basic allele clipping test - // - // -------------------------------------------------------------------------------- - - private class ReverseClippingPositionTestProvider extends TestDataProvider { - final String ref; - final List alleles = new ArrayList(); - final int expectedClip; - - private ReverseClippingPositionTestProvider(final int expectedClip, final String ref, final String... 
alleles) { - super(ReverseClippingPositionTestProvider.class); - this.ref = ref; - for ( final String allele : alleles ) - this.alleles.add(Allele.create(allele)); - this.expectedClip = expectedClip; - } - - @Override - public String toString() { - return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip); - } - } - - @DataProvider(name = "ReverseClippingPositionTestProvider") - public Object[][] makeReverseClippingPositionTestProvider() { - // pair clipping - new ReverseClippingPositionTestProvider(0, "ATT", "CCG"); - new ReverseClippingPositionTestProvider(1, "ATT", "CCT"); - new ReverseClippingPositionTestProvider(2, "ATT", "CTT"); - new ReverseClippingPositionTestProvider(2, "ATT", "ATT"); // cannot completely clip allele - - // triplets - new ReverseClippingPositionTestProvider(0, "ATT", "CTT", "CGG"); - new ReverseClippingPositionTestProvider(1, "ATT", "CTT", "CGT"); // the T can go - new ReverseClippingPositionTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go - - return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); - } - - @Test(enabled = !DEBUG, dataProvider = "ReverseClippingPositionTestProvider") - public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { - int result = GATKVariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes()); - Assert.assertEquals(result, cfg.expectedClip); - } - - - // -------------------------------------------------------------------------------- - // - // test splitting into bi-allelics - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "SplitBiallelics") - public Object[][] makeSplitBiallelics() throws CloneNotSupportedException { - List tests = new ArrayList(); - - final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C)); - - // biallelic -> biallelic - tests.add(new Object[]{root.make(), 
Arrays.asList(root.make())}); - - // monos -> monos - root.alleles(Arrays.asList(Aref)); - tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); - - root.alleles(Arrays.asList(Aref, C, T)); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Aref, C)).make(), - root.alleles(Arrays.asList(Aref, T)).make())}); - - root.alleles(Arrays.asList(Aref, C, T, G)); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Aref, C)).make(), - root.alleles(Arrays.asList(Aref, T)).make(), - root.alleles(Arrays.asList(Aref, G)).make())}); - - final Allele C = Allele.create("C"); - final Allele CA = Allele.create("CA"); - final Allele CAA = Allele.create("CAA"); - final Allele CAAAA = Allele.create("CAAAA"); - final Allele CAAAAA = Allele.create("CAAAAA"); - final Allele Cref = Allele.create("C", true); - final Allele CAref = Allele.create("CA", true); - final Allele CAAref = Allele.create("CAA", true); - final Allele CAAAref = Allele.create("CAAA", true); - - root.alleles(Arrays.asList(Cref, CA, CAA)); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Cref, CA)).make(), - root.alleles(Arrays.asList(Cref, CAA)).make())}); - - root.alleles(Arrays.asList(CAAref, C, CA)).stop(12); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(CAAref, C)).make(), - root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); - - root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(CAAAref, C)).make(), - root.alleles(Arrays.asList(CAAref, C)).stop(12).make(), - root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); - - root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13); - tests.add(new Object[]{root.make(), - Arrays.asList( - root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(), - root.alleles(Arrays.asList(Cref, CA)).stop(10).make(), - 
root.alleles(Arrays.asList(CAref, C)).stop(11).make(), - root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())}); - - final Allele threeCopies = Allele.create("GTTTTATTTTATTTTA", true); - final Allele twoCopies = Allele.create("GTTTTATTTTA", true); - final Allele zeroCopies = Allele.create("G", false); - final Allele oneCopies = Allele.create("GTTTTA", false); - tests.add(new Object[]{root.alleles(Arrays.asList(threeCopies, zeroCopies, oneCopies)).stop(25).make(), - Arrays.asList( - root.alleles(Arrays.asList(threeCopies, zeroCopies)).stop(25).make(), - root.alleles(Arrays.asList(twoCopies, zeroCopies)).stop(20).make())}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics") - public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List expectedBiallelics) { - final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); - Assert.assertEquals(biallelics.size(), expectedBiallelics.size()); - for ( int i = 0; i < biallelics.size(); i++ ) { - final VariantContext actual = biallelics.get(i); - final VariantContext expected = expectedBiallelics.get(i); - assertVariantContextsAreEqual(actual, expected); - } - } - - @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes") - public void testSplitBiallelicsGenotypes(final VariantContext vc, final List expectedBiallelics) { - final List genotypes = new ArrayList(); - - int sampleI = 0; - for ( final List alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) { - genotypes.add(GenotypeBuilder.create("sample" + sampleI++, alleles)); - } - genotypes.add(GenotypeBuilder.createMissing("missing", 2)); - - final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make(); - - final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes); - for ( int i = 0; i < biallelics.size(); i++ ) { - final VariantContext 
actual = biallelics.get(i); - Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples - - for ( final Genotype inputGenotype : genotypes ) { - final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName()); - Assert.assertNotNull(actualGenotype); - if ( ! vc.isVariant() || vc.isBiallelic() ) - Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName())); - else - Assert.assertTrue(actualGenotype.isNoCall()); - } - } - } - - // -------------------------------------------------------------------------------- - // - // Test repeats - // - // -------------------------------------------------------------------------------- - - private class RepeatDetectorTest extends TestDataProvider { - String ref; - boolean isTrueRepeat; - VariantContext vc; - - private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) { - super(RepeatDetectorTest.class); - this.isTrueRepeat = isTrueRepeat; - this.ref = ref; - - List alleles = new LinkedList(); - final Allele refAllele = Allele.create(refAlleleString, true); - alleles.add(refAllele); - for ( final String altString: altAlleleStrings) { - final Allele alt = Allele.create(altString, false); - alleles.add(alt); - } - - VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, refAllele.length(), alleles); - this.vc = builder.make(); - } - - public String toString() { - return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc); - } - } - - @DataProvider(name = "RepeatDetectorTest") - public Object[][] makeRepeatDetectorTest() { - new RepeatDetectorTest(true, "NAAC", "N", "NA"); - new RepeatDetectorTest(true, "NAAC", "NA", "N"); - new RepeatDetectorTest(false, "NAAC", "NAA", "N"); - new RepeatDetectorTest(false, "NAAC", "N", "NC"); - new RepeatDetectorTest(false, "AAC", "A", "C"); - - // running out of ref bases => false - new 
RepeatDetectorTest(false, "NAAC", "N", "NCAGTA"); - - // complex repeats - new RepeatDetectorTest(true, "NATATATC", "N", "NAT"); - new RepeatDetectorTest(true, "NATATATC", "N", "NATA"); - new RepeatDetectorTest(true, "NATATATC", "N", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N"); - new RepeatDetectorTest(false, "NATATATC", "NATA", "N"); - new RepeatDetectorTest(false, "NATATATC", "NATAT", "N"); - - // multi-allelic - new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATA"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATAT"); - new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATA"); // two As - new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NATC"); // false - new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NCC"); // false - new RepeatDetectorTest(false, "NATATATC", "NAT", "NATAT", "NCC"); // false - - return RepeatDetectorTest.getTests(RepeatDetectorTest.class); - } - - @Test(enabled = !DEBUG, dataProvider = "RepeatDetectorTest") - public void testRepeatDetectorTest(RepeatDetectorTest cfg) { - - // test alleles are equal - Assert.assertEquals(GATKVariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat); - } - - @Test(enabled = !DEBUG) - public void testRepeatAllele() { - Allele nullR = Allele.create("A", true); - Allele nullA = Allele.create("A", false); - Allele atc = Allele.create("AATC", false); - Allele atcatc = Allele.create("AATCATC", false); - Allele ccccR = Allele.create("ACCCC", true); - Allele cc = Allele.create("ACC", false); - Allele cccccc = Allele.create("ACCCCCC", false); - Allele gagaR = Allele.create("AGAGA", true); - Allele gagagaga = Allele.create("AGAGAGAGA", false); - - // - / ATC [ref] from 20-22 - String delLoc = "chr1"; - int delLocStart = 20; - int delLocStop = 22; - - // - [ref] / ATC from 20-20 - String insLoc = "chr1"; - int insLocStart = 20; - int insLocStop = 20; - - Pair,byte[]> result; - 
byte[] refBytes = "TATCATCATCGGA".getBytes(); - - Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4); - Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0); - Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("T".getBytes(), "T".getBytes(), true),1); - Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1); - Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2); - - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACAC".getBytes()),7); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACA".getBytes()),2); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CATGCATG".getBytes()),4); - Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AATAATA".getBytes()),7); - - - // A*,ATC, context = ATC ATC ATC : (ATC)3 -> (ATC)4 - VariantContext vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(nullR,atc)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],3); - Assert.assertEquals(result.getFirst().toArray()[1],4); - Assert.assertEquals(result.getSecond().length,3); - - // ATC*,A,ATCATC - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+3, Arrays.asList(Allele.create("AATC", true),nullA,atcatc)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],3); - Assert.assertEquals(result.getFirst().toArray()[1],2); - 
Assert.assertEquals(result.getFirst().toArray()[2],4); - Assert.assertEquals(result.getSecond().length,3); - - // simple non-tandem deletion: CCCC*, - - refBytes = "TCCCCCCCCATG".getBytes(); - vc = new VariantContextBuilder("foo", delLoc, 10, 14, Arrays.asList(ccccR,nullA)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],8); - Assert.assertEquals(result.getFirst().toArray()[1],4); - Assert.assertEquals(result.getSecond().length,1); - - // CCCC*,CC,-,CCCCCC, context = CCC: (C)7 -> (C)5,(C)3,(C)9 - refBytes = "TCCCCCCCAGAGAGAG".getBytes(); - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(ccccR,cc, nullA,cccccc)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],7); - Assert.assertEquals(result.getFirst().toArray()[1],5); - Assert.assertEquals(result.getFirst().toArray()[2],3); - Assert.assertEquals(result.getFirst().toArray()[3],9); - Assert.assertEquals(result.getSecond().length,1); - - // GAGA*,-,GAGAGAGA - refBytes = "TGAGAGAGAGATTT".getBytes(); - vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(gagaR, nullA,gagagaga)).make(); - result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); - Assert.assertEquals(result.getFirst().toArray()[0],5); - Assert.assertEquals(result.getFirst().toArray()[1],3); - Assert.assertEquals(result.getFirst().toArray()[2],7); - Assert.assertEquals(result.getSecond().length,2); - - } - - // -------------------------------------------------------------------------------- - // - // test forward clipping - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "ForwardClippingData") - public Object[][] makeForwardClippingData() { - List tests = new ArrayList(); - - // this functionality can be adapted to provide input 
data for whatever you might want in your data - tests.add(new Object[]{Arrays.asList("A"), -1}); - tests.add(new Object[]{Arrays.asList(""), -1}); - tests.add(new Object[]{Arrays.asList("A", "C"), -1}); - tests.add(new Object[]{Arrays.asList("AC", "C"), -1}); - tests.add(new Object[]{Arrays.asList("A", "G"), -1}); - tests.add(new Object[]{Arrays.asList("A", "T"), -1}); - tests.add(new Object[]{Arrays.asList("GT", "CA"), -1}); - tests.add(new Object[]{Arrays.asList("GT", "CT"), -1}); - tests.add(new Object[]{Arrays.asList("ACC", "AC"), 0}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), 2}); - tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), 0}); - tests.add(new Object[]{Arrays.asList("A", ""), -1}); - for ( int len = 0; len < 50; len++ ) - tests.add(new Object[]{Arrays.asList("A" + new String(Utils.dupBytes((byte)'C', len)), "C"), -1}); - - tests.add(new Object[]{Arrays.asList("A", "T", "C"), -1}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), 0}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "A"), -1}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), 0}); - tests.add(new Object[]{Arrays.asList("AC", "AC", "ACG"), 0}); - tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), 0}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), 1}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), 1}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "ForwardClippingData") - public void testForwardClipping(final List alleleStrings, final int expectedClip) { - final List alleles = new LinkedList(); - for ( final String alleleString : alleleStrings ) - alleles.add(Allele.create(alleleString)); - - for ( final List myAlleles : Utils.makePermutations(alleles, alleles.size(), false)) { - final int actual = 
GATKVariantContextUtils.computeForwardClipping(myAlleles); - Assert.assertEquals(actual, expectedClip); - } - } - - @DataProvider(name = "ClipAlleleTest") - public Object[][] makeClipAlleleTest() { - List tests = new ArrayList(); - - // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{Arrays.asList("ACC", "AC"), Arrays.asList("AC", "A"), 0}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), Arrays.asList("GC", "G"), 2}); - tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), Arrays.asList("C", "A"), 3}); - tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), Arrays.asList("AC", "A"), 0}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), Arrays.asList("T", "C", "G"), 1}); - tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), Arrays.asList("T", "C", "CG"), 1}); - tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), Arrays.asList("C", "CT", "CG"), 1}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), Arrays.asList("G", "GT", "GTA"), 2}); - tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), Arrays.asList("G", "GT", "GCA"), 2}); - - // trims from left and right - tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2}); - tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2}); - tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "ClipAlleleTest") - public void testClipAlleles(final List alleleStrings, final List expected, final int numLeftClipped) { - final int start = 10; - final VariantContext unclipped = GATKVariantContextUtils.makeFromAlleles("test", "20", start, alleleStrings); - final VariantContext clipped = GATKVariantContextUtils.trimAlleles(unclipped, true, true); - - Assert.assertEquals(clipped.getStart(), unclipped.getStart() + numLeftClipped); - for 
( int i = 0; i < unclipped.getAlleles().size(); i++ ) { - final Allele trimmed = clipped.getAlleles().get(i); - Assert.assertEquals(trimmed.getBaseString(), expected.get(i)); - } - } - - // -------------------------------------------------------------------------------- - // - // test primitive allele splitting - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "PrimitiveAlleleSplittingData") - public Object[][] makePrimitiveAlleleSplittingData() { - List tests = new ArrayList<>(); - - // no split - tests.add(new Object[]{"A", "C", 0, null}); - tests.add(new Object[]{"A", "AC", 0, null}); - tests.add(new Object[]{"AC", "A", 0, null}); - - // one split - tests.add(new Object[]{"ACA", "GCA", 1, Arrays.asList(0)}); - tests.add(new Object[]{"ACA", "AGA", 1, Arrays.asList(1)}); - tests.add(new Object[]{"ACA", "ACG", 1, Arrays.asList(2)}); - - // two splits - tests.add(new Object[]{"ACA", "GGA", 2, Arrays.asList(0, 1)}); - tests.add(new Object[]{"ACA", "GCG", 2, Arrays.asList(0, 2)}); - tests.add(new Object[]{"ACA", "AGG", 2, Arrays.asList(1, 2)}); - - // three splits - tests.add(new Object[]{"ACA", "GGG", 3, Arrays.asList(0, 1, 2)}); - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "PrimitiveAlleleSplittingData") - public void testPrimitiveAlleleSplitting(final String ref, final String alt, final int expectedSplit, final List variantPositions) { - - final int start = 10; - final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); - - final List result = GATKVariantContextUtils.splitIntoPrimitiveAlleles(vc); - - if ( expectedSplit > 0 ) { - Assert.assertEquals(result.size(), expectedSplit); - for ( int i = 0; i < variantPositions.size(); i++ ) { - Assert.assertEquals(result.get(i).getStart(), start + variantPositions.get(i)); - } - } else { - Assert.assertEquals(result.size(), 1); - Assert.assertEquals(vc, 
result.get(0)); - } - } - - // -------------------------------------------------------------------------------- - // - // test allele remapping - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "AlleleRemappingData") - public Object[][] makeAlleleRemappingData() { - List tests = new ArrayList<>(); - - final Allele originalBase1 = Allele.create((byte)'A'); - final Allele originalBase2 = Allele.create((byte)'T'); - - for ( final byte base1 : BaseUtils.BASES ) { - for ( final byte base2 : BaseUtils.BASES ) { - for ( final int numGenotypes : Arrays.asList(0, 1, 2, 5) ) { - Map map = new HashMap<>(2); - map.put(originalBase1, Allele.create(base1)); - map.put(originalBase2, Allele.create(base2)); - - tests.add(new Object[]{map, numGenotypes}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "AlleleRemappingData") - public void testAlleleRemapping(final Map alleleMap, final int numGenotypes) { - - final GATKVariantContextUtils.AlleleMapper alleleMapper = new GATKVariantContextUtils.AlleleMapper(alleleMap); - - final GenotypesContext originalGC = createGenotypesContext(numGenotypes, new ArrayList(alleleMap.keySet())); - - final GenotypesContext remappedGC = GATKVariantContextUtils.updateGenotypesWithMappedAlleles(originalGC, alleleMapper); - - for ( int i = 0; i < numGenotypes; i++ ) { - final Genotype originalG = originalGC.get(String.format("%d", i)); - final Genotype remappedG = remappedGC.get(String.format("%d", i)); - - Assert.assertEquals(originalG.getAlleles().size(), remappedG.getAlleles().size()); - for ( int j = 0; j < originalG.getAlleles().size(); j++ ) - Assert.assertEquals(remappedG.getAllele(j), alleleMap.get(originalG.getAllele(j))); - } - } - - private static GenotypesContext createGenotypesContext(final int numGenotypes, final List alleles) { - GenomeAnalysisEngine.resetRandomGenerator(); - final Random random = 
GenomeAnalysisEngine.getRandomGenerator(); - - final GenotypesContext gc = GenotypesContext.create(); - for ( int i = 0; i < numGenotypes; i++ ) { - // choose alleles at random - final List myAlleles = new ArrayList(); - myAlleles.add(alleles.get(random.nextInt(2))); - myAlleles.add(alleles.get(random.nextInt(2))); - - final Genotype g = new GenotypeBuilder(String.format("%d", i)).alleles(myAlleles).make(); - gc.add(g); - } - - return gc; - } - - // -------------------------------------------------------------------------------- - // - // Test subsetDiploidAlleles - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "subsetDiploidAllelesData") - public Object[][] makesubsetDiploidAllelesData() { - List tests = new ArrayList<>(); - - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); - final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); - final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); - - final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); - - final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); - final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); - final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); - final double[] uninformative = new double[]{0, 0, 0}; - - final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(50).make(); - - // make sure we don't screw up the simple case - final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); - final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); - final Genotype ccGT = new 
GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); - - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), AC, Arrays.asList(new GenotypeBuilder(aaGT).make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), AC, Arrays.asList(new GenotypeBuilder(acGT).make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), AC, Arrays.asList(new GenotypeBuilder(ccGT).make())}); - - // uninformative test case - final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).PL(uninformative).GQ(0).make(); - final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noPL().noGQ().make(); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), AC, Arrays.asList(emptyGT)}); - - // actually subsetting down from multiple alt values - final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; - final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; - final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; - final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; - final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG - final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homRef3AllelesPL).make()).make(), - AC, - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetRefC3AllelesPL).make()).make(), - AC, - Arrays.asList(new GenotypeBuilder(base).alleles(AC).PL(new double[]{-10, 0, -20}).GQ(100).make())}); - - 
tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homC3AllelesPL).make()).make(), - AC, - Arrays.asList(new GenotypeBuilder(base).alleles(CC).PL(new double[]{-20, -10, 0}).GQ(100).make())}); - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetRefG3AllelesPL).make()).make(), - AG, - Arrays.asList(new GenotypeBuilder(base).alleles(AG).PL(new double[]{-20, 0, -50}).GQ(200).make())}); - - // wow, scary -- bad output but discussed with Eric and we think this is the only thing that can be done - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetCG3AllelesPL).make()).make(), - AG, - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).GQ(200).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homG3AllelesPL).make()).make(), - AG, - Arrays.asList(new GenotypeBuilder(base).alleles(GG).PL(new double[]{-20, -40, 0}).GQ(200).make())}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "subsetDiploidAllelesData") - public void testsubsetDiploidAllelesData(final VariantContext inputVC, - final List allelesToUse, - final List expectedGenotypes) { - final GenotypesContext actual = GATKVariantContextUtils.subsetDiploidAlleles(inputVC, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); - - Assert.assertEquals(actual.size(), expectedGenotypes.size()); - for ( final Genotype expected : expectedGenotypes ) { - final Genotype actualGT = actual.get(expected.getSampleName()); - Assert.assertNotNull(actualGT); - assertGenotypesAreEqual(actualGT, expected); - } - } - - @DataProvider(name = "UpdateGenotypeAfterSubsettingData") - public Object[][] makeUpdateGenotypeAfterSubsettingData() { - List 
tests = new ArrayList(); - - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); - final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); - final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); - final List> allSubsetAlleles = Arrays.asList(AC,AG,ACG); - - final double[] homRefPL = new double[]{0.9, 0.09, 0.01}; - final double[] hetPL = new double[]{0.09, 0.9, 0.01}; - final double[] homVarPL = new double[]{0.01, 0.09, 0.9}; - final double[] uninformative = new double[]{0.33, 0.33, 0.33}; - final List allPLs = Arrays.asList(homRefPL, hetPL, homVarPL, uninformative); - - for ( final List alleles : allSubsetAlleles ) { - for ( final double[] pls : allPLs ) { - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL, pls, AA, alleles, GATKVariantContextUtils.NO_CALL_ALLELES}); - } - } - - for ( final List originalGT : Arrays.asList(AA, AC, CC, AG, CG, GG) ) { - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homRefPL, originalGT, AC, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, hetPL, originalGT, AC, AC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homVarPL, originalGT, AC, CC}); -// tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, uninformative, AA, AC, GATKVariantContextUtils.NO_CALL_ALLELES}); - } - - for ( final double[] pls : allPLs ) { - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AC, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AC, AC}); - tests.add(new 
Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AC, CC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AC, AC}); - - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AG, AG}); - - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, ACG, AA}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, ACG, AC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, ACG, CC}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AG, ACG, AG}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, ACG, CG}); - tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, GG, ACG, GG}); - } - - return tests.toArray(new Object[][]{}); - } - - @Test(enabled = !DEBUG, dataProvider = "UpdateGenotypeAfterSubsettingData") - public void testUpdateGenotypeAfterSubsetting(final GATKVariantContextUtils.GenotypeAssignmentMethod mode, - final double[] likelihoods, - final List originalGT, - final List allelesToUse, - final List expectedAlleles) { - final GenotypeBuilder gb = new GenotypeBuilder("test"); - final double[] log10Likelhoods = MathUtils.normalizeFromLog10(likelihoods, true, false); - GATKVariantContextUtils.updateGenotypeAfterSubsetting(originalGT, gb, mode, log10Likelhoods, 
allelesToUse); - final Genotype g = gb.make(); - Assert.assertEquals(new HashSet<>(g.getAlleles()), new HashSet<>(expectedAlleles)); - } - - @Test(enabled = !DEBUG) - public void testSubsetToRef() { - final Map tests = new LinkedHashMap<>(); - - for ( final List alleles : Arrays.asList(Arrays.asList(Aref), Arrays.asList(C), Arrays.asList(Aref, C), Arrays.asList(Aref, C, C) ) ) { - for ( final String name : Arrays.asList("test1", "test2") ) { - final GenotypeBuilder builder = new GenotypeBuilder(name, alleles); - builder.DP(10); - builder.GQ(30); - builder.AD(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1, 2} : new int[]{1, 2, 3})); - builder.PL(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1,2} : new int[]{1,2,3})); - final List refs = Collections.nCopies(alleles.size(), Aref); - tests.put(builder.make(), builder.alleles(refs).noAD().noPL().make()); - } - } - - for ( final int n : Arrays.asList(1, 2, 3) ) { - for ( final List genotypes : Utils.makePermutations(new ArrayList<>(tests.keySet()), n, false) ) { - final VariantContext vc = new VariantContextBuilder("test", "20", 1, 1, Arrays.asList(Aref, C)).genotypes(genotypes).make(); - final GenotypesContext gc = GATKVariantContextUtils.subsetToRefOnly(vc, 2); - - Assert.assertEquals(gc.size(), genotypes.size()); - for ( int i = 0; i < genotypes.size(); i++ ) { -// logger.warn("Testing " + genotypes.get(i) + " => " + gc.get(i) + " " + tests.get(genotypes.get(i))); - assertGenotypesAreEqual(gc.get(i), tests.get(genotypes.get(i))); - } - } - } - } - - // -------------------------------------------------------------------------------- - // - // Test updatePLsAndAD - // - // -------------------------------------------------------------------------------- - - @DataProvider(name = "updatePLsAndADData") - public Object[][] makeUpdatePLsAndADData() { - List tests = new ArrayList<>(); - - final Allele A = Allele.create("A", true); - final Allele C = Allele.create("C"); - final 
Allele G = Allele.create("G"); - - final List AA = Arrays.asList(A,A); - final List AC = Arrays.asList(A,C); - final List CC = Arrays.asList(C,C); - final List AG = Arrays.asList(A,G); - final List CG = Arrays.asList(C,G); - final List GG = Arrays.asList(G,G); - final List ACG = Arrays.asList(A,C,G); - - final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); - - final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); - final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); - final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); - final double[] uninformative = new double[]{0, 0, 0}; - - final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(100).make(); - - // make sure we don't screw up the simple case where no selection happens - final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); - final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); - final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); - - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(aaGT).make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(acGT).make())}); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(ccGT).make())}); - - // uninformative test cases - final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make(); - tests.add(new Object[]{new 
VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(uninformativeGT)}); - final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make(); - tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(emptyGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(emptyGT)}); - - // actually subsetting down from multiple alt values - final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; - final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; - final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; - final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; - final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG - final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG - - final int[] homRef3AllelesAD = new int[]{20, 0, 1}; - final int[] hetRefC3AllelesAD = new int[]{10, 10, 1}; - final int[] homC3AllelesAD = new int[]{0, 20, 1}; - final int[] hetRefG3AllelesAD = new int[]{10, 0, 11}; - final int[] hetCG3AllelesAD = new int[]{0, 12, 11}; // AA, AC, CC, AG, CG, GG - final int[] homG3AllelesAD = new int[]{0, 1, 21}; // AA, AC, CC, AG, CG, GG - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homRef3AllelesAD).PL(homRef3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).AD(new int[]{20, 0}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefC3AllelesAD).PL(hetRefC3AllelesPL).make()).make(), - new 
VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-10, 0, -20}).AD(new int[]{10, 10}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homC3AllelesAD).PL(homC3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AC).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -10, 0}).AD(new int[]{0, 20}).GQ(100).make())}); - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefG3AllelesAD).PL(hetRefG3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, 0, -50}).AD(new int[]{10, 11}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetCG3AllelesAD).PL(hetCG3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).AD(new int[]{0, 11}).GQ(100).make())}); - - tests.add(new Object[]{ - new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homG3AllelesAD).PL(homG3AllelesPL).make()).make(), - new VariantContextBuilder(vcBase).alleles(AG).make(), - Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -40, 0}).AD(new int[]{0, 21}).GQ(100).make())}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "updatePLsAndADData") - public void testUpdatePLsAndADData(final VariantContext originalVC, - final VariantContext selectedVC, - final List expectedGenotypes) { - final VariantContext selectedVCwithGTs = new VariantContextBuilder(selectedVC).genotypes(originalVC.getGenotypes()).make(); - final 
GenotypesContext actual = GATKVariantContextUtils.updatePLsAndAD(selectedVCwithGTs, originalVC); - - Assert.assertEquals(actual.size(), expectedGenotypes.size()); - for ( final Genotype expected : expectedGenotypes ) { - final Genotype actualGT = actual.get(expected.getSampleName()); - Assert.assertNotNull(actualGT); - assertGenotypesAreEqual(actualGT, expected); - } - } - - // -------------------------------------------------------------------------------- - // - // Test methods for merging reference confidence VCs - // - // -------------------------------------------------------------------------------- - - - @Test(dataProvider = "indexOfAlleleData") - public void testIndexOfAllele(final Allele reference, final List altAlleles, final List otherAlleles) { - final List alleles = new ArrayList<>(altAlleles.size() + 1); - alleles.add(reference); - alleles.addAll(altAlleles); - final VariantContext vc = makeVC("Source", alleles); - - for (int i = 0; i < alleles.size(); i++) { - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,true,true),i); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,true,true),i); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,true,false),i); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,true,false),i); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i),true),true,true,true),i); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i),true),true,true,false),-1); - if (i == 0) { - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,false,true),-1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,false,true),-1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,false,false),-1); - 
Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,false,false),-1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i).getBases(),true),false,true,true),i); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i).getBases(),false),false,true,true),-1); - } else { - Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,alleles.get(i),true),i - 1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,alleles.get(i),false), i - 1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,Allele.create(alleles.get(i),true),true),i-1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,Allele.create(alleles.get(i),true),false),-1); - } - } - - for (final Allele other : otherAlleles) { - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc, other, true, true, true), -1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,true,true),-1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,true,false),-1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,true,false),-1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,false,true),-1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,false,true),-1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,false,false),-1); - Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc, other, false, false, false),-1); - } - } - - @DataProvider(name = "indexOfAlleleData") - public Iterator indexOfAlleleData() { - - final Allele[] ALTERNATIVE_ALLELES = new Allele[] { T, C, G, ATC, ATCATC}; - - final int lastMask = 0x1F; - - return new Iterator() { - - int nextMask = 0; - - @Override - public boolean hasNext() { - return nextMask <= lastMask; - } - - @Override - public Object[] next() { - - int mask = nextMask++; - final List 
includedAlleles = new ArrayList<>(5); - final List excludedAlleles = new ArrayList<>(5); - for (int i = 0; i < ALTERNATIVE_ALLELES.length; i++) { - ((mask & 1) == 1 ? includedAlleles : excludedAlleles).add(ALTERNATIVE_ALLELES[i]); - mask >>= 1; - } - return new Object[] { Aref , includedAlleles, excludedAlleles}; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - }; - } - - @Test(dataProvider="overlapWithData") - public void testOverlapsWith(final VariantContext vc, final GenomeLoc genomeLoc) { - final boolean expected; - - if (genomeLoc.isUnmapped()) - expected = false; - else if (vc.getStart() > genomeLoc.getStop()) - expected = false; - else if (vc.getEnd() < genomeLoc.getStart()) - expected = false; - else if (!vc.getChr().equals(genomeLoc.getContig())) - expected = false; - else - expected = true; - - Assert.assertEquals(GATKVariantContextUtils.overlapsRegion(vc, genomeLoc), expected); - } - - - private final String[] OVERLAP_WITH_CHROMOSOMES = { "chr1", "chr20" }; - private final int[] OVERLAP_WITH_EVENT_SIZES = { -10, -1, 0, 1, 10 }; // 0 == SNP , -X xbp deletion, +X xbp insertion. 
- private final int[] OVERLAP_WITH_EVENT_STARTS = { 10000000, 10000001, - 10000005, 10000010, - 10000009, 10000011, - 20000000 }; - - @DataProvider(name="overlapWithData") - public Object[][] overlapWithData() { - - final int totalLocations = OVERLAP_WITH_CHROMOSOMES.length * OVERLAP_WITH_EVENT_SIZES.length * OVERLAP_WITH_EVENT_STARTS.length + 1; - final int totalEvents = OVERLAP_WITH_CHROMOSOMES.length * OVERLAP_WITH_EVENT_SIZES.length * OVERLAP_WITH_EVENT_STARTS.length; - final GenomeLoc[] locs = new GenomeLoc[totalLocations]; - final VariantContext[] events = new VariantContext[totalEvents]; - - generateAllLocationsAndVariantContextCombinations(OVERLAP_WITH_CHROMOSOMES, OVERLAP_WITH_EVENT_SIZES, - OVERLAP_WITH_EVENT_STARTS, locs, events); - - return generateAllParameterCombinationsForOverlapWithData(locs, events); - } - - private Object[][] generateAllParameterCombinationsForOverlapWithData(GenomeLoc[] locs, VariantContext[] events) { - final List result = new LinkedList<>(); - for (final GenomeLoc loc : locs) - for (final VariantContext event : events) - result.add(new Object[] { event , loc }); - - return result.toArray(new Object[result.size()][]); - } - - private void generateAllLocationsAndVariantContextCombinations(final String[] chrs, final int[] eventSizes, - final int[] eventStarts, final GenomeLoc[] locs, - final VariantContext[] events) { - int nextIndex = 0; - for (final String chr : chrs ) - for (final int size : eventSizes ) - for (final int starts : eventStarts ) { - locs[nextIndex] = genomeLocParser.createGenomeLoc(chr,starts,starts + Math.max(0,size)); - events[nextIndex++] = new VariantContextBuilder().source("test").loc(chr,starts,starts + Math.max(0,size)).alleles(Arrays.asList( - Allele.create(randomBases(size <= 0 ? 1 : size + 1, true), true), Allele.create(randomBases(size < 0 ? 
-size + 1 : 1, false), false))).make(); - } - - locs[nextIndex++] = GenomeLoc.UNMAPPED; - } - - @Test(dataProvider = "totalPloidyData") - public void testTotalPloidy(final int[] ploidies, final int defaultPloidy, final int expected) { - final Genotype[] genotypes = new Genotype[ploidies.length]; - final List vcAlleles = Arrays.asList(Aref,C); - for (int i = 0; i < genotypes.length; i++) - genotypes[i] = new GenotypeBuilder().alleles(GATKVariantContextUtils.noCallAlleles(ploidies[i])).make(); - final VariantContext vc = new VariantContextBuilder().chr("seq1").genotypes(genotypes).alleles(vcAlleles).make(); - Assert.assertEquals(GATKVariantContextUtils.totalPloidy(vc,defaultPloidy),expected," " + defaultPloidy + " " + Arrays.toString(ploidies)); - } - - @DataProvider(name="totalPloidyData") - public Object[][] totalPloidyData() { - final Random rdn = GenomeAnalysisEngine.getRandomGenerator(); - final List resultList = new ArrayList<>(); - for (int i = 0; i < 100; i++) { - final int sampleCount = rdn.nextInt(10); - - int expected = 0; - final int defaultPloidy = rdn.nextInt(10) + 1; - final int[] plodies = new int[sampleCount]; - for (int j = 0; j < sampleCount; j++) { - plodies[j] = rdn.nextInt(10); - expected += plodies[j] == 0 ? defaultPloidy : plodies[j]; - } - resultList.add(new Object[] { plodies, defaultPloidy, expected }); - } - return resultList.toArray(new Object[100][]); - } - - private byte[] randomBases(final int length, final boolean reference) { - final byte[] bases = new byte[length]; - bases[0] = (byte) (reference ? 
'A' : 'C'); - BaseUtils.fillWithRandomBases(bases, 1, bases.length); - return bases; - } -} - diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/VCFIntegrationTest.java b/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/VCFIntegrationTest.java deleted file mode 100644 index 4a087025a..000000000 --- a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/VCFIntegrationTest.java +++ /dev/null @@ -1,377 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ - -package org.broadinstitute.gatk.utils.variant; - -import htsjdk.tribble.AbstractFeatureReader; -import htsjdk.tribble.Tribble; -import htsjdk.tribble.index.AbstractIndex; -import htsjdk.tribble.index.ChrIndex; -import htsjdk.tribble.index.Index; -import htsjdk.tribble.index.IndexFactory; -import htsjdk.tribble.index.interval.IntervalTreeIndex; -import htsjdk.tribble.index.linear.LinearIndex; -import htsjdk.tribble.index.tabix.TabixIndex; -import htsjdk.tribble.util.TabixUtils; -import org.broadinstitute.gatk.utils.BaseTest; -import org.broadinstitute.gatk.engine.walkers.WalkerTest; -import htsjdk.variant.vcf.VCFCodec; -import org.testng.Assert; -import org.testng.TestException; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.lang.reflect.Field; -import java.util.Arrays; -import java.util.LinkedHashMap; -import java.util.List; - -public class VCFIntegrationTest extends WalkerTest { - - @Test(enabled = true) - public void testReadingAndWritingWitHNoChanges() { - - String md5ofInputVCF = "d991abe6c6a7a778a60a667717903be0"; - String testVCF = privateTestDir + "vcf4.1.example.vcf"; - - String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; - - String test1 = baseCommand + "-T VariantAnnotator --variant " + testVCF + " -L " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList(md5ofInputVCF)); - List result = executeTest("Test Variant Annotator with no changes", spec1).getFirst(); - - String test2 = baseCommand + "-T VariantsToVCF --variant " + result.get(0).getAbsolutePath(); - WalkerTestSpec spec2 = new WalkerTestSpec(test2, 1, Arrays.asList(md5ofInputVCF)); - executeTest("Test Variants To VCF from new output", spec2); - } - - @Test(enabled = true) - public void testReadingAndWritingBreakpointAlleles() { - String testVCF = privateTestDir + "breakpoint-example.vcf"; - - String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o 
%s "; - - String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("13329ba7360a8beb3afc02569e5a20c4")); - executeTest("Test reading and writing breakpoint VCF", spec1); - } - - @Test(enabled = true) - public void testReadingLowerCaseBases() { - String testVCF = privateTestDir + "lowercaseBases.vcf"; - - String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; - - String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("e0e308a25e56bde1c664139bb44ed19d")); - executeTest("Test reading VCF with lower-case bases", spec1); - } - - @Test(enabled = true) - public void testReadingAndWriting1000GSVs() { - String testVCF = privateTestDir + "1000G_SVs.chr1.vcf"; - - String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; - - String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("bdab26dd7648a806dbab01f64db2bdab")); - executeTest("Test reading and writing 1000G Phase I SVs", spec1); - } - - @Test - public void testReadingAndWritingSamtools() { - String testVCF = privateTestDir + "samtools.vcf"; - - String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; - - String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("38697c195e7abf18d95dcc16c8e6d284")); - executeTest("Test reading and writing samtools vcf", spec1); - } - - @Test - public void testWritingSamtoolsWExBCFExample() { - String testVCF = privateTestDir + "ex2.vcf"; - String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; - String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("e8f721ce81e4fdadba13c5291027057f")); - executeTest("Test writing samtools 
WEx BCF example", spec1); - } - - @Test(enabled = true) - public void testReadingSamtoolsWExBCFExample() { - String testVCF = privateTestDir + "ex2.bcf"; - String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; - String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("0439e2b4ccc63bb4ba7c283cd9ab1b25")); - executeTest("Test reading samtools WEx BCF example", spec1); - } - - // - // - // Tests to ensure that -U LENIENT_VCF_PROCESS - // - // - - @Test - public void testFailingOnVCFWithoutHeaders() { - runVCFWithoutHeaders("", "", IllegalStateException.class, false); - } - - @Test - public void testPassingOnVCFWithoutHeadersWithLenientProcessing() { - runVCFWithoutHeaders("-U LENIENT_VCF_PROCESSING", "6de8cb7457154dd355aa55befb943f88", null, true); - } - - private void runVCFWithoutHeaders(final String moreArgs, final String expectedMD5, final Class expectedException, final boolean disableBCF) { - final String testVCF = privateTestDir + "vcfexample2.noHeader.vcf"; - final String baseCommand = "-R " + b37KGReference - + " --no_cmdline_in_header -o %s " - + "-T VariantsToVCF -V " + testVCF + " " + moreArgs; - WalkerTestSpec spec1 = expectedException != null - ? 
new WalkerTestSpec(baseCommand, 1, expectedException) - : new WalkerTestSpec(baseCommand, 1, Arrays.asList(expectedMD5)); - if ( disableBCF ) - spec1.disableShadowBCF(); - executeTest("Test reading VCF without header lines with additional args " + moreArgs, spec1); - } - - // - // - // IndexCreator tests - // - // - - private class VCFIndexCreatorTest extends TestDataProvider { - private final GATKVCFIndexType type; - private final int parameter; - - private VCFIndexCreatorTest(GATKVCFIndexType type, int parameter) { - super(VCFIndexCreatorTest.class); - - this.type = type; - this.parameter = parameter; - } - - public String toString() { - return String.format("Index Type %s, Index Parameter %s", type, parameter); - } - - public Index getIndex(final File vcfFile) { - switch (type) { - case DYNAMIC_SEEK : return IndexFactory.createDynamicIndex(vcfFile, new VCFCodec(), IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - case DYNAMIC_SIZE : return IndexFactory.createDynamicIndex(vcfFile, new VCFCodec(), IndexFactory.IndexBalanceApproach.FOR_SIZE); - case LINEAR : return IndexFactory.createLinearIndex(vcfFile, new VCFCodec(), parameter); - case INTERVAL : return IndexFactory.createIntervalIndex(vcfFile, new VCFCodec(), parameter); - default : throw new TestException("Invalid index type"); - } - } - } - - @DataProvider(name = "IndexDataProvider") - public Object[][] indexCreatorData() { - new VCFIndexCreatorTest(GATKVCFIndexType.DYNAMIC_SEEK, 0); - new VCFIndexCreatorTest(GATKVCFIndexType.DYNAMIC_SIZE, 0); - new VCFIndexCreatorTest(GATKVCFIndexType.LINEAR, 100); - new VCFIndexCreatorTest(GATKVCFIndexType.LINEAR, 10000); - new VCFIndexCreatorTest(GATKVCFIndexType.INTERVAL, 20); - new VCFIndexCreatorTest(GATKVCFIndexType.INTERVAL, 2000); - - return TestDataProvider.getTests(VCFIndexCreatorTest.class); - } - - @Test(dataProvider = "IndexDataProvider") - public void testVCFIndexCreation(VCFIndexCreatorTest testSpec) throws NoSuchFieldException, IllegalAccessException { - - 
final String commandLine = " -T SelectVariants" + - " -R " + b37KGReference + - " --no_cmdline_in_header" + - " -L 20" + - " -V " + b37_NA12878_OMNI + - " --variant_index_type " + testSpec.type + - " --variant_index_parameter " + testSpec.parameter + - " -o %s "; - final String name = "testVCFIndexCreation: " + testSpec.toString(); - - final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList("")); - spec.disableShadowBCF(); - - File outVCF = executeTest(name, spec).first.get(0); - File outIdx = new File(outVCF.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION); - - final Index actualIndex = IndexFactory.loadIndex(outIdx.getAbsolutePath()); - final Index expectedIndex = testSpec.getIndex(outVCF); - - if (testSpec.type.equals("LINEAR")) - Assert.assertTrue(actualIndex instanceof LinearIndex, "Index is not a LinearIndex"); - else if (testSpec.type.equals("INTERVAL")) - Assert.assertTrue(actualIndex instanceof IntervalTreeIndex, "Index is not a IntervalTreeIndex"); - // dynamic indices ultimately resolve to one of LinearIndex or IntervalTreeIndex - - Assert.assertTrue(equivalentAbstractIndices((AbstractIndex)actualIndex, (AbstractIndex)expectedIndex), "Indices are not equivalent"); - - if (actualIndex instanceof LinearIndex && expectedIndex instanceof LinearIndex) { - Assert.assertTrue(equivalentLinearIndices((LinearIndex)actualIndex, (LinearIndex)expectedIndex, "20"), "Linear indices are not equivalent"); - } - else if (actualIndex instanceof IntervalTreeIndex && expectedIndex instanceof IntervalTreeIndex) { - Assert.assertTrue(equivalentIntervalIndices((IntervalTreeIndex)actualIndex, (IntervalTreeIndex)expectedIndex, "20"), "Interval indices are not equivalent"); - } - else { - Assert.fail("Indices are not of the same type"); - } - } - - private static boolean equivalentAbstractIndices(AbstractIndex thisIndex, AbstractIndex otherIndex){ - return thisIndex.getVersion() == otherIndex.getVersion() && - 
thisIndex.getIndexedFile().equals(otherIndex.getIndexedFile()) && - thisIndex.getIndexedFileSize() == otherIndex.getIndexedFileSize() && - thisIndex.getIndexedFileMD5().equals(otherIndex.getIndexedFileMD5()) && - thisIndex.getFlags() == otherIndex.getFlags(); - } - - private static boolean equivalentLinearIndices(LinearIndex thisIndex, LinearIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException { - htsjdk.tribble.index.linear.LinearIndex.ChrIndex thisChr = (htsjdk.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(thisIndex, chr); - htsjdk.tribble.index.linear.LinearIndex.ChrIndex otherChr = (htsjdk.tribble.index.linear.LinearIndex.ChrIndex)getChrIndex(otherIndex, chr); - - return thisChr.getName().equals(otherChr.getName()) && - //thisChr.getTotalSize() == otherChr.getTotalSize() && TODO: why does this differ? - thisChr.getNFeatures() == otherChr.getNFeatures() && - thisChr.getNBlocks() == otherChr.getNBlocks(); - } - - private static boolean equivalentIntervalIndices(IntervalTreeIndex thisIndex, IntervalTreeIndex otherIndex, String chr) throws NoSuchFieldException, IllegalAccessException { - htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex thisChr = (htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(thisIndex, chr); - htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex otherChr = (htsjdk.tribble.index.interval.IntervalTreeIndex.ChrIndex)getChrIndex(otherIndex, chr); - - // TODO: compare trees? 
- return thisChr.getName().equals(otherChr.getName()); - } - - private static ChrIndex getChrIndex(AbstractIndex index, String chr) throws NoSuchFieldException, IllegalAccessException { - Field f = AbstractIndex.class.getDeclaredField("chrIndices"); - f.setAccessible(true); - LinkedHashMap chrIndices = (LinkedHashMap) f.get(index); - return chrIndices.get(chr); - } - - // - // - // Block-Compressed Tabix Index Tests - // - // - - private class BlockCompressedIndexCreatorTest extends TestDataProvider { - private final String extension; - - private BlockCompressedIndexCreatorTest(String extension) { - super(BlockCompressedIndexCreatorTest.class); - - this.extension = extension; - } - - public String toString() { - return String.format("File extension %s", extension); - } - } - - @DataProvider(name = "BlockCompressedIndexDataProvider") - public Object[][] blockCompressedIndexCreatorData() { - for (final String extension : AbstractFeatureReader.BLOCK_COMPRESSED_EXTENSIONS) - new BlockCompressedIndexCreatorTest(".vcf" + extension); - - return TestDataProvider.getTests(BlockCompressedIndexCreatorTest.class); - } - - @Test(dataProvider = "BlockCompressedIndexDataProvider") - public void testBlockCompressedIndexCreation(BlockCompressedIndexCreatorTest testSpec) throws NoSuchFieldException, IllegalAccessException { - - final String commandLine = " -T SelectVariants" + - " -R " + b37KGReference + - " --no_cmdline_in_header" + - " -L 20" + - " -V " + b37_NA12878_OMNI; - final String name = "testBlockCompressedIndexCreation: " + testSpec.toString(); - - File outVCF = createTempFile("testBlockCompressedIndexCreation", testSpec.extension); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList("")); - spec.disableShadowBCF(); - spec.setOutputFileLocation(outVCF); - - executeTest(name, spec); - - File outTribbleIdx = new File(outVCF.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION); - Assert.assertFalse(outTribbleIdx.exists(), 
"testBlockCompressedIndexCreation: Want Tabix index but Tribble index exists: " + outTribbleIdx); - - File outTabixIdx = new File(outVCF.getAbsolutePath() + TabixUtils.STANDARD_INDEX_EXTENSION); - final Index actualIndex = IndexFactory.loadIndex(outTabixIdx.toString()); - Assert.assertTrue(actualIndex instanceof TabixIndex, "testBlockCompressedIndexCreation: Want Tabix index but index is not Tabix: " + outTabixIdx); - } - - // - // - // Block-Compressed Input Tests - // - // - - private class BlockCompressedInputTest extends TestDataProvider { - private final String extension; - - private BlockCompressedInputTest(String extension) { - super(BlockCompressedInputTest.class); - - this.extension = extension; - } - - public String toString() { - return String.format("File extension %s", extension); - } - } - - @DataProvider(name = "BlockCompressedInputDataProvider") - public Object[][] blockCompressedInputData() { - for (final String extension : AbstractFeatureReader.BLOCK_COMPRESSED_EXTENSIONS) - new BlockCompressedInputTest(".vcf" + extension); - - return TestDataProvider.getTests(BlockCompressedInputTest.class); - } - - @Test(dataProvider = "BlockCompressedInputDataProvider") - public void testBlockCompressedInput(BlockCompressedInputTest testSpec) { - - File inputFile = new File(BaseTest.privateTestDir, "block_compressed_input_test" + testSpec.extension); - final String commandLine = " -T SelectVariants" + - " -R " + b37KGReference + - " --no_cmdline_in_header" + - " -V " + inputFile + - " -o %s "; - final String name = "testBlockCompressedInput: " + testSpec.toString(); - - final WalkerTestSpec spec = new WalkerTestSpec(commandLine, 1, Arrays.asList("3b60668bd973e43783d0406de80d2ed2")); - - executeTest(name, spec); - } - -} diff --git a/public/gatk-utils/pom.xml b/public/gatk-utils/pom.xml index 27660dd61..e1aae406a 100644 --- a/public/gatk-utils/pom.xml +++ b/public/gatk-utils/pom.xml @@ -25,10 +25,6 @@ samtools htsjdk - - picard - picard - log4j log4j @@ -41,26 
+37,19 @@ it.unimi.dsi fastutil - - org.simpleframework - simple-xml - org.reflections reflections + org.slf4j - slf4j-log4j12 + slf4j-api org.freemarker freemarker - - org.apache.commons - commons-jexl - commons-lang commons-lang @@ -81,10 +70,6 @@ net.java.dev.jna jna - - net.java.dev.jets3t - jets3t - us.levk drmaa-gridengine @@ -117,6 +102,16 @@ + + org.apache.maven.plugins + maven-assembly-plugin + + + example-resources + ${gatk.generate-resources.phase} + + + org.apache.maven.plugins maven-dependency-plugin @@ -152,8 +147,6 @@ - org.apache.maven.plugins maven-invoker-plugin diff --git a/public/gatk-engine/src/main/assembly/example-resources.xml b/public/gatk-utils/src/main/assembly/example-resources.xml similarity index 100% rename from public/gatk-engine/src/main/assembly/example-resources.xml rename to public/gatk-utils/src/main/assembly/example-resources.xml diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/AutoFormattingTime.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/AutoFormattingTime.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/AutoFormattingTime.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/AutoFormattingTime.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/BaseUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/BaseUtils.java new file mode 100644 index 000000000..ecb8bbde5 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/BaseUtils.java @@ -0,0 +1,671 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell 
+* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils; + +import htsjdk.samtools.util.StringUtil; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.Random; + +/** + * BaseUtils contains some basic utilities for manipulating nucleotides. + */ +public class BaseUtils { + + public enum Base { + A ('A'), + C ('C'), + G ('G'), + T ('T'), + N ('N'), + D ('D'); + + public byte base; + + private Base(final char base) { + this.base = (byte)base; + } + } + + // todo -- add this to the generalized base abstraction using the Base enum. 
+ public final static byte[] BASES = {'A', 'C', 'G', 'T'}; + public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; + + static private final int[] baseIndexMap = new int[256]; + static { + Arrays.fill(baseIndexMap, -1); + baseIndexMap['A'] = Base.A.ordinal(); + baseIndexMap['a'] = Base.A.ordinal(); + baseIndexMap['*'] = Base.A.ordinal(); // the wildcard character counts as an A + baseIndexMap['C'] = Base.C.ordinal(); + baseIndexMap['c'] = Base.C.ordinal(); + baseIndexMap['G'] = Base.G.ordinal(); + baseIndexMap['g'] = Base.G.ordinal(); + baseIndexMap['T'] = Base.T.ordinal(); + baseIndexMap['t'] = Base.T.ordinal(); + } + + static private final int[] baseIndexWithIupacMap = baseIndexMap.clone(); + static { + baseIndexWithIupacMap['*'] = -1; // the wildcard character is bad + baseIndexWithIupacMap['N'] = Base.N.ordinal(); + baseIndexWithIupacMap['n'] = Base.N.ordinal(); + baseIndexWithIupacMap['R'] = Base.N.ordinal(); + baseIndexWithIupacMap['r'] = Base.N.ordinal(); + baseIndexWithIupacMap['Y'] = Base.N.ordinal(); + baseIndexWithIupacMap['y'] = Base.N.ordinal(); + baseIndexWithIupacMap['M'] = Base.N.ordinal(); + baseIndexWithIupacMap['m'] = Base.N.ordinal(); + baseIndexWithIupacMap['K'] = Base.N.ordinal(); + baseIndexWithIupacMap['k'] = Base.N.ordinal(); + baseIndexWithIupacMap['W'] = Base.N.ordinal(); + baseIndexWithIupacMap['w'] = Base.N.ordinal(); + baseIndexWithIupacMap['S'] = Base.N.ordinal(); + baseIndexWithIupacMap['s'] = Base.N.ordinal(); + baseIndexWithIupacMap['B'] = Base.N.ordinal(); + baseIndexWithIupacMap['b'] = Base.N.ordinal(); + baseIndexWithIupacMap['D'] = Base.N.ordinal(); + baseIndexWithIupacMap['d'] = Base.N.ordinal(); + baseIndexWithIupacMap['H'] = Base.N.ordinal(); + baseIndexWithIupacMap['h'] = Base.N.ordinal(); + baseIndexWithIupacMap['V'] = Base.N.ordinal(); + baseIndexWithIupacMap['v'] = Base.N.ordinal(); + } + + /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or + // a 
pyrimidine to another pyrimidine nucleotide (C <-> T). + // Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions. + public enum BaseSubstitutionType { + TRANSITION, // A <-> G or C <-> T + TRANSVERSION + } + + /** + * Returns the base substitution type of the 2 state SNP + * + * @param base1 + * @param base2 + * @return + */ + public static BaseSubstitutionType SNPSubstitutionType(byte base1, byte base2) { + BaseSubstitutionType t = isTransition(base1, base2) ? BaseSubstitutionType.TRANSITION : BaseSubstitutionType.TRANSVERSION; + //System.out.printf("SNPSubstitutionType( char %c, char %c ) => %s%n", base1, base2, t); + return t; + } + + public static boolean isTransition(byte base1, byte base2) { + final int b1 = simpleBaseToBaseIndex(base1); + final int b2 = simpleBaseToBaseIndex(base2); + return b1 == Base.A.ordinal() && b2 == Base.G.ordinal() || b1 == Base.G.ordinal() && b2 == Base.A.ordinal() || + b1 == Base.C.ordinal() && b2 == Base.T.ordinal() || b1 == Base.T.ordinal() && b2 == Base.C.ordinal(); + } + + public static boolean isTransversion(byte base1, byte base2) { + return !isTransition(base1, base2); + } + + /** + * Private constructor. No instantiating this class! + */ + private BaseUtils() {} + + static public boolean basesAreEqual(byte base1, byte base2) { + return simpleBaseToBaseIndex(base1) == simpleBaseToBaseIndex(base2); + } + + /** + * Checks whether to bases are the same in fact ignore ambiguous 'N' bases. + * + * @param base1 first base to compare. + * @param base2 second base to compare. + * @return true if {@code base1 == base2} or either is an 'N', false otherwise. + */ + static public boolean basesAreEqualIgnoreAmbiguous(final byte base1, final byte base2) { + if (base1 == base2) return true; + else if (base1 == 'n' || base1 == 'N' || base2 == 'N' || base2 == 'n') return true; + else return false; + } + + /** + * Compare to base arrays ranges checking whether they contain the same bases. + * + *

+ * By default two array have equal bases, i.e. {@code length == 0} results results in {@code true}. + *

+ * + * @param bases1 first base array to compare. + * @param offset1 position of the first base in bases1 to compare. + * @param bases2 second base array to compare. + * @param offset2 position of the first base in bases2 to compare. + * @param length number of bases to compare. + * + * @throws NullPointerException if {@code bases1} or {@code bases2} is {@code null}. + * @throws ArrayIndexOutOfBoundsException if: + *
    + *
  • {@code offset1} is not within the range [0,{@code bases1.length}) or
  • + *
  • {@code offset2} is not within the range [0,{@code bases2.length}) or
  • + *
  • {@code offset1 + length} is not within the range [0,{@code bases1.length}) or
  • + *
  • {@code offset2 + length} is not within the range [0,{@code bases2.length})
  • + *
+ * @return + */ + static public boolean basesAreEqualIgnoreAmbiguous(final byte[] bases1, final int offset1, final byte[] bases2, final int offset2, final int length) { + for (int i = 0; i < length; i++) + if (!basesAreEqualIgnoreAmbiguous(bases1[offset1 + i],bases2[offset2 + i])) return false; + return true; + } + + static public boolean extendedBasesAreEqual(byte base1, byte base2) { + return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2); + } + + /** + * @return true iff the bases array contains at least one instance of base + */ + static public boolean containsBase(final byte[] bases, final byte base) { + for ( final byte b : bases ) { + if ( b == base ) + return true; + } + return false; + } + + public static boolean isUpperCase(final byte[] bases) { + for ( byte base : bases ) + if ( ! isUpperCase(base) ) + return false; + return true; + } + + public static boolean isUpperCase(final byte base) { + return base >= 'A' && base <= 'Z'; + } + + public static byte[] convertIUPACtoN(final byte[] bases, final boolean errorOnBadReferenceBase, final boolean ignoreConversionOfFirstByte) { + final int length = bases.length; + final int start = ignoreConversionOfFirstByte ? 
1 : 0; + + for ( int i = start; i < length; i++ ) { + final int baseIndex = baseIndexWithIupacMap[bases[i]]; + if ( baseIndex == Base.N.ordinal() ) { + bases[i] = 'N'; + } else if ( errorOnBadReferenceBase && baseIndex == -1 ) { + throw new UserException.BadInput("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'"); + } + } + return bases; + } + + /** + * Converts a IUPAC nucleotide code to a pair of bases + * + * @param code + * @return 0, 1, 2, 3, or -1 if the base can't be understood + */ + @Deprecated + static public char[] iupacToBases(char code) { + char[] bases = new char[2]; + switch (code) { + case '*': // the wildcard character counts as an A + case 'A': + case 'a': + bases[0] = bases[1] = 'A'; + break; + case 'C': + case 'c': + bases[0] = bases[1] = 'C'; + break; + case 'G': + case 'g': + bases[0] = bases[1] = 'G'; + break; + case 'T': + case 't': + bases[0] = bases[1] = 'T'; + break; + case 'R': + case 'r': + bases[0] = 'A'; + bases[1] = 'G'; + break; + case 'Y': + case 'y': + bases[0] = 'C'; + bases[1] = 'T'; + break; + case 'S': + case 's': + bases[0] = 'G'; + bases[1] = 'C'; + break; + case 'W': + case 'w': + bases[0] = 'A'; + bases[1] = 'T'; + break; + case 'K': + case 'k': + bases[0] = 'G'; + bases[1] = 'T'; + break; + case 'M': + case 'm': + bases[0] = 'A'; + bases[1] = 'C'; + break; + default: + bases[0] = bases[1] = 'N'; + } + return bases; + } + + /** + * Converts a pair of bases to their IUPAC ambiguity code + * + * @param base1 1st base + * @param base2 2nd base + * @return byte + */ + static public byte basesToIUPAC(final byte base1, final byte base2) { + // ensure that the bases come in order + if ( base2 < base1 ) + return basesToIUPAC(base2, base1); + + // ensure that the bases are regular ones + if ( !isRegularBase(base1) || !isRegularBase(base2) ) + return Base.N.base; + + // IUPAC codes are not needed if the bases are identical + if ( basesAreEqual(base1, base2) ) + return base1; + + if ( base1 
== Base.A.base ) + return (byte)(base2 == Base.C.base ? 'M' : (base2 == Base.G.base ? 'R' : 'W')); + + if ( base1 == Base.C.base ) + return (byte)(base2 == Base.G.base ? 'S' : 'Y'); + + // the only possibility left is G/T + return 'K'; + } + + /** + * Converts a simple base to a base index + * + * @param base [AaCcGgTt] + * @return 0, 1, 2, 3, or -1 if the base can't be understood + */ + static public int simpleBaseToBaseIndex(final byte base) { + if ( base < 0 || base >= 256 ) + throw new UserException.BadInput("Non-standard bases were encountered in either the input reference or BAM file(s)"); + return baseIndexMap[base]; + } + + /** + * Converts a simple base to a base index + * + * @param base [AaCcGgTt] + * @return 0, 1, 2, 3, or -1 if the base can't be understood + */ + @Deprecated + static public int simpleBaseToBaseIndex(char base) { + return baseIndexMap[base]; + } + + static public int extendedBaseToBaseIndex(byte base) { + switch (base) { + case 'd': + case 'D': + return Base.D.ordinal(); + case 'n': + case 'N': + return Base.N.ordinal(); + + default: + return simpleBaseToBaseIndex(base); + } + } + + @Deprecated + static public boolean isRegularBase( final char base ) { + return simpleBaseToBaseIndex(base) != -1; + } + + static public boolean isRegularBase( final byte base ) { + return simpleBaseToBaseIndex(base) != -1; + } + + static public boolean isAllRegularBases( final byte[] bases ) { + for( final byte base : bases) { + if( !isRegularBase(base) ) { return false; } + } + return true; + } + + static public boolean isNBase(byte base) { + return base == 'N' || base == 'n'; + } + + /** + * Converts a base index to a simple base + * + * @param baseIndex 0, 1, 2, 3 + * @return A, C, G, T, or '.' 
if the index can't be understood + */ + static public byte baseIndexToSimpleBase(int baseIndex) { + switch (baseIndex) { + case 0: + return 'A'; + case 1: + return 'C'; + case 2: + return 'G'; + case 3: + return 'T'; + default: + return '.'; + } + } + + /** + * Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base). + * + * @param base the base [AaCcGgTt] + * @return the complementary base, or the input base if it's not one of the understood ones + */ + static public byte simpleComplement(byte base) { + switch (base) { + case 'A': + case 'a': + return 'T'; + case 'C': + case 'c': + return 'G'; + case 'G': + case 'g': + return 'C'; + case 'T': + case 't': + return 'A'; + default: + return base; + } + } + + @Deprecated + static private char simpleComplement(char base) { + return (char) simpleComplement((byte) base); + } + + /** + * Reverse complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form) + * + * @param bases the byte array of bases + * @return the reverse complement of the base byte array + */ + static public byte[] simpleReverseComplement(byte[] bases) { + byte[] rcbases = new byte[bases.length]; + + for (int i = 0; i < bases.length; i++) { + rcbases[i] = simpleComplement(bases[bases.length - 1 - i]); + } + + return rcbases; + } + + /** + * Reverse complement a char array of bases + * + * @param bases the char array of bases + * @return the reverse complement of the char byte array + */ + @Deprecated + static public char[] simpleReverseComplement(char[] bases) { + char[] rcbases = new char[bases.length]; + + for (int i = 0; i < bases.length; i++) { + rcbases[i] = simpleComplement(bases[bases.length - 1 - i]); + } + + return rcbases; + } + + /** + * Reverse complement a String of bases. Preserves ambiguous bases. 
+ * + * @param bases the String of bases + * @return the reverse complement of the String + */ + @Deprecated + static public String simpleReverseComplement(String bases) { + return new String(simpleReverseComplement(bases.getBytes())); + } + + /** + * Returns the uppercased version of the bases + * + * @param bases the bases + * @return the upper cased version + */ + static public void convertToUpperCase(final byte[] bases) { + StringUtil.toUpperCase(bases); + } + + /** + * Returns the index of the most common base in the basecounts array. To be used with + * pileup.getBaseCounts. + * + * @param baseCounts counts of a,c,g,t in order. + * @return the index of the most common base + */ + static public int mostFrequentBaseIndex(int[] baseCounts) { + int mostFrequentBaseIndex = 0; + for (int baseIndex = 1; baseIndex < 4; baseIndex++) { + if (baseCounts[baseIndex] > baseCounts[mostFrequentBaseIndex]) { + mostFrequentBaseIndex = baseIndex; + } + } + return mostFrequentBaseIndex; + } + + static public int mostFrequentBaseIndexNotRef(int[] baseCounts, int refBaseIndex) { + int tmp = baseCounts[refBaseIndex]; + baseCounts[refBaseIndex] = -1; + int result = mostFrequentBaseIndex(baseCounts); + baseCounts[refBaseIndex] = tmp; + return result; + } + + static public int mostFrequentBaseIndexNotRef(int[] baseCounts, byte refSimpleBase) { + return mostFrequentBaseIndexNotRef(baseCounts, simpleBaseToBaseIndex(refSimpleBase)); + } + + /** + * Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts. + * + * @param baseCounts counts of a,c,g,t in order. + * @return the most common base + */ + static public byte mostFrequentSimpleBase(int[] baseCounts) { + return baseIndexToSimpleBase(mostFrequentBaseIndex(baseCounts)); + } + + /** + * For the most frequent base in the sequence, return the percentage of the read it constitutes. 
+ * + * @param sequence the read sequence + * @return the percentage of the read that's made up of the most frequent base + */ + static public double mostFrequentBaseFraction(byte[] sequence) { + int[] baseCounts = new int[4]; + + for (byte base : sequence) { + int baseIndex = simpleBaseToBaseIndex(base); + + if (baseIndex >= 0) { + baseCounts[baseIndex]++; + } + } + + int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts); + + return ((double) baseCounts[mostFrequentBaseIndex]) / ((double) sequence.length); + } + + // -------------------------------------------------------------------------------- + // + // random bases + // + // -------------------------------------------------------------------------------- + + /** + * Return a random base index (A=0, C=1, G=2, T=3). + * + * @return a random base index (A=0, C=1, G=2, T=3) + */ + static public int getRandomBaseIndex() { + return getRandomBaseIndex(-1); + } + + /** + * Return random bases. + * + * @param length base count and length of returned array. + * + * @throws IllegalArgumentException if {@code length} is less than 0. + * + * @return never {@code null} + */ + @SuppressWarnings("unused") + public static byte[] getRandomBases(final int length) { + if (length < 0) + throw new IllegalArgumentException("length must zero or greater"); + final byte[] result = new byte[length]; + fillWithRandomBases(result); + return result; + } + + /** + * Fills an array with random bases. + * + * @param dest the array to fill. + * + * @throws IllegalArgumentException if {@code result} is {@code null}. + */ + public static void fillWithRandomBases(final byte[] dest) { + fillWithRandomBases(dest,0,dest.length); + } + + /** + * Fill an array section with random bases. + * + * @param dest array to fill. + * @param fromIndex first index to be filled (inclusive). + * @param toIndex index after last to be filled (exclusive). 
+ * + * @throws IllegalArgumentException if {@code dest} is {@code null}, + * {@code fromIndex} or {@code toIndex} is negative, + * {@code fromIndex} or {@code toIndex} are greater than {@code dest} length, + * or {@code fromIndex} greater than {@code toIndex}. + */ + public static void fillWithRandomBases(final byte[] dest, final int fromIndex, final int toIndex) { + final Random rnd = Utils.getRandomGenerator(); + if (dest == null) + throw new IllegalArgumentException("the dest array cannot be null"); + if (fromIndex > toIndex) + throw new IllegalArgumentException("fromIndex cannot be larger than toIndex"); + if (fromIndex < 0) + throw new IllegalArgumentException("both indexes must be positive"); + if (toIndex > dest.length) + throw new IllegalArgumentException("both indexes must be less or equal to the destination array length"); + + for (int i = fromIndex; i < toIndex; i++) + dest[i] = baseIndexToSimpleBase(rnd.nextInt(4)); + } + + /** + * Return a random base index, excluding some base index. + * + * @param excludeBaseIndex the base index to exclude + * @return a random base index, excluding the one specified (A=0, C=1, G=2, T=3) + */ + static public int getRandomBaseIndex(int excludeBaseIndex) { + int randomBaseIndex = excludeBaseIndex; + + while (randomBaseIndex == excludeBaseIndex) { + randomBaseIndex = Utils.getRandomGenerator().nextInt(4); + } + + return randomBaseIndex; + } + + public static byte getComplement(byte base) { + switch(base) { + case 'a': + case 'A': + return 'T'; + case 'c': + case 'C': + return 'G'; + case 'g': + case 'G': + return 'C'; + case 't': + case 'T': + return 'A'; + case 'n': + case 'N': + return 'N'; + default: + throw new ReviewedGATKException("base must be A, C, G or T. " + (char) base + " is not a valid base."); + } + } + + + /** + * Lexicographical sorting of base arrays {@link Comparator}. 
+ */ + public static final Comparator BASES_COMPARATOR = new Comparator (){ + + @Override + public int compare(final byte[] o1,final byte[] o2) { + final int minLength = Math.min(o1.length,o2.length); + for (int i = 0; i < minLength; i++) { + final int cmp = Byte.compare(o1[i],o2[i]); + if (cmp != 0) return cmp; + } + if (o1.length == o2.length) + return 0; + else if (o1.length == minLength) + return -1; + else + return 1; + } + }; +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/BitSetUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/BitSetUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/BitSetUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/BitSetUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/ContigComparator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ContigComparator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/ContigComparator.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ContigComparator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/DeprecatedToolChecks.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/DeprecatedToolChecks.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/DeprecatedToolChecks.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/DeprecatedToolChecks.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocParser.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/GenomeLocSortedSet.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocSortedSet.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/GenomeLocSortedSet.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/GenomeLocSortedSet.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/HeapSizeMonitor.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/HeapSizeMonitor.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/HeapSizeMonitor.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/HeapSizeMonitor.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/IndelUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/IndelUtils.java new file mode 100644 index 000000000..3c6b48cc3 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/IndelUtils.java @@ -0,0 +1,262 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils; + +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import htsjdk.variant.variantcontext.VariantContext; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: delangel + * Date: Feb 3, 2011 + * Time: 2:44:22 PM + * To change this template use File | Settings | File Templates. + */ +public class IndelUtils { + protected final static String[] COLUMN_KEYS; + + + + static { + COLUMN_KEYS= new String[51]; + COLUMN_KEYS[0] = "Novel_A"; + COLUMN_KEYS[1] = "Novel_C"; + COLUMN_KEYS[2] = "Novel_G"; + COLUMN_KEYS[3] = "Novel_T"; + COLUMN_KEYS[4] = "NOVEL_1"; + COLUMN_KEYS[5] = "NOVEL_2"; + COLUMN_KEYS[6] = "NOVEL_3"; + COLUMN_KEYS[7] = "NOVEL_4"; + COLUMN_KEYS[8] = "NOVEL_5"; + COLUMN_KEYS[9] = "NOVEL_6"; + COLUMN_KEYS[10] = "NOVEL_7"; + COLUMN_KEYS[11] = "NOVEL_8"; + COLUMN_KEYS[12] = "NOVEL_9"; + COLUMN_KEYS[13] = "NOVEL_10orMore"; + COLUMN_KEYS[14] = "RepeatExpansion_A"; + COLUMN_KEYS[15] = "RepeatExpansion_C"; + COLUMN_KEYS[16] = "RepeatExpansion_G"; + COLUMN_KEYS[17] = "RepeatExpansion_T"; + COLUMN_KEYS[18] = "RepeatExpansion_AC"; + COLUMN_KEYS[19] = "RepeatExpansion_AG"; + COLUMN_KEYS[20] = "RepeatExpansion_AT"; + COLUMN_KEYS[21] = "RepeatExpansion_CA"; + COLUMN_KEYS[22] = "RepeatExpansion_CG"; + COLUMN_KEYS[23] = "RepeatExpansion_CT"; + COLUMN_KEYS[24] = "RepeatExpansion_GA"; + COLUMN_KEYS[25] = 
"RepeatExpansion_GC"; + COLUMN_KEYS[26] = "RepeatExpansion_GT"; + COLUMN_KEYS[27] = "RepeatExpansion_TA"; + COLUMN_KEYS[28] = "RepeatExpansion_TC"; + COLUMN_KEYS[29] = "RepeatExpansion_TG"; + COLUMN_KEYS[30] = "EventLength_1"; + COLUMN_KEYS[31] = "EventLength_2"; + COLUMN_KEYS[32] = "EventLength_3"; + COLUMN_KEYS[33] = "EventLength_4"; + COLUMN_KEYS[34] = "EventLength_5"; + COLUMN_KEYS[35] = "EventLength_6"; + COLUMN_KEYS[36] = "EventLength_7"; + COLUMN_KEYS[37] = "EventLength_8"; + COLUMN_KEYS[38] = "EventLength_9"; + COLUMN_KEYS[39] = "EventLength_10orMore"; + COLUMN_KEYS[40] = "NumRepetitions_1"; + COLUMN_KEYS[41] = "NumRepetitions_2"; + COLUMN_KEYS[42] = "NumRepetitions_3"; + COLUMN_KEYS[43] = "NumRepetitions_4"; + COLUMN_KEYS[44] = "NumRepetitions_5"; + COLUMN_KEYS[45] = "NumRepetitions_6"; + COLUMN_KEYS[46] = "NumRepetitions_7"; + COLUMN_KEYS[47] = "NumRepetitions_8"; + COLUMN_KEYS[48] = "NumRepetitions_9"; + COLUMN_KEYS[49] = "NumRepetitions_10orMore"; + COLUMN_KEYS[50] = "Other"; + + } + + private static final int START_IND_NOVEL = 4; + private static final int STOP_IND_NOVEL = 13; + private static final int START_IND_FOR_REPEAT_EXPANSION_1 = 14; + private static final int IND_FOR_REPEAT_EXPANSION_A = 14; + private static final int IND_FOR_REPEAT_EXPANSION_C = 15; + private static final int IND_FOR_REPEAT_EXPANSION_G = 16; + private static final int IND_FOR_REPEAT_EXPANSION_T = 17; + private static final int STOP_IND_FOR_REPEAT_EXPANSION_2 = 29; + private static final int START_IND_FOR_REPEAT_EXPANSION_COUNTS = 30; + private static final int STOP_IND_FOR_REPEAT_EXPANSION_COUNTS = 39; + private static final int START_IND_FOR_NUM_REPETITION_COUNTS = 40; + private static final int STOP_IND_FOR_NUM_REPETITION_COUNTS = 49; + private static final int IND_FOR_OTHER_EVENT = 50; + private static final int START_IND_NOVEL_PER_BASE = 0; + private static final int STOP_IND_NOVEL_PER_BASE = 3; + + private static String findMinimalEvent(String eventString) { + + // for 
each length up to given string length, see if event string is a repetition of units of size N + String minEvent = eventString; + for (int k=1; k < eventString.length(); k++) { + if (eventString.length() % k > 0) + continue; + String str = eventString.substring(0,k); + // now see if event string is a repetition of str + int numReps = eventString.length() / k; + String r = ""; + for (int j=0; j < numReps; j++) + r = r.concat(str); + + if (r.matches(eventString)) { + minEvent = str; + break; + } + + } + return minEvent; + } + + public static ArrayList findEventClassificationIndex(VariantContext vc, ReferenceContext ref) { + int eventLength; + + String indelAlleleString; + boolean done = false; + + ArrayList inds = new ArrayList(); + if ( vc.isSimpleInsertion() ) { + indelAlleleString = vc.getAlternateAllele(0).getDisplayString().substring(1); + } else if ( vc.isSimpleDeletion() ) { + indelAlleleString = vc.getReference().getDisplayString().substring(1); + } + else { + inds.add(IND_FOR_OTHER_EVENT); + return inds; + } + + byte[] refBases = ref.getBases(); + + indelAlleleString = findMinimalEvent(indelAlleleString); + eventLength = indelAlleleString.length(); + + // See first if indel is a repetition of bases before current + int indStart = refBases.length/2-eventLength+1; + + int numRepetitions = 0; + while (!done) { + if (indStart < 0) + done = true; + else { + String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); + if (refPiece.matches(indelAlleleString)) + { + numRepetitions++; + indStart = indStart - eventLength; + } + else + done = true; + + } + } + + // now do it forward + done = false; + indStart = refBases.length/2+1; + while (!done) { + if (indStart + eventLength >= refBases.length) + break; + else { + String refPiece = new String(Arrays.copyOfRange(refBases,indStart,indStart+eventLength)); + if (refPiece.matches(indelAlleleString)) + { + numRepetitions++; + indStart = indStart + eventLength; + } + else + done = true; + + } 
+ } + + if (numRepetitions == 0) { + //unrepeated sequence from surroundings + int ind = START_IND_NOVEL + (eventLength-1); + if (ind > STOP_IND_NOVEL) + ind = STOP_IND_NOVEL; + inds.add(ind); + + if (eventLength == 1) { + // log single base indels additionally by base + String keyStr = "Novel_" + indelAlleleString; + int k; + for (k=START_IND_NOVEL_PER_BASE; k <= STOP_IND_NOVEL_PER_BASE; k++) { + if (keyStr.matches(COLUMN_KEYS[k])) + break; + } + inds.add(k); + } + } + else { + // log number of repetition counts + int ind = START_IND_FOR_NUM_REPETITION_COUNTS + (numRepetitions-1); + if (ind > STOP_IND_FOR_NUM_REPETITION_COUNTS) + ind = STOP_IND_FOR_NUM_REPETITION_COUNTS; + inds.add(ind); + + ind = START_IND_FOR_REPEAT_EXPANSION_COUNTS + (eventLength - 1); + if (ind > STOP_IND_FOR_REPEAT_EXPANSION_COUNTS) + ind = STOP_IND_FOR_REPEAT_EXPANSION_COUNTS; + inds.add(ind); + + // log event length + if (eventLength<=2) { + // for single or dinucleotide indels, we further log the base in which they occurred + String keyStr = "RepeatExpansion_" + indelAlleleString; + int k; + for (k=START_IND_FOR_REPEAT_EXPANSION_1; k <= STOP_IND_FOR_REPEAT_EXPANSION_2; k++) { + if (keyStr.matches(COLUMN_KEYS[k])) + break; + } + // log now event + inds.add(k); + } + + + } + + return inds; + } + + public static String getIndelClassificationName(int k) { + if (k >=0 && k < COLUMN_KEYS.length) + return COLUMN_KEYS[k]; + else + throw new ReviewedGATKException("Invalid index when trying to get indel classification name"); + } + + public static boolean isInsideExtendedIndel(VariantContext vc, ReferenceContext ref) { + return (vc.getStart() != ref.getLocus().getStart()); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/LRUCache.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/LRUCache.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/LRUCache.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/LRUCache.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MRUCachingSAMSequenceDictionary.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MRUCachingSAMSequenceDictionary.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MRUCachingSAMSequenceDictionary.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MRUCachingSAMSequenceDictionary.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java new file mode 100644 index 000000000..61261f217 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MannWhitneyU.java @@ -0,0 +1,507 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils; + +import cern.jet.math.Arithmetic; +import cern.jet.random.Normal; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.commons.math.MathException; +import org.apache.commons.math.distribution.NormalDistribution; +import org.apache.commons.math.distribution.NormalDistributionImpl; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.GATKException; + +import java.io.Serializable; +import java.util.Comparator; +import java.util.TreeSet; + +/** + * Created by IntelliJ IDEA. + * User: chartl + */ +public class MannWhitneyU { + + private static Normal STANDARD_NORMAL = new Normal(0.0,1.0,null); + private static NormalDistribution APACHE_NORMAL = new NormalDistributionImpl(0.0,1.0,1e-2); + private static double LNSQRT2PI = Math.log(Math.sqrt(2.0*Math.PI)); + + private TreeSet> observations; + private int sizeSet1; + private int sizeSet2; + private ExactMode exactMode; + + public MannWhitneyU(ExactMode mode, boolean dither) { + if ( dither ) + observations = new TreeSet>(new DitheringComparator()); + else + observations = new TreeSet>(new NumberedPairComparator()); + sizeSet1 = 0; + sizeSet2 = 0; + exactMode = mode; + } + + public MannWhitneyU() { + this(ExactMode.POINT,true); + } + + public MannWhitneyU(boolean dither) { + this(ExactMode.POINT,dither); + } + + public MannWhitneyU(ExactMode mode) { + this(mode,true); + } + + /** + * Add an observation into the observation tree + * @param n: the observation (a number) + * @param set: whether the observation comes from set 1 or set 2 + */ + public void add(Number n, USet set) { + observations.add(new 
Pair(n,set)); + if ( set == USet.SET1 ) { + ++sizeSet1; + } else { + ++sizeSet2; + } + } + + public Pair getR1R2() { + long u1 = calculateOneSidedU(observations,MannWhitneyU.USet.SET1); + long n1 = sizeSet1*(sizeSet1+1)/2; + long r1 = u1 + n1; + long n2 = sizeSet2*(sizeSet2+1)/2; + long u2 = n1*n2-u1; + long r2 = u2 + n2; + + return new Pair(r1,r2); + } + + /** + * Runs the one-sided test under the hypothesis that the data in set "lessThanOther" stochastically + * dominates the other set + * @param lessThanOther - either Set1 or Set2 + * @return - u-based z-approximation, and p-value associated with the test (p-value is exact for small n,m) + */ + @Requires({"lessThanOther != null"}) + @Ensures({"validateObservations(observations) || Double.isNaN(result.getFirst())","result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) + public Pair runOneSidedTest(USet lessThanOther) { + long u = calculateOneSidedU(observations, lessThanOther); + int n = lessThanOther == USet.SET1 ? sizeSet1 : sizeSet2; + int m = lessThanOther == USet.SET1 ? sizeSet2 : sizeSet1; + if ( n == 0 || m == 0 ) { + // test is uninformative as one or both sets have no observations + return new Pair(Double.NaN,Double.NaN); + } + + // the null hypothesis is that {N} is stochastically less than {M}, so U has counted + // occurrences of {M}s before {N}s. We would expect that this should be less than (n*m+1)/2 under + // the null hypothesis, so we want to integrate from K=0 to K=U for cumulative cases. Always. + return calculateP(n, m, u, false, exactMode); + } + + /** + * Runs the standard two-sided test, + * returns the u-based z-approximate and p values. + * @return a pair holding the u and p-value. + */ + @Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! 
Double.isInfinite(result.getSecond())"}) + //@Requires({"validateObservations(observations)"}) + public Pair runTwoSidedTest() { + Pair uPair = calculateTwoSidedU(observations); + long u = uPair.first; + int n = uPair.second == USet.SET1 ? sizeSet1 : sizeSet2; + int m = uPair.second == USet.SET1 ? sizeSet2 : sizeSet1; + if ( n == 0 || m == 0 ) { + // test is uninformative as one or both sets have no observations + return new Pair(Double.NaN,Double.NaN); + } + return calculateP(n, m, u, true, exactMode); + } + + /** + * Given a u statistic, calculate the p-value associated with it, dispatching to approximations where appropriate + * @param n - The number of entries in the stochastically smaller (dominant) set + * @param m - The number of entries in the stochastically larger (dominated) set + * @param u - the Mann-Whitney U value + * @param twoSided - is the test twosided + * @return the (possibly approximate) p-value associated with the MWU test, and the (possibly approximate) z-value associated with it + * todo -- there must be an approximation for small m and large n + */ + @Requires({"m > 0","n > 0"}) + @Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! 
Double.isInfinite(result.getSecond())"}) + protected static Pair calculateP(int n, int m, long u, boolean twoSided, ExactMode exactMode) { + Pair zandP; + if ( n > 8 && m > 8 ) { + // large m and n - normal approx + zandP = calculatePNormalApproximation(n,m,u, twoSided); + } else if ( n > 5 && m > 7 ) { + // large m, small n - sum uniform approx + // todo -- find the appropriate regimes where this approximation is actually better enough to merit slowness + // pval = calculatePUniformApproximation(n,m,u); + zandP = calculatePNormalApproximation(n, m, u, twoSided); + } else if ( n > 8 || m > 8 ) { + zandP = calculatePFromTable(n, m, u, twoSided); + } else { + // small m and n - full approx + zandP = calculatePRecursively(n,m,u,twoSided,exactMode); + } + + return zandP; + } + + public static Pair calculatePFromTable(int n, int m, long u, boolean twoSided) { + // todo -- actually use a table for: + // todo - n large, m small + return calculatePNormalApproximation(n,m,u, twoSided); + } + + /** + * Uses a normal approximation to the U statistic in order to return a cdf p-value. See Mann, Whitney [1947] + * @param n - The number of entries in the stochastically smaller (dominant) set + * @param m - The number of entries in the stochastically larger (dominated) set + * @param u - the Mann-Whitney U value + * @param twoSided - whether the test should be two sided + * @return p-value associated with the normal approximation + */ + @Requires({"m > 0","n > 0"}) + @Ensures({"result != null", "! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) + public static Pair calculatePNormalApproximation(int n,int m,long u, boolean twoSided) { + double z = getZApprox(n,m,u); + if ( twoSided ) { + return new Pair(z,2.0*(z < 0 ? 
STANDARD_NORMAL.cdf(z) : 1.0-STANDARD_NORMAL.cdf(z))); + } else { + return new Pair(z,STANDARD_NORMAL.cdf(z)); + } + } + + /** + * Calculates the Z-score approximation of the u-statistic + * @param n - The number of entries in the stochastically smaller (dominant) set + * @param m - The number of entries in the stochastically larger (dominated) set + * @param u - the Mann-Whitney U value + * @return the asymptotic z-approximation corresponding to the MWU p-value for n < m + */ + @Requires({"m > 0","n > 0"}) + @Ensures({"! Double.isNaN(result)", "! Double.isInfinite(result)"}) + private static double getZApprox(int n, int m, long u) { + double mean = ( ((long)m)*n+1.0)/2; + double var = (((long) n)*m*(n+m+1.0))/12; + double z = ( u - mean )/Math.sqrt(var); + return z; + } + + /** + * Uses a sum-of-uniform-0-1 random variable approximation to the U statistic in order to return an approximate + * p-value. See Buckle, Kraft, van Eeden [1969] (approx) and Billingsly [1995] or Stephens, MA [1966, biometrika] (sum of uniform CDF) + * @param n - The number of entries in the stochastically smaller (dominant) set + * @param m - The number of entries in the stochastically larger (dominated) set + * @param u - mann-whitney u value + * @return p-value according to sum of uniform approx + * todo -- this is currently not called due to not having a good characterization of where it is significantly more accurate than the + * todo -- normal approxmation (e.g. 
enough to merit the runtime hit) + */ + public static double calculatePUniformApproximation(int n, int m, long u) { + long R = u + (n*(n+1))/2; + double a = Math.sqrt(m*(n+m+1)); + double b = (n/2.0)*(1-Math.sqrt((n+m+1)/m)); + double z = b + ((double)R)/a; + if ( z < 0 ) { return 1.0; } + else if ( z > n ) { return 0.0; } + else { + if ( z > ((double) n) /2 ) { + return 1.0-1/(Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); + } else { + return 1/(Arithmetic.factorial(n))*uniformSumHelper(z, (int) Math.floor(z), n, 0); + } + } + } + + /** + * Helper function for the sum of n uniform random variables + * @param z - value at which to compute the (un-normalized) cdf + * @param m - a cutoff integer (defined by m <= z < m + 1) + * @param n - the number of uniform random variables + * @param k - holder variable for the recursion (alternatively, the index of the term in the sequence) + * @return the (un-normalized) cdf for the sum of n random variables + */ + private static double uniformSumHelper(double z, int m, int n, int k) { + if ( k > m ) { return 0; } + int coef = (k % 2 == 0) ? 1 : -1; + return coef*Arithmetic.binomial(n,k)*Math.pow(z-k,n) + uniformSumHelper(z,m,n,k+1); + } + + /** + * Calculates the U-statistic associated with a two-sided test (e.g. the RV from which one set is drawn + * stochastically dominates the RV from which the other set is drawn); two-sidedness is accounted for + * later on simply by multiplying the p-value by 2. 
+ * + * Recall: If X stochastically dominates Y, the test is for occurrences of Y before X, so the lower value of u is chosen + * @param observed - the observed data + * @return the minimum of the U counts (set1 dominates 2, set 2 dominates 1) + */ + @Requires({"observed != null", "observed.size() > 0"}) + @Ensures({"result != null","result.first > 0"}) + public static Pair calculateTwoSidedU(TreeSet> observed) { + int set1SeenSoFar = 0; + int set2SeenSoFar = 0; + long uSet1DomSet2 = 0; + long uSet2DomSet1 = 0; + USet previous = null; + for ( Pair dataPoint : observed ) { + + if ( dataPoint.second == USet.SET1 ) { + ++set1SeenSoFar; + } else { + ++set2SeenSoFar; + } + + if ( previous != null ) { + if ( dataPoint.second == USet.SET1 ) { + uSet2DomSet1 += set2SeenSoFar; + } else { + uSet1DomSet2 += set1SeenSoFar; + } + } + + previous = dataPoint.second; + } + + return uSet1DomSet2 < uSet2DomSet1 ? new Pair(uSet1DomSet2,USet.SET1) : new Pair(uSet2DomSet1,USet.SET2); + } + + /** + * Calculates the U-statistic associated with the one-sided hypothesis that "dominator" stochastically dominates + * the other U-set. Note that if S1 dominates S2, we want to count the occurrences of points in S2 coming before points in S1. 
+ * @param observed - the observed data points, tagged by each set + * @param dominator - the set that is hypothesized to be stochastically dominating + * @return the u-statistic associated with the hypothesis that dominator stochastically dominates the other set + */ + @Requires({"observed != null","dominator != null","observed.size() > 0"}) + @Ensures({"result >= 0"}) + public static long calculateOneSidedU(TreeSet> observed,USet dominator) { + long otherBeforeDominator = 0l; + int otherSeenSoFar = 0; + for ( Pair dataPoint : observed ) { + if ( dataPoint.second != dominator ) { + ++otherSeenSoFar; + } else { + otherBeforeDominator += otherSeenSoFar; + } + } + + return otherBeforeDominator; + } + + /** + * The Mann-Whitney U statistic follows a recursive equation (that enumerates the proportion of possible + * binary strings of "n" zeros, and "m" ones, where a one precedes a zero "u" times). This accessor + * calls into that recursive calculation. + * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two) + * @param m: number of set-two entries + * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) + * @param twoSided: whether the test is two sided or not. The recursive formula is symmetric, multiply by two for two-sidedness. + * @param mode: whether the mode is a point probability, or a cumulative distribution + * @return the probability under the hypothesis that all sequences are equally likely of finding a set-two entry preceding a set-one entry "u" times. + */ + @Requires({"m > 0","n > 0","u >= 0"}) + @Ensures({"result != null","! Double.isInfinite(result.getFirst())", "! Double.isInfinite(result.getSecond())"}) + public static Pair calculatePRecursively(int n, int m, long u, boolean twoSided, ExactMode mode) { + if ( m > 8 && n > 5 ) { throw new GATKException(String.format("Please use the appropriate (normal or sum of uniform) approximation. 
Values n: %d, m: %d",n,m)); } + double p = mode == ExactMode.POINT ? cpr(n,m,u) : cumulativeCPR(n,m,u); + //p *= twoSided ? 2.0 : 1.0; + double z; + try { + + if ( mode == ExactMode.CUMULATIVE ) { + z = APACHE_NORMAL.inverseCumulativeProbability(p); + } else { + double sd = Math.sqrt((1.0+1.0/(1+n+m))*(n*m)*(1.0+n+m)/12); // biased variance empirically better fit to distribution then asymptotic variance + //System.out.printf("SD is %f and Max is %f and prob is %f%n",sd,1.0/Math.sqrt(sd*sd*2.0*Math.PI),p); + if ( p > 1.0/Math.sqrt(sd*sd*2.0*Math.PI) ) { // possible for p-value to be outside the range of the normal. Happens at the mean, so z is 0. + z = 0.0; + } else { + if ( u >= n*m/2 ) { + z = Math.sqrt(-2.0*(Math.log(sd)+Math.log(p)+LNSQRT2PI)); + } else { + z = -Math.sqrt(-2.0*(Math.log(sd)+Math.log(p)+LNSQRT2PI)); + } + } + } + + } catch (MathException me) { + throw new GATKException("A math exception occurred in inverting the probability",me); + } + + return new Pair(z,(twoSided ? 2.0*p : p)); + } + + /** + * Hook into CPR with sufficient warning (for testing purposes) + * calls into that recursive calculation. + * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two) + * @param m: number of set-two entries + * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) + * @return same as cpr + */ + protected static double calculatePRecursivelyDoNotCheckValuesEvenThoughItIsSlow(int n, int m, long u) { + return cpr(n,m,u); + } + + /** + * For testing + * + * @param n: number of set-one entries (hypothesis: set one is stochastically less than set two) + * @param m: number of set-two entries + * @param u: number of set-two entries that precede set-one entries (e.g. 0,1,0,1,0 -> 3 ) + */ + protected static long countSequences(int n, int m, long u) { + if ( u < 0 ) { return 0; } + if ( m == 0 || n == 0 ) { return u == 0 ? 
1 : 0; } + + return countSequences(n-1,m,u-m) + countSequences(n,m-1,u); + } + + /** + * : just a shorter name for calculatePRecursively. See Mann, Whitney, [1947] + * @param n: number of set-1 entries + * @param m: number of set-2 entries + * @param u: number of times a set-2 entry as preceded a set-1 entry + * @return recursive p-value + */ + private static double cpr(int n, int m, long u) { + if ( u < 0 ) { + return 0.0; + } + if ( m == 0 || n == 0 ) { + // there are entries in set 1 or set 2, so no set-2 entry can precede a set-1 entry; thus u must be zero. + // note that this exists only for edification, as when we reach this point, the coefficient on this term is zero anyway + return ( u == 0 ) ? 1.0 : 0.0; + } + + + return (((double)n)/(n+m))*cpr(n-1,m,u-m) + (((double)m)/(n+m))*cpr(n,m-1,u); + } + + private static double cumulativeCPR(int n, int m, long u ) { + // from above: + // the null hypothesis is that {N} is stochastically less than {M}, so U has counted + // occurrences of {M}s before {N}s. We would expect that this should be less than (n*m+1)/2 under + // the null hypothesis, so we want to integrate from K=0 to K=U for cumulative cases. Always. + double p = 0.0; + // optimization using symmetry, use the least amount of sums possible + long uSym = ( u <= n*m/2 ) ? u : ((long)n)*m-u; + for ( long uu = 0; uu < uSym; uu++ ) { + p += cpr(n,m,uu); + } + // correct by 1.0-p if the optimization above was used (e.g. 1-right tail = left tail) + return (u <= n*m/2) ? 
p : 1.0-p; + } + + /** + * hook into the data tree, for testing purposes only + * @return observations + */ + protected TreeSet> getObservations() { + return observations; + } + + /** + * hook into the set sizes, for testing purposes only + * @return size set 1, size set 2 + */ + protected Pair getSetSizes() { + return new Pair(sizeSet1,sizeSet2); + } + + /** + * Validates that observations are in the correct format for a MWU test -- this is only called by the contracts API during testing + * @param tree - the collection of labeled observations + * @return true iff the tree set is valid (no INFs or NaNs, at least one data point in each set) + */ + protected static boolean validateObservations(TreeSet> tree) { + boolean seen1 = false; + boolean seen2 = false; + boolean seenInvalid = false; + for ( Pair p : tree) { + if ( ! seen1 && p.getSecond() == USet.SET1 ) { + seen1 = true; + } + + if ( ! seen2 && p.getSecond() == USet.SET2 ) { + seen2 = true; + } + + if ( Double.isNaN(p.getFirst().doubleValue()) || Double.isInfinite(p.getFirst().doubleValue())) { + seenInvalid = true; + } + + } + + return ! seenInvalid && seen1 && seen2; + } + + /** + * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values + * and to ensure that rank ties are broken at random. + */ + private static class DitheringComparator implements Comparator>, Serializable { + + public DitheringComparator() {} + + @Override + public boolean equals(Object other) { return false; } + + @Override + public int compare(Pair left, Pair right) { + double comp = Double.compare(left.first.doubleValue(),right.first.doubleValue()); + if ( comp > 0 ) { return 1; } + if ( comp < 0 ) { return -1; } + return Utils.getRandomGenerator().nextBoolean() ? -1 : 1; + } + } + + /** + * A comparator that reaches into the pair and compares numbers without tie-braking. 
+ */ + private static class NumberedPairComparator implements Comparator>, Serializable { + + public NumberedPairComparator() {} + + @Override + public boolean equals(Object other) { return false; } + + @Override + public int compare(Pair left, Pair right ) { + return Double.compare(left.first.doubleValue(),right.first.doubleValue()); + } + } + + public enum USet { SET1, SET2 } + public enum ExactMode { POINT, CUMULATIVE } + +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java new file mode 100644 index 000000000..614cb927e --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MathUtils.java @@ -0,0 +1,1689 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.commons.math.distribution.ExponentialDistribution; +import org.apache.commons.math.distribution.ExponentialDistributionImpl; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.math.BigDecimal; +import java.util.*; + +/** + * MathUtils is a static class (no instantiation allowed!) with some useful math methods. + * + * @author Kiran Garimella + */ +public class MathUtils { + + /** + * Private constructor. No instantiating this class! + */ + private MathUtils() { + } + + /** + * The smallest log10 value we'll emit from normalizeFromLog10 and other functions + * where the real-space value is 0.0. + */ + public static final double LOG10_P_OF_ZERO = -1000000.0; + public static final double FAIR_BINOMIAL_PROB_LOG10_0_5 = Math.log10(0.5); + public static final double LOG_ONE_HALF = -Math.log10(2.0); + public static final double LOG_ONE_THIRD = -Math.log10(3.0); + private static final double NATURAL_LOG_OF_TEN = Math.log(10.0); + private static final double SQUARE_ROOT_OF_TWO_TIMES_PI = Math.sqrt(2.0 * Math.PI); + + /** + * A helper class to maintain a cache of log10 values + */ + public static class Log10Cache { + /** + * Get the value of log10(n), expanding the cache as necessary + * @param n operand + * @return log10(n) + */ + public static double get(final int n) { + if (n < 0) + throw new ReviewedGATKException(String.format("Can't take the log of a negative number: %d", n)); + if (n >= cache.length) + ensureCacheContains(Math.max(n+10, 2*cache.length)); + /* + Array lookups are not atomic. It's possible that the reference to cache could be + changed between the time the reference is loaded and the data is fetched from the correct + offset. However, the value retrieved can't change, and it's guaranteed to be present in the + old reference by the conditional above. 
+ */ + return cache[n]; + } + + /** + * Ensures that the cache contains a value for n. After completion of ensureCacheContains(n), + * #get(n) is guaranteed to return without causing a cache expansion + * @param n desired value to be precomputed + */ + public static synchronized void ensureCacheContains(final int n) { + if (n < cache.length) + return; + final double[] newCache = new double[n + 1]; + System.arraycopy(cache, 0, newCache, 0, cache.length); + for (int i=cache.length; i < newCache.length; i++) + newCache[i] = Math.log10(i); + cache = newCache; + } + + //initialize with the special case: log10(0) = NEGATIVE_INFINITY + private static double[] cache = new double[] { Double.NEGATIVE_INFINITY }; + } + + /** + * Get a random int between min and max (inclusive) using the global GATK random number generator + * + * @param min lower bound of the range + * @param max upper bound of the range + * @return a random int >= min and <= max + */ + public static int randomIntegerInRange( final int min, final int max ) { + return Utils.getRandomGenerator().nextInt(max - min + 1) + min; + } + + /** + * Encapsulates the second term of Jacobian log identity for differences up to MAX_TOLERANCE + */ + private static class JacobianLogTable { + + public static final double MAX_TOLERANCE = 8.0; + + public static double get(final double difference) { + if (cache == null) + initialize(); + final int index = fastRound(difference * INV_STEP); + return cache[index]; + } + + private static synchronized void initialize() { + if (cache == null) { + final int tableSize = (int) (MAX_TOLERANCE / TABLE_STEP) + 1; + cache = new double[tableSize]; + for (int k = 0; k < cache.length; k++) + cache[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * TABLE_STEP)); + } + } + + private static final double TABLE_STEP = 0.0001; + private static final double INV_STEP = 1.0 / TABLE_STEP; + private static double[] cache = null; + } + + // A fast implementation of the Math.round() method. 
This method does not perform + // under/overflow checking, so this shouldn't be used in the general case (but is fine + // if one is already make those checks before calling in to the rounding). + public static int fastRound(final double d) { + return (d > 0.0) ? (int) (d + 0.5d) : (int) (d - 0.5d); + } + + public static double approximateLog10SumLog10(final double[] vals) { + return approximateLog10SumLog10(vals, vals.length); + } + + /** + * Calculate the approximate log10 sum of an array range. + * @param vals the input values. + * @param fromIndex the first inclusive index in the input array. + * @param toIndex index following the last element to sum in the input array (exclusive). + * @return the approximate sum. + * @throws IllegalArgumentException if {@code vals} is {@code null} or {@code fromIndex} is out of bounds + * or if {@code toIndex} is larger than + * the length of the input array or {@code fromIndex} is larger than {@code toIndex}. + */ + public static double approximateLog10SumLog10(final double[] vals, final int fromIndex, final int toIndex) { + if (fromIndex == toIndex) return Double.NEGATIVE_INFINITY; + final int maxElementIndex = MathUtils.maxElementIndex(vals,fromIndex,toIndex); + double approxSum = vals[maxElementIndex]; + + for (int i = fromIndex; i < toIndex; i++) { + final double val; + if (i == maxElementIndex || (val = vals[i]) == Double.NEGATIVE_INFINITY) + continue; + final double diff = approxSum - val; + if (diff < JacobianLogTable.MAX_TOLERANCE) + approxSum += JacobianLogTable.get(diff); + } + return approxSum; + } + + public static double approximateLog10SumLog10(final double[] vals, final int endIndex) { + + final int maxElementIndex = MathUtils.maxElementIndex(vals, endIndex); + double approxSum = vals[maxElementIndex]; + + for (int i = 0; i < endIndex; i++) { + if (i == maxElementIndex || vals[i] == Double.NEGATIVE_INFINITY) + continue; + + final double diff = approxSum - vals[i]; + if (diff < JacobianLogTable.MAX_TOLERANCE) { 
+ // See notes from the 2-inout implementation below + approxSum += JacobianLogTable.get(diff); + } + } + + return approxSum; + } + + public static double approximateLog10SumLog10(final double a, final double b, final double c) { + return approximateLog10SumLog10(a, approximateLog10SumLog10(b, c)); + } + + public static double approximateLog10SumLog10(double small, double big) { + // make sure small is really the smaller value + if (small > big) { + final double t = big; + big = small; + small = t; + } + + if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY) + return big; + + final double diff = big - small; + if (diff >= JacobianLogTable.MAX_TOLERANCE) + return big; + + // OK, so |y-x| < tol: we use the following identity then: + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup with integer quantization + // we have pre-stored correction for 0,0.1,0.2,... 10.0 + return big + JacobianLogTable.get(diff); + } + + public static double sum(final double[] values) { + double s = 0.0; + for (double v : values) + s += v; + return s; + } + + public static long sum(final int[] x) { + long total = 0; + for (int v : x) + total += v; + return total; + } + + public static int sum(final byte[] x) { + int total = 0; + for (byte v : x) + total += (int)v; + return total; + } + + public static double percentage(int x, int base) { + return (base > 0 ? 
((double) x / (double) base) * 100.0 : 0); + } + + public static double ratio(final int num, final int denom) { + if ( denom > 0 ) { + return ((double) num)/denom; + } else { + if ( num == 0 && denom == 0) { + return 0.0; + } else { + throw new ReviewedGATKException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); + } + } + } + + public static double ratio(final long num, final long denom) { + if ( denom > 0L ) { + return ((double) num)/denom; + } else { + if ( num == 0L && denom == 0L ) { + return 0.0; + } else { + throw new ReviewedGATKException(String.format("The denominator of a ratio cannot be zero or less than zero: %d/%d",num,denom)); + } + } + } + + /** + * Converts a real space array of numbers (typically probabilities) into a log10 array + * + * @param prRealSpace + * @return + */ + public static double[] toLog10(final double[] prRealSpace) { + double[] log10s = new double[prRealSpace.length]; + for (int i = 0; i < prRealSpace.length; i++) { + log10s[i] = Math.log10(prRealSpace[i]); + } + return log10s; + } + + public static double log10sumLog10(final double[] log10p, final int start) { + return log10sumLog10(log10p, start, log10p.length); + } + + public static double log10sumLog10(final double[] log10p, final int start, final int finish) { + + if (start >= finish) + return Double.NEGATIVE_INFINITY; + final int maxElementIndex = MathUtils.maxElementIndex(log10p, start, finish); + final double maxValue = log10p[maxElementIndex]; + if(maxValue == Double.NEGATIVE_INFINITY) + return maxValue; + double sum = 1.0; + for (int i = start; i < finish; i++) { + double curVal = log10p[i]; + double scaled_val = curVal - maxValue; + if (i == maxElementIndex || curVal == Double.NEGATIVE_INFINITY) { + continue; + } + else { + sum += Math.pow(10.0, scaled_val); + } + } + if ( Double.isNaN(sum) || sum == Double.POSITIVE_INFINITY ) { + throw new IllegalArgumentException("log10p: Values must be non-infinite and non-NAN"); + } + 
return maxValue + (sum != 1.0 ? Math.log10(sum) : 0.0); + } + + public static double sumLog10(final double[] log10values) { + return Math.pow(10.0, log10sumLog10(log10values)); + } + + public static double log10sumLog10(final double[] log10values) { + return log10sumLog10(log10values, 0); + } + + public static boolean wellFormedDouble(final double val) { + return !Double.isInfinite(val) && !Double.isNaN(val); + } + + public static double bound(final double value, final double minBoundary, final double maxBoundary) { + return Math.max(Math.min(value, maxBoundary), minBoundary); + } + + public static boolean isBounded(final double val, final double lower, final double upper) { + return val >= lower && val <= upper; + } + + public static boolean isPositive(final double val) { + return !isNegativeOrZero(val); + } + + public static boolean isPositiveOrZero(final double val) { + return isBounded(val, 0.0, Double.POSITIVE_INFINITY); + } + + public static boolean isNegativeOrZero(final double val) { + return isBounded(val, Double.NEGATIVE_INFINITY, 0.0); + } + + public static boolean isNegative(final double val) { + return !isPositiveOrZero(val); + } + + /** + * Compares double values for equality (within 1e-6), or inequality. + * + * @param a the first double value + * @param b the second double value + * @return -1 if a is greater than b, 0 if a is equal to be within 1e-6, 1 if b is greater than a. + */ + public static byte compareDoubles(final double a, final double b) { + return compareDoubles(a, b, 1e-6); + } + + /** + * Compares double values for equality (within epsilon), or inequality. + * + * @param a the first double value + * @param b the second double value + * @param epsilon the precision within which two double values will be considered equal + * @return -1 if a is greater than b, 0 if a is equal to be within epsilon, 1 if b is greater than a. 
+ */ + public static byte compareDoubles(final double a, final double b, final double epsilon) { + if (Math.abs(a - b) < epsilon) { + return 0; + } + if (a > b) { + return -1; + } + return 1; + } + + /** + * Calculate f(x) = Normal(x | mu = mean, sigma = sd) + * @param mean the desired mean of the Normal distribution + * @param sd the desired standard deviation of the Normal distribution + * @param x the value to evaluate + * @return a well-formed double + */ + public static double normalDistribution(final double mean, final double sd, final double x) { + if( sd < 0 ) + throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); + if ( ! wellFormedDouble(mean) || ! wellFormedDouble(sd) || ! wellFormedDouble(x) ) + throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); + double a = 1.0 / (sd * Math.sqrt(2.0 * Math.PI)); + double b = Math.exp(-1.0 * (Math.pow(x - mean, 2.0) / (2.0 * sd * sd))); + return a * b; + } + + /** + * Calculate f(x) = log10 ( Normal(x | mu = mean, sigma = sd) ) + * @param mean the desired mean of the Normal distribution + * @param sd the desired standard deviation of the Normal distribution + * @param x the value to evaluate + * @return a well-formed double + */ + + public static double normalDistributionLog10(final double mean, final double sd, final double x) { + if( sd < 0 ) + throw new IllegalArgumentException("sd: Standard deviation of normal must be >0"); + if ( ! wellFormedDouble(mean) || ! wellFormedDouble(sd) || ! 
wellFormedDouble(x) ) + throw new IllegalArgumentException("mean, sd, or, x : Normal parameters must be well formatted (non-INF, non-NAN)"); + final double a = -1.0 * Math.log10(sd * SQUARE_ROOT_OF_TWO_TIMES_PI); + final double b = -1.0 * (square(x - mean) / (2.0 * square(sd))) / NATURAL_LOG_OF_TEN; + return a + b; + } + + /** + * Calculate f(x) = x^2 + * @param x the value to square + * @return x * x + */ + public static double square(final double x) { + return x * x; + } + + /** + * Calculates the log10 of the binomial coefficient. Designed to prevent + * overflows even with very large numbers. + * + * @param n total number of trials + * @param k number of successes + * @return the log10 of the binomial coefficient + */ + public static double binomialCoefficient(final int n, final int k) { + return Math.pow(10, log10BinomialCoefficient(n, k)); + } + + /** + * @see #binomialCoefficient(int, int) with log10 applied to result + */ + public static double log10BinomialCoefficient(final int n, final int k) { + if ( n < 0 ) { + throw new IllegalArgumentException("n: Must have non-negative number of trials"); + } + if ( k > n || k < 0 ) { + throw new IllegalArgumentException("k: Must have non-negative number of successes, and no more successes than number of trials"); + } + + return log10Factorial(n) - log10Factorial(k) - log10Factorial(n - k); + } + + /** + * Computes a binomial probability. This is computed using the formula + *

+ * B(k; n; p) = [ n! / ( k! (n - k)! ) ] (p^k)( (1-p)^k ) + *

+ * where n is the number of trials, k is the number of successes, and p is the probability of success + * + * @param n number of Bernoulli trials + * @param k number of successes + * @param p probability of success + * @return the binomial probability of the specified configuration. Computes values down to about 1e-237. + */ + public static double binomialProbability(final int n, final int k, final double p) { + return Math.pow(10, log10BinomialProbability(n, k, Math.log10(p))); + } + + /** + * @see #binomialProbability(int, int, double) with log10 applied to result + */ + public static double log10BinomialProbability(final int n, final int k, final double log10p) { + if ( log10p > 1e-18 ) + throw new IllegalArgumentException("log10p: Log-probability must be 0 or less"); + double log10OneMinusP = Math.log10(1 - Math.pow(10, log10p)); + return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k); + } + + /** + * @see #binomialProbability(int, int, double) with p=0.5 + */ + public static double binomialProbability(final int n, final int k) { + return Math.pow(10, log10BinomialProbability(n, k)); + } + + /** + * @see #binomialProbability(int, int, double) with p=0.5 and log10 applied to result + */ + public static double log10BinomialProbability(final int n, final int k) { + return log10BinomialCoefficient(n, k) + (n * FAIR_BINOMIAL_PROB_LOG10_0_5); + } + + /** A memoization container for {@link #binomialCumulativeProbability(int, int, int)}. Synchronized to accomodate multithreading. */ + private static final Map BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE = + Collections.synchronizedMap(new LRUCache(10_000)); + + /** + * Primitive integer-triplet bijection into long. Returns null when the bijection function fails (in lieu of an exception), which will + * happen when: any value is negative or larger than a short. This method is optimized for speed; it is not intended to serve as a + * utility function. 
+ */ + static Long fastGenerateUniqueHashFromThreeIntegers(final int one, final int two, final int three) { + if (one < 0 || two < 0 || three < 0 || Short.MAX_VALUE < one || Short.MAX_VALUE < two || Short.MAX_VALUE < three) { + return null; + } else { + long result = 0; + result += (short) one; + result <<= 16; + result += (short) two; + result <<= 16; + result += (short) three; + return result; + } + } + + /** + * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. + * Assumes that the probability of a successful hit is fair (i.e. 0.5). + * + * This pure function is memoized because of its expensive BigDecimal calculations. + * + * @param n number of attempts for the number of hits + * @param k_start start (inclusive) of the cumulant sum (over hits) + * @param k_end end (inclusive) of the cumulant sum (over hits) + * @return - returns the cumulative probability + */ + public static double binomialCumulativeProbability(final int n, final int k_start, final int k_end) { + if ( k_end > n ) + throw new IllegalArgumentException(String.format("Value for k_end (%d) is greater than n (%d)", k_end, n)); + + // Fetch cached value, if applicable. 
+ final Long memoizationKey = fastGenerateUniqueHashFromThreeIntegers(n, k_start, k_end); + final Double memoizationCacheResult; + if (memoizationKey != null) { + memoizationCacheResult = BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.get(memoizationKey); + } else { + memoizationCacheResult = null; + } + + final double result; + if (memoizationCacheResult != null) { + result = memoizationCacheResult; + } else { + double cumProb = 0.0; + double prevProb; + BigDecimal probCache = BigDecimal.ZERO; + + for (int hits = k_start; hits <= k_end; hits++) { + prevProb = cumProb; + final double probability = binomialProbability(n, hits); + cumProb += probability; + if (probability > 0 && cumProb - prevProb < probability / 2) { // loss of precision + probCache = probCache.add(new BigDecimal(prevProb)); + cumProb = 0.0; + hits--; // repeat loop + // prevProb changes at start of loop + } + } + + result = probCache.add(new BigDecimal(cumProb)).doubleValue(); + if (memoizationKey != null) { + BINOMIAL_CUMULATIVE_PROBABILITY_MEMOIZATION_CACHE.put(memoizationKey, result); + } + } + return result; + } + + private static final double LOG1MEXP_THRESHOLD = Math.log(0.5); + + private static final double LN_10 = Math.log(10); + + /** + * Calculates {@code log(1-exp(a))} without loosing precision. + * + *

+ * This is based on the approach described in: + * + *

+ *

+ * Maechler M, Accurately Computing log(1-exp(-|a|)) Assessed by the Rmpfr package, 2012
+ * Online document. + * + *

+ * + * @param a the input exponent. + * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. + */ + public static double log1mexp(final double a) { + if (a > 0) return Double.NaN; + if (a == 0) return Double.NEGATIVE_INFINITY; + + return (a < LOG1MEXP_THRESHOLD) ? Math.log1p(-Math.exp(a)) : Math.log(-Math.expm1(a)); + } + + /** + * Calculates {@code log10(1-10^a)} without loosing precision. + * + *

+ * This is based on the approach described in: + * + *

+ *

+ * Maechler M, Accurately Computing log(1-exp(-|a|)) Assessed by the Rmpfr package, 2012
+ * Online document. + *

+ * + * @param a the input exponent. + * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. + */ + public static double log10OneMinusPow10(final double a) { + if (a > 0) return Double.NaN; + if (a == 0) return Double.NEGATIVE_INFINITY; + final double b = a * LN_10; + return log1mexp(b) / LN_10; + } + + /** + * Calculates the log10 of the multinomial coefficient. Designed to prevent + * overflows even with very large numbers. + * + * @param n total number of trials + * @param k array of any size with the number of successes for each grouping (k1, k2, k3, ..., km) + * @return {@link Double#NaN NaN} if {@code a > 0}, otherwise the corresponding value. + */ + public static double log10MultinomialCoefficient(final int n, final int[] k) { + if ( n < 0 ) + throw new IllegalArgumentException("n: Must have non-negative number of trials"); + double denominator = 0.0; + int sum = 0; + for (int x : k) { + if ( x < 0 ) + throw new IllegalArgumentException("x element of k: Must have non-negative observations of group"); + if ( x > n ) + throw new IllegalArgumentException("x element of k, n: Group observations must be bounded by k"); + denominator += log10Factorial(x); + sum += x; + } + if ( sum != n ) + throw new IllegalArgumentException("k and n: Sum of observations in multinomial must sum to total number of trials"); + return log10Factorial(n) - denominator; + } + + /** + * Computes the log10 of the multinomial distribution probability given a vector + * of log10 probabilities. Designed to prevent overflows even with very large numbers. 
+ * + * @param n number of trials + * @param k array of number of successes for each possibility + * @param log10p array of log10 probabilities + * @return + */ + public static double log10MultinomialProbability(final int n, final int[] k, final double[] log10p) { + if (log10p.length != k.length) + throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + log10p.length + ", " + k.length); + double log10Prod = 0.0; + for (int i = 0; i < log10p.length; i++) { + if ( log10p[i] > 1e-18 ) + throw new IllegalArgumentException("log10p: Log-probability must be <= 0"); + log10Prod += log10p[i] * k[i]; + } + return log10MultinomialCoefficient(n, k) + log10Prod; + } + + /** + * Computes a multinomial coefficient efficiently avoiding overflow even for large numbers. + * This is computed using the formula: + *

+ * M(x1,x2,...,xk; n) = [ n! / (x1! x2! ... xk!) ] + *

+ * where xi represents the number of times outcome i was observed, n is the number of total observations. + * In this implementation, the value of n is inferred as the sum over i of xi. + * + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @return the multinomial of the specified configuration. + */ + public static double multinomialCoefficient(final int[] k) { + int n = 0; + for (int xi : k) { + n += xi; + } + + return Math.pow(10, log10MultinomialCoefficient(n, k)); + } + + /** + * Computes a multinomial probability efficiently avoiding overflow even for large numbers. + * This is computed using the formula: + *

+ * M(x1,x2,...,xk; n; p1,p2,...,pk) = [ n! / (x1! x2! ... xk!) ] (p1^x1)(p2^x2)(...)(pk^xk) + *

+ * where xi represents the number of times outcome i was observed, n is the number of total observations, and + * pi represents the probability of the i-th outcome to occur. In this implementation, the value of n is + * inferred as the sum over i of xi. + * + * @param k an int[] of counts, where each element represents the number of times a certain outcome was observed + * @param p a double[] of probabilities, where each element represents the probability a given outcome can occur + * @return the multinomial probability of the specified configuration. + */ + public static double multinomialProbability(final int[] k, final double[] p) { + if (p.length != k.length) + throw new IllegalArgumentException("p and k: Array of log10 probabilities must have the same size as the array of number of sucesses: " + p.length + ", " + k.length); + + int n = 0; + double[] log10P = new double[p.length]; + for (int i = 0; i < p.length; i++) { + log10P[i] = Math.log10(p[i]); + n += k[i]; + } + return Math.pow(10, log10MultinomialProbability(n, k, log10P)); + } + + /** + * calculate the Root Mean Square of an array of integers + * + * @param x an byte[] of numbers + * @return the RMS of the specified numbers. + */ + public static double rms(final byte[] x) { + if (x.length == 0) + return 0.0; + + double rms = 0.0; + for (int i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + + /** + * calculate the Root Mean Square of an array of integers + * + * @param x an int[] of numbers + * @return the RMS of the specified numbers. + */ + public static double rms(final int[] x) { + if (x.length == 0) + return 0.0; + + double rms = 0.0; + for (int i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + + /** + * calculate the Root Mean Square of an array of doubles + * + * @param x a double[] of numbers + * @return the RMS of the specified numbers. 
+ */ + public static double rms(final Double[] x) { + if (x.length == 0) + return 0.0; + + double rms = 0.0; + for (Double i : x) + rms += i * i; + rms /= x.length; + return Math.sqrt(rms); + } + + public static double rms(final Collection l) { + if (l.size() == 0) + return 0.0; + + double rms = 0.0; + for (int i : l) + rms += i * i; + rms /= l.size(); + return Math.sqrt(rms); + } + + public static double distanceSquared(final double[] x, final double[] y) { + double dist = 0.0; + for (int iii = 0; iii < x.length; iii++) { + dist += (x[iii] - y[iii]) * (x[iii] - y[iii]); + } + return dist; + } + + public static double round(final double num, final int digits) { + double result = num * Math.pow(10.0, (double) digits); + result = Math.round(result); + result = result / Math.pow(10.0, (double) digits); + return result; + } + + /** + * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). + * + * @param array the array to be normalized + * @param takeLog10OfOutput if true, the output will be transformed back into log10 units + * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed + */ + public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput) { + return normalizeFromLog10(array, takeLog10OfOutput, false); + } + + /** + * See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space + * + * @param array + * @param takeLog10OfOutput + * @param keepInLogSpace + * + * @return + */ + public static double[] normalizeFromLog10(final double[] array, final boolean takeLog10OfOutput, final boolean keepInLogSpace) { + // for precision purposes, we need to add (or really subtract, since they're + // all negative) the largest value; also, we need to convert to normal-space. 
+ double maxValue = arrayMax(array); + + // we may decide to just normalize in log space without converting to linear space + if (keepInLogSpace) { + for (int i = 0; i < array.length; i++) { + array[i] -= maxValue; + } + return array; + } + + // default case: go to linear space + double[] normalized = new double[array.length]; + + for (int i = 0; i < array.length; i++) + normalized[i] = Math.pow(10, array[i] - maxValue); + + // normalize + double sum = 0.0; + for (int i = 0; i < array.length; i++) + sum += normalized[i]; + for (int i = 0; i < array.length; i++) { + double x = normalized[i] / sum; + if (takeLog10OfOutput) { + x = Math.log10(x); + if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) ) + x = array[i] - maxValue; + } + + normalized[i] = x; + } + + return normalized; + } + + /** + * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). + * + * @param array the array to be normalized + * @return a newly allocated array corresponding the normalized values in array + */ + public static double[] normalizeFromLog10(final double[] array) { + return normalizeFromLog10(array, false); + } + + /** + * normalizes the real-space probability array. + * + * Does not assume anything about the values in the array, beyond that no elements are below 0. It's ok + * to have values in the array of > 1, or have the sum go above 0. 
+ * + * @param array the array to be normalized + * @return a newly allocated array corresponding the normalized values in array + */ + @Requires("array != null") + @Ensures({"result != null"}) + public static double[] normalizeFromRealSpace(final double[] array) { + if ( array.length == 0 ) + return array; + + final double sum = sum(array); + final double[] normalized = new double[array.length]; + if ( sum < 0.0 ) throw new IllegalArgumentException("Values in probability array sum to a negative number " + sum); + for ( int i = 0; i < array.length; i++ ) { + normalized[i] = array[i] / sum; + } + return normalized; + } + + public static int maxElementIndex(final double[] array) { + return maxElementIndex(array, array.length); + } + + public static int maxElementIndex(final double[] array, final int start, final int endIndex) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + if (start > endIndex) { + throw new IllegalArgumentException("Start cannot be after end."); + } + + int maxI = start; + for (int i = (start+1); i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + return maxI; + } + + public static int maxElementIndex(final double[] array, final int endIndex) { + return maxElementIndex(array, 0, endIndex); + } + + public static int maxElementIndex(final int[] array) { + return maxElementIndex(array, array.length); + } + + public static int maxElementIndex(final byte[] array) { + return maxElementIndex(array, array.length); + } + + public static int maxElementIndex(final int[] array, final int endIndex) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + return maxI; + } + + public static int maxElementIndex(final byte[] array, final int endIndex) { + if (array == null || array.length == 0) + throw new 
IllegalArgumentException("Array cannot be null!"); + + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + + return maxI; + } + + public static int arrayMax(final int[] array) { + return array[maxElementIndex(array)]; + } + + + public static double arrayMax(final double[] array) { + return array[maxElementIndex(array)]; + } + + public static double arrayMax(final double[] array, final int endIndex) { + return array[maxElementIndex(array, endIndex)]; + } + + public static double arrayMin(final double[] array) { + return array[minElementIndex(array)]; + } + + public static int arrayMin(final int[] array) { + return array[minElementIndex(array)]; + } + + public static byte arrayMin(final byte[] array) { + return array[minElementIndex(array)]; + } + + /** + * Compute the min element of a List + * @param array a non-empty list of integer + * @return the min + */ + public static int arrayMin(final List array) { + if ( array == null || array.isEmpty() ) throw new IllegalArgumentException("Array must be non-null and non-empty"); + int min = array.get(0); + for ( final int i : array ) + if ( i < min ) min = i; + return min; + } + + /** + * Compute the median element of the list of integers + * @param array a list of integers + * @return the median element + */ + public static > T median(final List array) { + /* TODO -- from Valentin + the current implementation is not the usual median when the input is of even length. More concretely it returns the ith element of the list where i = floor(input.size() / 2). + + But actually that is not the "usual" definition of a median, as it is supposed to return the average of the two middle values when the sample length is an even number (i.e. median(1,2,3,4,5,6) == 3.5). 
[Sources: R and wikipedia] + + My suggestion for a solution is then: + + unify median and medianDoubles to public static T median(Collection) + check on null elements and throw an exception if there are any or perhaps return a null; documented in the javadoc. + relocate, rename and refactor MathUtils.median(X) to Utils.ithElement(X,X.size()/2) + In addition, the current median implementation sorts the whole input list witch is O(n log n). However find out the ith element (thus calculate the median) can be done in O(n) + */ + if ( array == null ) throw new IllegalArgumentException("Array must be non-null"); + final int size = array.size(); + if ( size == 0 ) throw new IllegalArgumentException("Array cannot have size 0"); + else if ( size == 1 ) return array.get(0); + else { + final ArrayList sorted = new ArrayList<>(array); + Collections.sort(sorted); + return sorted.get(size / 2); + } + } + + public static int minElementIndex(final double[] array) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) + minI = i; + } + + return minI; + } + + public static int minElementIndex(final byte[] array) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) + minI = i; + } + + return minI; + } + + public static int minElementIndex(final int[] array) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int minI = 0; + for (int i = 1; i < array.length; i++) { + if (array[i] < array[minI]) + minI = i; + } + + return minI; + } + + public static int arrayMaxInt(final List array) { + if (array == null) + throw new IllegalArgumentException("Array cannot be null!"); + if (array.size() == 0) + throw new IllegalArgumentException("Array size 
cannot be 0!"); + + int m = array.get(0); + for (int e : array) + m = Math.max(m, e); + return m; + } + + public static int sum(final List list ) { + int sum = 0; + for ( Integer i : list ) { + sum += i; + } + return sum; + } + + public static double average(final List vals, final int maxI) { + long sum = 0L; + + int i = 0; + for (long x : vals) { + if (i > maxI) + break; + sum += x; + i++; + } + + return (1.0 * sum) / i; + } + + public static double average(final List vals) { + return average(vals, vals.size()); + } + + public static int countOccurrences(final char c, final String s) { + int count = 0; + for (int i = 0; i < s.length(); i++) { + count += s.charAt(i) == c ? 1 : 0; + } + return count; + } + + public static int countOccurrences(T x, List l) { + int count = 0; + for (T y : l) { + if (x.equals(y)) + count++; + } + + return count; + } + + public static int countOccurrences(byte element, byte[] array) { + int count = 0; + for (byte y : array) { + if (element == y) + count++; + } + + return count; + } + + public static int countOccurrences(final boolean element, final boolean[] array) { + int count = 0; + for (final boolean b : array) { + if (element == b) + count++; + } + + return count; + } + + + /** + * Returns n random indices drawn with replacement from the range 0..(k-1) + * + * @param n the total number of indices sampled from + * @param k the number of random indices to draw (with replacement) + * @return a list of k random indices ranging from 0 to (n-1) with possible duplicates + */ + static public ArrayList sampleIndicesWithReplacement(final int n, final int k) { + + ArrayList chosen_balls = new ArrayList(k); + for (int i = 0; i < k; i++) { + //Integer chosen_ball = balls[rand.nextInt(k)]; + chosen_balls.add(Utils.getRandomGenerator().nextInt(n)); + //balls.remove(chosen_ball); + } + + return chosen_balls; + } + + /** + * Returns n random indices drawn without replacement from the range 0..(k-1) + * + * @param n the total number of indices 
sampled from + * @param k the number of random indices to draw (without replacement) + * @return a list of k random indices ranging from 0 to (n-1) without duplicates + */ + static public ArrayList sampleIndicesWithoutReplacement(final int n, final int k) { + ArrayList chosen_balls = new ArrayList(k); + + for (int i = 0; i < n; i++) { + chosen_balls.add(i); + } + + Collections.shuffle(chosen_balls, Utils.getRandomGenerator()); + + //return (ArrayList) chosen_balls.subList(0, k); + return new ArrayList(chosen_balls.subList(0, k)); + } + + /** + * Given a list of indices into a list, return those elements of the list with the possibility of drawing list elements multiple times + * + * @param indices the list of indices for elements to extract + * @param list the list from which the elements should be extracted + * @param the template type of the ArrayList + * @return a new ArrayList consisting of the elements at the specified indices + */ + static public ArrayList sliceListByIndices(final List indices, final List list) { + ArrayList subset = new ArrayList(); + + for (int i : indices) { + subset.add(list.get(i)); + } + + return subset; + } + + /** + * Given two log-probability vectors, compute log of vector product of them: + * in Matlab notation, return log10(10.*x'*10.^y) + * @param x vector 1 + * @param y vector 2 + * @return a double representing log (dotProd(10.^x,10.^y) + */ + public static double logDotProduct(final double [] x, final double[] y) { + if (x.length != y.length) + throw new ReviewedGATKException("BUG: Vectors of different lengths"); + + double tmpVec[] = new double[x.length]; + + for (int k=0; k < tmpVec.length; k++ ) { + tmpVec[k] = x[k]+y[k]; + } + + return log10sumLog10(tmpVec); + + + + } + + /** + * Check that the log10 prob vector vector is well formed + * + * @param vector + * @param expectedSize + * @param shouldSumToOne + * + * @return true if vector is well-formed, false otherwise + */ + public static boolean goodLog10ProbVector(final 
double[] vector, final int expectedSize, final boolean shouldSumToOne) { + if ( vector.length != expectedSize ) return false; + + for ( final double pr : vector ) { + if ( ! goodLog10Probability(pr) ) + return false; + } + + if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) + return false; + + return true; // everything is good + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value. By default allows + * -Infinity values, as log10(0.0) == -Infinity. + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result) { + return goodLog10Probability(result, true); + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value + * @param allowNegativeInfinity should we consider a -Infinity value ok? + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result, final boolean allowNegativeInfinity) { + return result <= 0.0 && result != Double.POSITIVE_INFINITY && (allowNegativeInfinity || result != Double.NEGATIVE_INFINITY) && ! Double.isNaN(result); + } + + /** + * Checks that the result is a well-formed probability + * + * @param result a supposedly well-formed probability value + * @return true if result is really well formed + */ + public static boolean goodProbability(final double result) { + return result >= 0.0 && result <= 1.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); + } + + /** + * A utility class that computes on the fly average and standard deviation for a stream of numbers. + * The number of observations does not have to be known in advance, and can be also very big (so that + * it could overflow any naive summation-based scheme or cause loss of precision). 
+ * Instead, adding a new number observed + * to a sample with add(observed) immediately updates the instance of this object so that + * it contains correct mean and standard deviation for all the numbers seen so far. Source: Knuth, vol.2 + * (see also e.g. http://www.johndcook.com/standard_deviation.html for online reference). + */ + public static class RunningAverage { + private double mean = 0.0; + private double s = 0.0; + private long obs_count = 0; + + public void add(double obs) { + obs_count++; + double oldMean = mean; + mean += (obs - mean) / obs_count; // update mean + s += (obs - oldMean) * (obs - mean); + } + + public void addAll(Collection col) { + for (Number o : col) { + add(o.doubleValue()); + } + } + + public double mean() { + return mean; + } + + public double stddev() { + return Math.sqrt(s / (obs_count - 1)); + } + + public double var() { + return s / (obs_count - 1); + } + + public long observationCount() { + return obs_count; + } + + public RunningAverage clone() { + RunningAverage ra = new RunningAverage(); + ra.mean = this.mean; + ra.s = this.s; + ra.obs_count = this.obs_count; + return ra; + } + + public void merge(RunningAverage other) { + if (this.obs_count > 0 || other.obs_count > 0) { // if we have any observations at all + this.mean = (this.mean * this.obs_count + other.mean * other.obs_count) / (this.obs_count + other.obs_count); + this.s += other.s; + } + this.obs_count += other.obs_count; + } + } + + // + // useful common utility routines + // + + static public double max(double x0, double x1, double x2) { + double a = Math.max(x0, x1); + return Math.max(a, x2); + } + + /** + * Converts LN to LOG10 + * + * @param ln log(x) + * @return log10(x) + */ + public static double lnToLog10(final double ln) { + return ln * Math.log10(Math.E); + } + + /** + * Constants to simplify the log gamma function calculation. 
+ */ + private static final double zero = 0.0, one = 1.0, half = .5, a0 = 7.72156649015328655494e-02, a1 = 3.22467033424113591611e-01, a2 = 6.73523010531292681824e-02, a3 = 2.05808084325167332806e-02, a4 = 7.38555086081402883957e-03, a5 = 2.89051383673415629091e-03, a6 = 1.19270763183362067845e-03, a7 = 5.10069792153511336608e-04, a8 = 2.20862790713908385557e-04, a9 = 1.08011567247583939954e-04, a10 = 2.52144565451257326939e-05, a11 = 4.48640949618915160150e-05, tc = 1.46163214496836224576e+00, tf = -1.21486290535849611461e-01, tt = -3.63867699703950536541e-18, t0 = 4.83836122723810047042e-01, t1 = -1.47587722994593911752e-01, t2 = 6.46249402391333854778e-02, t3 = -3.27885410759859649565e-02, t4 = 1.79706750811820387126e-02, t5 = -1.03142241298341437450e-02, t6 = 6.10053870246291332635e-03, t7 = -3.68452016781138256760e-03, t8 = 2.25964780900612472250e-03, t9 = -1.40346469989232843813e-03, t10 = 8.81081882437654011382e-04, t11 = -5.38595305356740546715e-04, t12 = 3.15632070903625950361e-04, t13 = -3.12754168375120860518e-04, t14 = 3.35529192635519073543e-04, u0 = -7.72156649015328655494e-02, u1 = 6.32827064025093366517e-01, u2 = 1.45492250137234768737e+00, u3 = 9.77717527963372745603e-01, u4 = 2.28963728064692451092e-01, u5 = 1.33810918536787660377e-02, v1 = 2.45597793713041134822e+00, v2 = 2.12848976379893395361e+00, v3 = 7.69285150456672783825e-01, v4 = 1.04222645593369134254e-01, v5 = 3.21709242282423911810e-03, s0 = -7.72156649015328655494e-02, s1 = 2.14982415960608852501e-01, s2 = 3.25778796408930981787e-01, s3 = 1.46350472652464452805e-01, s4 = 2.66422703033638609560e-02, s5 = 1.84028451407337715652e-03, s6 = 3.19475326584100867617e-05, r1 = 1.39200533467621045958e+00, r2 = 7.21935547567138069525e-01, r3 = 1.71933865632803078993e-01, r4 = 1.86459191715652901344e-02, r5 = 7.77942496381893596434e-04, r6 = 7.32668430744625636189e-06, w0 = 4.18938533204672725052e-01, w1 = 8.33333333333329678849e-02, w2 = -2.77777777728775536470e-03, w3 = 
7.93650558643019558500e-04, w4 = -5.95187557450339963135e-04, w5 = 8.36339918996282139126e-04, w6 = -1.63092934096575273989e-03; + + /** + * Efficient rounding functions to simplify the log gamma function calculation + * double to long with 32 bit shift + */ + private static final int HI(final double x) { + return (int) (Double.doubleToLongBits(x) >> 32); + } + + /** + * Efficient rounding functions to simplify the log gamma function calculation + * double to long without shift + */ + private static final int LO(final double x) { + return (int) Double.doubleToLongBits(x); + } + + /** + * Most efficent implementation of the lnGamma (FDLIBM) + * Use via the log10Gamma wrapper method. + */ + private static double lnGamma(final double x) { + double t, y, z, p, p1, p2, p3, q, r, w; + int i; + + int hx = HI(x); + int lx = LO(x); + + /* purge off +-inf, NaN, +-0, and negative arguments */ + int ix = hx & 0x7fffffff; + if (ix >= 0x7ff00000) + return Double.POSITIVE_INFINITY; + if ((ix | lx) == 0 || hx < 0) + return Double.NaN; + if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */ + return -Math.log(x); + } + + /* purge off 1 and 2 */ + if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) + r = 0; + /* for x < 2.0 */ + else if (ix < 0x40000000) { + if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ + r = -Math.log(x); + if (ix >= 0x3FE76944) { + y = one - x; + i = 0; + } + else if (ix >= 0x3FCDA661) { + y = x - (tc - one); + i = 1; + } + else { + y = x; + i = 2; + } + } + else { + r = zero; + if (ix >= 0x3FFBB4C3) { + y = 2.0 - x; + i = 0; + } /* [1.7316,2] */ + else if (ix >= 0x3FF3B4C4) { + y = x - tc; + i = 1; + } /* [1.23,1.73] */ + else { + y = x - one; + i = 2; + } + } + + switch (i) { + case 0: + z = y * y; + p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); + p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); + p = y * p1 + p2; + r += (p - 0.5 * y); + break; + case 1: + z = y * y; + w = z * y; + p1 = 
t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ + p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); + p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); + p = z * p1 - (tt - w * (p2 + y * p3)); + r += (tf + p); + break; + case 2: + p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); + p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); + r += (-0.5 * y + p1 / p2); + } + } + else if (ix < 0x40200000) { /* x < 8.0 */ + i = (int) x; + t = zero; + y = x - (double) i; + p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); + q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); + r = half * y + p / q; + z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ + switch (i) { + case 7: + z *= (y + 6.0); /* FALLTHRU */ + case 6: + z *= (y + 5.0); /* FALLTHRU */ + case 5: + z *= (y + 4.0); /* FALLTHRU */ + case 4: + z *= (y + 3.0); /* FALLTHRU */ + case 3: + z *= (y + 2.0); /* FALLTHRU */ + r += Math.log(z); + break; + } + /* 8.0 <= x < 2**58 */ + } + else if (ix < 0x43900000) { + t = Math.log(x); + z = one / x; + y = z * z; + w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); + r = (x - half) * (t - one) + w; + } + else + /* 2**58 <= x <= inf */ + r = x * (Math.log(x) - one); + return r; + } + + /** + * Calculates the log10 of the gamma function for x using the efficient FDLIBM + * implementation to avoid overflows and guarantees high accuracy even for large + * numbers. + * + * @param x the x parameter + * @return the log10 of the gamma function at x. 
+ */ + public static double log10Gamma(final double x) { + return lnToLog10(lnGamma(x)); + } + + public static double factorial(final int x) { + // avoid rounding errors caused by fact that 10^log(x) might be slightly lower than x and flooring may produce 1 less than real value + return (double)Math.round(Math.pow(10, log10Factorial(x))); + } + + public static double log10Factorial(final int x) { + if (x >= Log10FactorialCache.size() || x < 0) + return log10Gamma(x + 1); + else + return Log10FactorialCache.get(x); + } + + /** + * Wrapper class so that the log10Factorial array is only calculated if it's used + */ + private static class Log10FactorialCache { + + /** + * The size of the precomputed cache. Must be a positive number! + */ + private static final int CACHE_SIZE = 10_000; + + public static int size() { return CACHE_SIZE; } + + public static double get(final int n) { + if (cache == null) + initialize(); + return cache[n]; + } + + private static synchronized void initialize() { + if (cache == null) { + Log10Cache.ensureCacheContains(CACHE_SIZE); + cache = new double[CACHE_SIZE]; + cache[0] = 0.0; + for (int k = 1; k < cache.length; k++) + cache[k] = cache[k-1] + Log10Cache.get(k); + } + } + + private static double[] cache = null; + } + + /** + * Adds two arrays together and returns a new array with the sum. 
+ * + * @param a one array + * @param b another array + * @return a new array with the sum of a and b + */ + @Requires("a.length == b.length") + @Ensures("result.length == a.length") + public static int[] addArrays(final int[] a, final int[] b) { + int[] c = new int[a.length]; + for (int i = 0; i < a.length; i++) + c[i] = a[i] + b[i]; + return c; + } + + /** Same routine, unboxed types for efficiency + * + * @param x First vector + * @param y Second vector + * @return Vector of same length as x and y so that z[k] = x[k]+y[k] + */ + public static double[] vectorSum(final double[]x, final double[] y) { + if (x.length != y.length) + throw new ReviewedGATKException("BUG: Lengths of x and y must be the same"); + + double[] result = new double[x.length]; + for (int k=0; k log10LinearRange(final int start, final int stop, final double eps) { + final LinkedList values = new LinkedList<>(); + final double log10range = Math.log10(stop - start); + + if ( start == 0 ) + values.add(0); + + double i = 0.0; + while ( i <= log10range ) { + final int index = (int)Math.round(Math.pow(10, i)) + start; + if ( index < stop && (values.peekLast() == null || values.peekLast() != index ) ) + values.add(index); + i += eps; + } + + if ( values.peekLast() == null || values.peekLast() != stop ) + values.add(stop); + + return values; + } + + /** + * Compute in a numerical correct way the quantity log10(1-x) + * + * Uses the approximation log10(1-x) = log10(1/x - 1) + log10(x) to avoid very quick underflow + * in 1-x when x is very small + * + * @param x a positive double value between 0.0 and 1.0 + * @return an estimate of log10(1-x) + */ + @Requires("x >= 0.0 && x <= 1.0") + @Ensures("result <= 0.0") + public static double log10OneMinusX(final double x) { + if ( x == 1.0 ) + return Double.NEGATIVE_INFINITY; + else if ( x == 0.0 ) + return 0.0; + else { + final double d = Math.log10(1 / x - 1) + Math.log10(x); + return Double.isInfinite(d) || d > 0.0 ? 
0.0 : d; + } + } + + /** + * Draw N random elements from list + * @param list - the list from which to draw randomly + * @param N - the number of elements to draw + */ + public static List randomSubset(final List list, final int N) { + if (list.size() <= N) { + return list; + } + + return sliceListByIndices(sampleIndicesWithoutReplacement(list.size(),N),list); + } + + /** + * Draw N random elements from list with replacement + * @param list - the list from which to draw randomly + * @param N - the number of elements to draw + */ + public static List randomSample(final List list, final int N) { + if (list.isEmpty() ) { + return list; + } + return sliceListByIndices(sampleIndicesWithReplacement(list.size(),N),list); + } + + /** + * Return the likelihood of observing the counts of categories having sampled a population + * whose categorial frequencies are distributed according to a Dirichlet distribution + * @param dirichletParams - params of the prior dirichlet distribution + * @param dirichletSum - the sum of those parameters + * @param counts - the counts of observation in each category + * @param countSum - the sum of counts (number of trials) + * @return - associated likelihood + */ + public static double dirichletMultinomial(final double[] dirichletParams, final double dirichletSum, + final int[] counts, final int countSum) { + if ( dirichletParams.length != counts.length ) { + throw new IllegalStateException("The number of dirichlet parameters must match the number of categories"); + } + // todo -- lots of lnGammas here. 
At some point we can safely switch to x * ( ln(x) - 1) + double likelihood = log10MultinomialCoefficient(countSum,counts); + likelihood += log10Gamma(dirichletSum); + likelihood -= log10Gamma(dirichletSum+countSum); + for ( int idx = 0; idx < counts.length; idx++ ) { + likelihood += log10Gamma(counts[idx] + dirichletParams[idx]); + likelihood -= log10Gamma(dirichletParams[idx]); + } + + return likelihood; + } + + public static double dirichletMultinomial(double[] params, int[] counts) { + return dirichletMultinomial(params,sum(params),counts,(int) sum(counts)); + } + + public static ExponentialDistribution exponentialDistribution( final double mean ) { + return new ExponentialDistributionImpl(mean); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Median.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Median.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/Median.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Median.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MultiThreadedErrorTracker.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MultiThreadedErrorTracker.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/MultiThreadedErrorTracker.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/MultiThreadedErrorTracker.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/NGSPlatform.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/NGSPlatform.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/NGSPlatform.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/NGSPlatform.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/PathUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/PathUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/PathUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/PathUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/QualityUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/QualityUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/QualityUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/QualityUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/R/RScriptExecutor.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/R/RScriptExecutor.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/R/RScriptExecutor.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/R/RScriptExecutor.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/R/RScriptExecutorException.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/R/RScriptExecutorException.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/R/RScriptExecutorException.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/R/RScriptExecutorException.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/R/RScriptLibrary.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/R/RScriptLibrary.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/R/RScriptLibrary.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/R/RScriptLibrary.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/R/RUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/R/RUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/R/RUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/R/RUtils.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java new file mode 100644 index 000000000..22c7127c2 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtils.java @@ -0,0 +1,526 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Sep 10, 2010 + * Time: 1:56:24 PM + * + * A series of utility functions that enable the GATK to compare two sequence dictionaries -- from the reference, + * from BAMs, or from RODs -- for consistency. The system supports two basic modes: get an enum state that + * describes at a high level the consistency between two dictionaries, or a validateDictionaries that will + * blow up with a UserException if the dicts are too incompatible. + * + * Dictionaries are tested for contig name overlaps, consistency in ordering in these overlap set, and length, + * if available. Examines the Engine arguments to decided if the -U option to allow danger seq dict inconsistency + * is enabled before it blows up. 
+ */ +public class SequenceDictionaryUtils { + // + // for detecting lexicographically sorted human references + // + private static final boolean ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN = true; + + // hg18 + protected static final SAMSequenceRecord CHR1_HG18 = new SAMSequenceRecord("chr1", 247249719); + protected static final SAMSequenceRecord CHR2_HG18 = new SAMSequenceRecord("chr2", 242951149); + protected static final SAMSequenceRecord CHR10_HG18 = new SAMSequenceRecord("chr10", 135374737); + + // hg19 + protected static final SAMSequenceRecord CHR1_HG19 = new SAMSequenceRecord("chr1", 249250621); + protected static final SAMSequenceRecord CHR2_HG19 = new SAMSequenceRecord("chr2", 243199373); + protected static final SAMSequenceRecord CHR10_HG19 = new SAMSequenceRecord("chr10", 135534747); + + // b36 + protected static final SAMSequenceRecord CHR1_B36 = new SAMSequenceRecord("1", 247249719); + protected static final SAMSequenceRecord CHR2_B36 = new SAMSequenceRecord("2", 242951149); + protected static final SAMSequenceRecord CHR10_B36 = new SAMSequenceRecord("10", 135374737); + + // b37 + protected static final SAMSequenceRecord CHR1_B37 = new SAMSequenceRecord("1", 249250621); + protected static final SAMSequenceRecord CHR2_B37 = new SAMSequenceRecord("2", 243199373); + protected static final SAMSequenceRecord CHR10_B37 = new SAMSequenceRecord("10", 135534747); + + + public enum SequenceDictionaryCompatibility { + IDENTICAL, // the dictionaries are identical + COMMON_SUBSET, // there exists a common subset of equivalent contigs + NO_COMMON_CONTIGS, // no overlap between dictionaries + UNEQUAL_COMMON_CONTIGS, // common subset has contigs that have the same name but different lengths + NON_CANONICAL_HUMAN_ORDER, // human reference detected but the order of the contigs is non-standard (lexicographic, for examine) + OUT_OF_ORDER, // the two dictionaries overlap but the overlapping contigs occur in different + // orders with respect to each other + 
DIFFERENT_INDICES // the two dictionaries overlap and the overlapping contigs occur in the same + // order with respect to each other, but one or more of them have different + // indices in the two dictionaries. Eg., { chrM, chr1, chr2 } vs. { chr1, chr2 } + } + + /** + * @param validationExclusion exclusions to validation + * @return Returns true if the engine is in tolerant mode and we'll let through dangerous but not fatal dictionary inconsistency + */ + private static boolean allowNonFatalIncompabilities(ValidationExclusion.TYPE validationExclusion) { + return ( validationExclusion == ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY || + validationExclusion == ValidationExclusion.TYPE.ALL ); + } + + /** + * Tests for compatibility between two sequence dictionaries. If the dictionaries are incompatible, then + * UserExceptions are thrown with detailed error messages. If the engine is in permissive mode, then + * logger warnings are generated instead. + * + * @param logger for warnings + * @param validationExclusion exclusions to validation + * @param name1 name associated with dict1 + * @param dict1 the sequence dictionary dict1 + * @param name2 name associated with dict2 + * @param dict2 the sequence dictionary dict2 + * @param isReadsToReferenceComparison true if one of the dictionaries comes from a reads data source (eg., a BAM), + * and the other from a reference data source + * @param intervals the user-specified genomic intervals: only required when isReadsToReferenceComparison is true, + * otherwise can be null + */ + public static void validateDictionaries( final Logger logger, + final ValidationExclusion.TYPE validationExclusion, + final String name1, + final SAMSequenceDictionary dict1, + final String name2, + final SAMSequenceDictionary dict2, + final boolean isReadsToReferenceComparison, + final GenomeLocSortedSet intervals ) { + + final SequenceDictionaryCompatibility type = compareDictionaries(dict1, dict2); + + switch ( type ) { + case 
IDENTICAL: + return; + case COMMON_SUBSET: + return; + case NO_COMMON_CONTIGS: + throw new UserException.IncompatibleSequenceDictionaries("No overlapping contigs found", name1, dict1, name2, dict2); + + case UNEQUAL_COMMON_CONTIGS: { + List x = findDisequalCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); + SAMSequenceRecord elt1 = x.get(0); + SAMSequenceRecord elt2 = x.get(1); + + // todo -- replace with toString when SAMSequenceRecord has a nice toString routine + UserException ex = new UserException.IncompatibleSequenceDictionaries(String.format("Found contigs with the same name but different lengths:\n contig %s = %s / %d\n contig %s = %s / %d", + name1, elt1.getSequenceName(), elt1.getSequenceLength(), + name2, elt2.getSequenceName(), elt2.getSequenceLength()), + name1, dict1, name2, dict2); + + if ( allowNonFatalIncompabilities(validationExclusion) ) + logger.warn(ex.getMessage()); + else + throw ex; + break; + } + + case NON_CANONICAL_HUMAN_ORDER: { + UserException ex; + if ( nonCanonicalHumanContigOrder(dict1) ) + ex = new UserException.LexicographicallySortedSequenceDictionary(name1, dict1); + else + ex = new UserException.LexicographicallySortedSequenceDictionary(name2, dict2); + + if ( allowNonFatalIncompabilities(validationExclusion) ) + logger.warn(ex.getMessage()); + else + throw ex; + break; + } + + case OUT_OF_ORDER: { + UserException ex = new UserException.IncompatibleSequenceDictionaries("Relative ordering of overlapping contigs differs, which is unsafe", name1, dict1, name2, dict2); + if ( allowNonFatalIncompabilities(validationExclusion) ) + logger.warn(ex.getMessage()); + else + throw ex; + break; + } + + case DIFFERENT_INDICES: { + // This is currently only known to be problematic when the index mismatch is between a bam and the + // reference AND when the user's intervals actually include one or more of the contigs that are + // indexed differently from the reference. 
In this case, the engine will fail to correctly serve + // up the reads from those contigs, so throw an exception unless unsafe operations are enabled. + if ( isReadsToReferenceComparison && intervals != null ) { + + final Set misindexedContigs = findMisindexedContigsInIntervals(intervals, dict1, dict2); + + if ( ! misindexedContigs.isEmpty() ) { + final String msg = String.format("The following contigs included in the intervals to process have " + + "different indices in the sequence dictionaries for the reads vs. " + + "the reference: %s. As a result, the GATK engine will not correctly " + + "process reads from these contigs. You should either fix the sequence " + + "dictionaries for your reads so that these contigs have the same indices " + + "as in the sequence dictionary for your reference, or exclude these contigs " + + "from your intervals. This error can be disabled via -U %s, " + + "however this is not recommended as the GATK engine will not behave correctly.", + misindexedContigs, ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY); + final UserException ex = new UserException.IncompatibleSequenceDictionaries(msg, name1, dict1, name2, dict2); + + if ( allowNonFatalIncompabilities(validationExclusion) ) + logger.warn(ex.getMessage()); + else + throw ex; + } + } + break; + } + + default: + throw new ReviewedGATKException("Unexpected SequenceDictionaryComparison type: " + type); + } + } + + /** + * Workhorse routine that takes two dictionaries and returns their compatibility. 
+ * + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return A SequenceDictionaryCompatibility enum value describing the compatibility of the two dictionaries + */ + public static SequenceDictionaryCompatibility compareDictionaries( final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2) { + if ( nonCanonicalHumanContigOrder(dict1) || nonCanonicalHumanContigOrder(dict2) ) + return SequenceDictionaryCompatibility.NON_CANONICAL_HUMAN_ORDER; + + final Set commonContigs = getCommonContigsByName(dict1, dict2); + + if (commonContigs.size() == 0) + return SequenceDictionaryCompatibility.NO_COMMON_CONTIGS; + else if ( ! commonContigsHaveSameLengths(commonContigs, dict1, dict2) ) + return SequenceDictionaryCompatibility.UNEQUAL_COMMON_CONTIGS; + else if ( ! commonContigsAreInSameRelativeOrder(commonContigs, dict1, dict2) ) + return SequenceDictionaryCompatibility.OUT_OF_ORDER; + else if ( commonContigs.size() == dict1.size() && commonContigs.size() == dict2.size() ) + return SequenceDictionaryCompatibility.IDENTICAL; + else if ( ! commonContigsAreAtSameIndices(commonContigs, dict1, dict2) ) + return SequenceDictionaryCompatibility.DIFFERENT_INDICES; + else { + return SequenceDictionaryCompatibility.COMMON_SUBSET; + } + } + + /** + * Utility function that tests whether the commonContigs in both dicts are equivalent. Equivalence means + * that the seq records have the same length, if both are non-zero. + * + * @param commonContigs + * @param dict1 + * @param dict2 + * @return true if all of the common contigs are equivalent + */ + private static boolean commonContigsHaveSameLengths(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + return findDisequalCommonContigs(commonContigs, dict1, dict2) == null; + } + + /** + * Returns a List(x,y) that contains two disequal sequence records among the common contigs in both dicts. 
Returns + * null if all common contigs are equivalent + * + * @param commonContigs + * @param dict1 + * @param dict2 + * @return + */ + private static List findDisequalCommonContigs(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + for ( String name : commonContigs ) { + SAMSequenceRecord elt1 = dict1.getSequence(name); + SAMSequenceRecord elt2 = dict2.getSequence(name); + if ( ! sequenceRecordsAreEquivalent(elt1, elt2) ) + return Arrays.asList(elt1,elt2); + } + + return null; + } + + /** + * Helper routine that returns two sequence records are equivalent, defined as having the same name and + * lengths, if both are non-zero + * + * @param me + * @param that + * @return + */ + private static boolean sequenceRecordsAreEquivalent(final SAMSequenceRecord me, final SAMSequenceRecord that) { + if (me == that) return true; + if (that == null) return false; + + if (me.getSequenceLength() != 0 && that.getSequenceLength() != 0 && me.getSequenceLength() != that.getSequenceLength()) + return false; + + // todo -- reenable if we want to be really strict here +// if (me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null && that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG) != null) { +// final BigInteger thisMd5 = new BigInteger((String)me.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16); +// final BigInteger thatMd5 = new BigInteger((String)that.getExtendedAttribute(SAMSequenceRecord.MD5_TAG), 16); +// if (!thisMd5.equals(thatMd5)) { +// return false; +// } +// } +// else { + if (me.getSequenceName() != that.getSequenceName()) + return false; // Compare using == since we intern() the Strings +// } + + return true; + } + + /** + * A very simple (and naive) algorithm to determine (1) if the dict is a human reference (hg18/hg19) and if it's + * lexicographically sorted. Works by matching lengths of the static chr1, chr10, and chr2, and then if these + * are all matched, requiring that the order be chr1, chr2, chr10. 
+ * + * @param dict + * @return + */ + private static boolean nonCanonicalHumanContigOrder(SAMSequenceDictionary dict) { + if ( ! ENABLE_LEXICOGRAPHIC_REQUIREMENT_FOR_HUMAN ) // if we don't want to enable this test, just return false + return false; + + SAMSequenceRecord chr1 = null, chr2 = null, chr10 = null; + + for ( SAMSequenceRecord elt : dict.getSequences() ) { + if ( isHumanSeqRecord(elt, CHR1_HG18, CHR1_HG19 ) ) chr1 = elt; + if ( isHumanSeqRecord(elt, CHR2_HG18, CHR2_HG19 ) ) chr2 = elt; + if ( isHumanSeqRecord(elt, CHR10_HG18, CHR10_HG19 ) ) chr10 = elt; + } + + if ( chr1 != null && chr2 != null && chr10 != null) { + // we found them all + return ! ( chr1.getSequenceIndex() < chr2.getSequenceIndex() && chr2.getSequenceIndex() < chr10.getSequenceIndex() ); + } else { + return false; + } + } + + /** + * Trivial helper that returns true if elt has the same length as rec1 or rec2 + * @param elt record to test + * @param rec1 first record to test for length equivalence + * @param rec2 first record to test for length equivalence + * @return true if elt has the same length as either rec1 or rec2 + */ + private static boolean isHumanSeqRecord(SAMSequenceRecord elt, SAMSequenceRecord rec1, SAMSequenceRecord rec2 ) { + return elt.getSequenceLength() == rec1.getSequenceLength() || elt.getSequenceLength() == rec2.getSequenceLength(); + } + + /** + * Returns true if the common contigs in dict1 and dict2 are in the same relative order, without regard to + * absolute index position. 
This is accomplished by getting the common contigs in both dictionaries, sorting + * these according to their indices, and then walking through the sorted list to ensure that each ordered contig + * is equivalent + * + * @param commonContigs names of the contigs common to both dictionaries + * @param dict1 first SAMSequenceDictionary + * @param dict2 second SAMSequenceDictionary + * @return true if the common contigs occur in the same relative order in both dict1 and dict2, otherwise false + */ + private static boolean commonContigsAreInSameRelativeOrder(Set commonContigs, SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + List list1 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict1)); + List list2 = sortSequenceListByIndex(getSequencesOfName(commonContigs, dict2)); + + for ( int i = 0; i < list1.size(); i++ ) { + SAMSequenceRecord elt1 = list1.get(i); + SAMSequenceRecord elt2 = list2.get(i); + if ( ! elt1.getSequenceName().equals(elt2.getSequenceName()) ) + return false; + } + + return true; + } + + /** + * Gets the subset of SAMSequenceRecords in commonContigs in dict + * + * @param commonContigs + * @param dict + * @return + */ + private static List getSequencesOfName(Set commonContigs, SAMSequenceDictionary dict) { + List l = new ArrayList(commonContigs.size()); + for ( String name : commonContigs ) { + l.add(dict.getSequence(name) ); + } + + return l; + } + + /** + * Compares sequence records by their order + */ + private static class CompareSequenceRecordsByIndex implements Comparator { + public int compare(SAMSequenceRecord x, SAMSequenceRecord y) { + return Integer.valueOf(x.getSequenceIndex()).compareTo(y.getSequenceIndex()); + } + } + + /** + * Returns a sorted list of SAMSequenceRecords sorted by their indices. Note that the + * list is modified in place, so the returned list is == to the unsorted list. 
+ * + * @param unsorted + * @return + */ + private static List sortSequenceListByIndex(List unsorted) { + Collections.sort(unsorted, new CompareSequenceRecordsByIndex()); + return unsorted; + } + + /** + * Checks whether the common contigs in the given sequence dictionaries occur at the same indices + * in both dictionaries + * + * @param commonContigs Set of names of the contigs that occur in both dictionaries + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return true if the contigs common to dict1 and dict2 occur at the same indices in both dictionaries, + * otherwise false + */ + private static boolean commonContigsAreAtSameIndices( final Set commonContigs, final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2 ) { + for ( String commonContig : commonContigs ) { + SAMSequenceRecord dict1Record = dict1.getSequence(commonContig); + SAMSequenceRecord dict2Record = dict2.getSequence(commonContig); + + // Each common contig must have the same index in both dictionaries + if ( dict1Record.getSequenceIndex() != dict2Record.getSequenceIndex() ) { + return false; + } + } + + return true; + } + + /** + * Gets the set of names of the contigs found in both sequence dictionaries that have different indices + * in the two dictionaries. + * + * @param commonContigs Set of names of the contigs common to both dictionaries + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return a Set containing the names of the common contigs indexed differently in dict1 vs. 
dict2, + * or an empty Set if there are no such contigs + */ + private static Set getDifferentlyIndexedCommonContigs( final Set commonContigs, + final SAMSequenceDictionary dict1, + final SAMSequenceDictionary dict2 ) { + + final Set differentlyIndexedCommonContigs = new LinkedHashSet(Utils.optimumHashSize(commonContigs.size())); + + for ( String commonContig : commonContigs ) { + if ( dict1.getSequence(commonContig).getSequenceIndex() != dict2.getSequence(commonContig).getSequenceIndex() ) { + differentlyIndexedCommonContigs.add(commonContig); + } + } + + return differentlyIndexedCommonContigs; + } + + /** + * Finds the names of any contigs indexed differently in the two sequence dictionaries that also + * occur in the provided set of intervals. + * + * @param intervals GenomeLocSortedSet containing the intervals to check + * @param dict1 first sequence dictionary + * @param dict2 second sequence dictionary + * @return a Set of the names of the contigs indexed differently in dict1 vs dict2 that also + * occur in the provided intervals, or an empty Set if there are no such contigs + */ + private static Set findMisindexedContigsInIntervals( final GenomeLocSortedSet intervals, + final SAMSequenceDictionary dict1, + final SAMSequenceDictionary dict2 ) { + + final Set differentlyIndexedCommonContigs = getDifferentlyIndexedCommonContigs(getCommonContigsByName(dict1, dict2), dict1, dict2); + final Set misindexedContigsInIntervals = new LinkedHashSet(Utils.optimumHashSize(differentlyIndexedCommonContigs.size())); + + // We know differentlyIndexedCommonContigs is a HashSet, so this loop is O(intervals) + for ( GenomeLoc interval : intervals ) { + if ( differentlyIndexedCommonContigs.contains(interval.getContig()) ) { + misindexedContigsInIntervals.add(interval.getContig()); + } + } + + return misindexedContigsInIntervals; + } + + /** + * Returns the set of contig names found in both dicts. 
+ * @param dict1 + * @param dict2 + * @return + */ + public static Set getCommonContigsByName(SAMSequenceDictionary dict1, SAMSequenceDictionary dict2) { + Set intersectingSequenceNames = getContigNames(dict1); + intersectingSequenceNames.retainAll(getContigNames(dict2)); + return intersectingSequenceNames; + } + + public static Set getContigNames(SAMSequenceDictionary dict) { + Set contigNames = new HashSet(Utils.optimumHashSize(dict.size())); + for (SAMSequenceRecord dictionaryEntry : dict.getSequences()) + contigNames.add(dictionaryEntry.getSequenceName()); + return contigNames; + } + + /** + * Returns a compact String representation of the sequence dictionary it's passed + * + * The format of the returned String is: + * [ contig1Name(length: contig1Length) contig2Name(length: contig2Length) ... ] + * + * @param dict a non-null SAMSequenceDictionary + * @return A String containing all of the contig names and lengths from the sequence dictionary it's passed + */ + public static String getDictionaryAsString( final SAMSequenceDictionary dict ) { + if ( dict == null ) { + throw new IllegalArgumentException("Sequence dictionary must be non-null"); + } + + StringBuilder s = new StringBuilder("[ "); + + for ( SAMSequenceRecord dictionaryEntry : dict.getSequences() ) { + s.append(dictionaryEntry.getSequenceName()); + s.append("(length:"); + s.append(dictionaryEntry.getSequenceLength()); + s.append(") "); + } + + s.append("]"); + + return s.toString(); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/UnvalidatingGenomeLoc.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/UnvalidatingGenomeLoc.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/UnvalidatingGenomeLoc.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/UnvalidatingGenomeLoc.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Utils.java 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Utils.java new file mode 100644 index 000000000..408fd9e9f --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/Utils.java @@ -0,0 +1,1117 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; + +import java.lang.reflect.Array; +import java.math.BigInteger; +import java.net.InetAddress; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Feb 24, 2009 + * Time: 10:12:31 AM + * To change this template use File | Settings | File Templates. + */ +public class Utils { + /** + * Static random number generator and seed. 
+ */ + private static final long GATK_RANDOM_SEED = 47382911L; + private static Random randomGenerator = new Random(GATK_RANDOM_SEED); + public static Random getRandomGenerator() { return randomGenerator; } + public static void resetRandomGenerator() { randomGenerator.setSeed(GATK_RANDOM_SEED); } + public static void resetRandomGenerator(long seed) { randomGenerator.setSeed(seed); } + + /** our log, which we want to capture anything from this class */ + private static Logger logger = Logger.getLogger(Utils.class); + + public static final float JAVA_DEFAULT_HASH_LOAD_FACTOR = 0.75f; + + /** + * Boolean xor operation. Only true if x != y. + * + * @param x a boolean + * @param y a boolean + * @return true if x != y + */ + public static boolean xor(final boolean x, final boolean y) { + return x != y; + } + + /** + * Calculates the optimum initial size for a hash table given the maximum number + * of elements it will need to hold. The optimum size is the smallest size that + * is guaranteed not to result in any rehash/table-resize operations. + * + * @param maxElements The maximum number of elements you expect the hash table + * will need to hold + * @return The optimum initial size for the table, given maxElements + */ + public static int optimumHashSize ( int maxElements ) { + return (int)(maxElements / JAVA_DEFAULT_HASH_LOAD_FACTOR) + 2; + } + + /** + * Compares two objects, either of which might be null. + * + * @param lhs One object to compare. + * @param rhs The other object to compare. + * + * @return True if the two objects are equal, false otherwise. 
+ */ + public static boolean equals(Object lhs, Object rhs) { + return lhs == null && rhs == null || lhs != null && lhs.equals(rhs); + } + + public static List cons(final T elt, final List l) { + List l2 = new ArrayList(); + l2.add(elt); + if (l != null) l2.addAll(l); + return l2; + } + + public static void warnUser(final String msg) { + warnUser(logger, msg); + } + + public static void warnUser(final Logger logger, final String msg) { + logger.warn(String.format("********************************************************************************")); + logger.warn(String.format("* WARNING:")); + logger.warn(String.format("*")); + prettyPrintWarningMessage(logger, msg); + logger.warn(String.format("********************************************************************************")); + } + + /** + * pretty print the warning message supplied + * + * @param logger logger for the message + * @param message the message + */ + private static void prettyPrintWarningMessage(Logger logger, String message) { + StringBuilder builder = new StringBuilder(message); + while (builder.length() > 70) { + int space = builder.lastIndexOf(" ", 70); + if (space <= 0) space = 70; + logger.warn(String.format("* %s", builder.substring(0, space))); + builder.delete(0, space + 1); + } + logger.warn(String.format("* %s", builder)); + } + + /** + * join the key value pairs of a map into one string, i.e. myMap = [A->1,B->2,C->3] with a call of: + * joinMap("-","*",myMap) -> returns A-1*B-2*C-3 + * + * Be forewarned, if you're not using a map that is aware of the ordering (i.e. HashMap instead of LinkedHashMap) + * the ordering of the string you get back might not be what you expect! (i.e. 
C-3*A-1*B-2 vrs A-1*B-2*C-3) + * + * @param keyValueSeperator the string to seperate the key-value pairs + * @param recordSeperator the string to use to seperate each key-value pair from other key-value pairs + * @param map the map to draw from + * @param the map's key type + * @param the map's value type + * @return a string representing the joined map + */ + public static String joinMap(String keyValueSeperator, String recordSeperator, Map map) { + if (map.size() < 1) { return null; } + String joinedKeyValues[] = new String[map.size()]; + int index = 0; + for (L key : map.keySet()) { + joinedKeyValues[index++] = String.format("%s%s%s",key.toString(),keyValueSeperator,map.get(key).toString()); + } + return join(recordSeperator,joinedKeyValues); + } + + /** + * Splits a String using indexOf instead of regex to speed things up. + * + * @param str the string to split. + * @param delimiter the delimiter used to split the string. + * @return an array of tokens. + */ + public static ArrayList split(String str, String delimiter) { + return split(str, delimiter, 10); + } + + /** + * Splits a String using indexOf instead of regex to speed things up. + * + * @param str the string to split. + * @param delimiter the delimiter used to split the string. + * @param expectedNumTokens The number of tokens expected. This is used to initialize the ArrayList. + * @return an array of tokens. + */ + public static ArrayList split(String str, String delimiter, int expectedNumTokens) { + final ArrayList result = new ArrayList(expectedNumTokens); + + int delimiterIdx = -1; + do { + final int tokenStartIdx = delimiterIdx + 1; + delimiterIdx = str.indexOf(delimiter, tokenStartIdx); + final String token = (delimiterIdx != -1 ? 
str.substring(tokenStartIdx, delimiterIdx) : str.substring(tokenStartIdx) ); + result.add(token); + } while( delimiterIdx != -1 ); + + return result; + } + + + /** + * join an array of strings given a seperator + * @param separator the string to insert between each array element + * @param strings the array of strings + * @return a string, which is the joining of all array values with the separator + */ + public static String join(String separator, String[] strings) { + return join(separator, strings, 0, strings.length); + } + + public static String join(String separator, String[] strings, int start, int end) { + if ((end - start) == 0) { + return ""; + } + StringBuilder ret = new StringBuilder(strings[start]); + for (int i = start + 1; i < end; ++i) { + ret.append(separator); + ret.append(strings[i]); + } + return ret.toString(); + } + + public static String join(String separator, int[] ints) { + if ( ints == null || ints.length == 0) + return ""; + else { + StringBuilder ret = new StringBuilder(); + ret.append(ints[0]); + for (int i = 1; i < ints.length; ++i) { + ret.append(separator); + ret.append(ints[i]); + } + return ret.toString(); + } + } + + /** + * Create a new list that contains the elements of left along with elements elts + * @param left a non-null list of elements + * @param elts a varargs vector for elts to append in order to left + * @return A newly allocated linked list containing left followed by elts + */ + public static List append(final List left, T ... 
elts) { + final List l = new LinkedList(left); + l.addAll(Arrays.asList(elts)); + return l; + } + + /** + * Returns a string of the values in joined by separator, such as A,B,C + * + * @param separator separator character + * @param doubles the array with values + * @return a string with the values separated by the separator + */ + public static String join(String separator, double[] doubles) { + if ( doubles == null || doubles.length == 0) + return ""; + else { + StringBuilder ret = new StringBuilder(); + ret.append(doubles[0]); + for (int i = 1; i < doubles.length; ++i) { + ret.append(separator); + ret.append(doubles[i]); + } + return ret.toString(); + } + } + + /** + * Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of + * elti objects (note there's no actual space between sep and the elti elements). Returns + * "" if collection is empty. If collection contains just elt, then returns elt.toString() + * + * @param separator the string to use to separate objects + * @param objects a collection of objects. the element order is defined by the iterator over objects + * @param the type of the objects + * @return a non-null string + */ + public static String join(final String separator, final Collection objects) { + if (objects.isEmpty()) { // fast path for empty collection + return ""; + } else { + final Iterator iter = objects.iterator(); + final T first = iter.next(); + + if ( ! iter.hasNext() ) // fast path for singleton collections + return first.toString(); + else { // full path for 2+ collection that actually need a join + final StringBuilder ret = new StringBuilder(first.toString()); + while(iter.hasNext()) { + ret.append(separator); + ret.append(iter.next().toString()); + } + return ret.toString(); + } + } + } + + /** + * Returns a {@link List List<Integer>} representation of an primitive int array. + * @param values the primitive int array to represent. + * @return never code {@code null}. 
The returned list will be unmodifiable yet it will reflect changes in values in the original array yet + * you cannot change the values + */ + public static List asList(final int ... values) { + if (values == null) + throw new IllegalArgumentException("the input array cannot be null"); + return new AbstractList() { + + @Override + public Integer get(final int index) { + return values[index]; + } + + @Override + public int size() { + return values.length; + } + }; + } + + /** + * Returns a {@link List List<Double>} representation of an primitive double array. + * @param values the primitive int array to represent. + * @return never code {@code null}. The returned list will be unmodifiable yet it will reflect changes in values in the original array yet + * you cannot change the values. + */ + public static List asList(final double ... values) { + if (values == null) + throw new IllegalArgumentException("the input array cannot be null"); + return new AbstractList() { + + @Override + public Double get(final int index) { + return values[index]; + } + + @Override + public int size() { + return values.length; + } + }; + } + + public static String join(final String separator, final T ... objects) { + return join(separator, Arrays.asList(objects)); + } + + /** + * Create a new string thats a n duplicate copies of s + * @param s the string to duplicate + * @param nCopies how many copies? 
+ * @return a string + */ + public static String dupString(final String s, int nCopies) { + if ( s == null || s.equals("") ) throw new IllegalArgumentException("Bad s " + s); + if ( nCopies < 0 ) throw new IllegalArgumentException("nCopies must be >= 0 but got " + nCopies); + + final StringBuilder b = new StringBuilder(); + for ( int i = 0; i < nCopies; i++ ) + b.append(s); + return b.toString(); + } + + public static String dupString(char c, int nCopies) { + char[] chars = new char[nCopies]; + Arrays.fill(chars, c); + return new String(chars); + } + + public static byte[] dupBytes(byte b, int nCopies) { + byte[] bytes = new byte[nCopies]; + Arrays.fill(bytes, b); + return bytes; + } + + // trim a string for the given character (i.e. not just whitespace) + public static String trim(String str, char ch) { + char[] array = str.toCharArray(); + + + int start = 0; + while ( start < array.length && array[start] == ch ) + start++; + + int end = array.length - 1; + while ( end > start && array[end] == ch ) + end--; + + return str.substring(start, end+1); + } + + /** + * Splits expressions in command args by spaces and returns the array of expressions. + * Expressions may use single or double quotes to group any individual expression, but not both. + * @param args Arguments to parse. + * @return Parsed expressions. + */ + public static String[] escapeExpressions(String args) { + // special case for ' and " so we can allow expressions + if (args.indexOf('\'') != -1) + return escapeExpressions(args, "'"); + else if (args.indexOf('\"') != -1) + return escapeExpressions(args, "\""); + else + return args.trim().split(" +"); + } + + /** + * Splits expressions in command args by spaces and the supplied delimiter and returns the array of expressions. + * @param args Arguments to parse. + * @param delimiter Delimiter for grouping expressions. + * @return Parsed expressions. 
+ */ + private static String[] escapeExpressions(String args, String delimiter) { + String[] command = {}; + String[] split = args.split(delimiter); + String arg; + for (int i = 0; i < split.length - 1; i += 2) { + arg = split[i].trim(); + if (arg.length() > 0) // if the unescaped arg has a size + command = Utils.concatArrays(command, arg.split(" +")); + command = Utils.concatArrays(command, new String[]{split[i + 1]}); + } + arg = split[split.length - 1].trim(); + if (split.length % 2 == 1) // if the command ends with a delimiter + if (arg.length() > 0) // if the last unescaped arg has a size + command = Utils.concatArrays(command, arg.split(" +")); + return command; + } + + /** + * Concatenates two String arrays. + * @param A First array. + * @param B Second array. + * @return Concatenation of A then B. + */ + public static String[] concatArrays(String[] A, String[] B) { + String[] C = new String[A.length + B.length]; + System.arraycopy(A, 0, C, 0, A.length); + System.arraycopy(B, 0, C, A.length, B.length); + return C; + } + + /** + * Concatenates byte arrays + * @return a concat of all bytes in allBytes in order + */ + public static byte[] concat(final byte[] ... allBytes) { + int size = 0; + for ( final byte[] bytes : allBytes ) size += bytes.length; + + final byte[] c = new byte[size]; + int offset = 0; + for ( final byte[] bytes : allBytes ) { + System.arraycopy(bytes, 0, c, offset, bytes.length); + offset += bytes.length; + } + + return c; + } + + /** + * Appends String(s) B to array A. + * @param A First array. + * @param B Strings to append. + * @return A with B(s) appended. + */ + public static String[] appendArray(String[] A, String... 
B) { + return concatArrays(A, B); + } + + public static > List sorted(Collection c) { + return sorted(c, false); + } + + public static > List sorted(Collection c, boolean reverse) { + List l = new ArrayList(c); + Collections.sort(l); + if ( reverse ) Collections.reverse(l); + return l; + } + + public static , V> List sorted(Map c) { + return sorted(c, false); + } + + public static , V> List sorted(Map c, boolean reverse) { + List t = new ArrayList(c.keySet()); + Collections.sort(t); + if ( reverse ) Collections.reverse(t); + + List l = new ArrayList(); + for ( T k : t ) { + l.add(c.get(k)); + } + return l; + } + + /** + * Reverse a byte array of bases + * + * @param bases the byte array of bases + * @return the reverse of the base byte array + */ + static public byte[] reverse(byte[] bases) { + byte[] rcbases = new byte[bases.length]; + + for (int i = 0; i < bases.length; i++) { + rcbases[i] = bases[bases.length - i - 1]; + } + + return rcbases; + } + + static public List reverse(final List l) { + final List newL = new ArrayList(l); + Collections.reverse(newL); + return newL; + } + + /** + * Reverse an int array of bases + * + * @param bases the int array of bases + * @return the reverse of the base int array + */ + static public int[] reverse(int[] bases) { + int[] rcbases = new int[bases.length]; + + for (int i = 0; i < bases.length; i++) { + rcbases[i] = bases[bases.length - i - 1]; + } + + return rcbases; + } + + /** + * Reverse (NOT reverse-complement!!) a string + * + * @param bases input string + * @return the reversed string + */ + static public String reverse(String bases) { + return new String( reverse( bases.getBytes() )) ; + } + + public static boolean isFlagSet(int value, int flag) { + return ((value & flag) == flag); + } + + /** + * Helper utility that calls into the InetAddress system to resolve the hostname. If this fails, + * unresolvable gets returned instead. 
+ */ + public static String resolveHostname() { + try { + return InetAddress.getLocalHost().getCanonicalHostName(); + } + catch (java.net.UnknownHostException uhe) { // [beware typo in code sample -dmw] + return "unresolvable"; + // handle exception + } + } + + + public static byte [] arrayFromArrayWithLength(byte[] array, int length) { + byte [] output = new byte[length]; + for (int j = 0; j < length; j++) + output[j] = array[(j % array.length)]; + return output; + } + + public static void fillArrayWithByte(byte[] array, byte value) { + for (int i=0; i int nCombinations(final Collection[] options) { + int nStates = 1; + for ( Collection states : options ) { + nStates *= states.size(); + } + return nStates; + } + + @Requires("options != null") + public static int nCombinations(final List> options) { + if ( options.isEmpty() ) + return 0; + else { + int nStates = 1; + for ( Collection states : options ) { + nStates *= states.size(); + } + return nStates; + } + } + + /** + * Make all combinations of N size of objects + * + * if objects = [A, B, C] + * if N = 1 => [[A], [B], [C]] + * if N = 2 => [[A, A], [B, A], [C, A], [A, B], [B, B], [C, B], [A, C], [B, C], [C, C]] + * + * @param objects list of objects + * @param n size of each combination + * @param withReplacement if false, the resulting permutations will only contain unique objects from objects + * @return a list with all combinations with size n of objects. + */ + public static List> makePermutations(final List objects, final int n, final boolean withReplacement) { + final List> combinations = new ArrayList>(); + + if ( n == 1 ) { + for ( final T o : objects ) + combinations.add(Collections.singletonList(o)); + } else if (n > 1) { + final List> sub = makePermutations(objects, n - 1, withReplacement); + for ( List subI : sub ) { + for ( final T a : objects ) { + if ( withReplacement || ! 
subI.contains(a) ) + combinations.add(Utils.cons(a, subI)); + } + } + } + + return combinations; + } + + /** + * Convenience function that formats the novelty rate as a %.2f string + * + * @param known number of variants from all that are known + * @param all number of all variants + * @return a String novelty rate, or NA if all == 0 + */ + public static String formattedNoveltyRate(final int known, final int all) { + return formattedPercent(all - known, all); + } + + /** + * Convenience function that formats the novelty rate as a %.2f string + * + * @param x number of objects part of total that meet some criteria + * @param total count of all objects, including x + * @return a String percent rate, or NA if total == 0 + */ + public static String formattedPercent(final long x, final long total) { + return total == 0 ? "NA" : String.format("%.2f", (100.0*x) / total); + } + + /** + * Convenience function that formats a ratio as a %.2f string + * + * @param num number of observations in the numerator + * @param denom number of observations in the denumerator + * @return a String formatted ratio, or NA if all == 0 + */ + public static String formattedRatio(final long num, final long denom) { + return denom == 0 ? "NA" : String.format("%.2f", num / (1.0 * denom)); + } + + /** + * Adds element from an array into a collection. + * + * In the event of exception being throw due to some element, dest might have been modified by + * the successful addition of element before that one. + * + * @param dest the destination collection which cannot be null and should be able to accept + * the input elements. + * @param elements the element to add to dest + * @param collection type element. + * @throws UnsupportedOperationException if the add operation + * is not supported by dest. + * @throws ClassCastException if the class of any of the elements + * prevents it from being added to dest. 
+ * @throws NullPointerException if any of the elements is null and dest + * does not permit null elements + * @throws IllegalArgumentException if some property of any of the elements + * prevents it from being added to this collection + * @throws IllegalStateException if any of the elements cannot be added at this + * time due to insertion restrictions. + * @return true if the collection was modified as a result. + */ + public static boolean addAll(Collection dest, T ... elements) { + boolean result = false; + for (final T e : elements) { + result = dest.add(e) | result; + } + return result; + } + + /** + * Create a constant map that maps each value in values to itself + */ + public static Map makeIdentityFunctionMap(Collection values) { + Map map = new HashMap(values.size()); + for ( final T value : values ) + map.put(value, value); + return Collections.unmodifiableMap(map); + } + + /** + * Divides the input list into a list of sublists, which contains group size elements (except potentially the last one) + * + * list = [A, B, C, D, E] + * groupSize = 2 + * result = [[A, B], [C, D], [E]] + * + */ + public static List> groupList(final List list, final int groupSize) { + if ( groupSize < 1 ) throw new IllegalArgumentException("groupSize >= 1"); + + final List> subLists = new LinkedList>(); + int n = list.size(); + for ( int i = 0; i < n; i += groupSize ) { + subLists.add(list.subList(i, Math.min(i + groupSize, n))); + } + return subLists; + } + + /** + * @see #calcMD5(byte[]) + */ + public static String calcMD5(final String s) { + return calcMD5(s.getBytes()); + } + + /** + * Calculate the md5 for bytes, and return the result as a 32 character string + * + * @param bytes the bytes to calculate the md5 of + * @return the md5 of bytes, as a 32-character long string + */ + @Ensures({"result != null", "result.length() == 32"}) + public static String calcMD5(final byte[] bytes) { + if ( bytes == null ) throw new IllegalArgumentException("bytes cannot be null"); + try { 
+ final byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytes); + final BigInteger bigInt = new BigInteger(1, thedigest); + + String md5String = bigInt.toString(16); + while (md5String.length() < 32) md5String = "0" + md5String; // pad to length 32 + return md5String; + } + catch ( NoSuchAlgorithmException e ) { + throw new IllegalStateException("MD5 digest algorithm not present"); + } + } + + /** + * Does big end with the exact sequence of bytes in suffix? + * + * @param big a non-null byte[] to test if it a prefix + suffix + * @param suffix a non-null byte[] to test if it's a suffix of big + * @return true if big is proper byte[] composed of some prefix + suffix + */ + public static boolean endsWith(final byte[] big, final byte[] suffix) { + if ( big == null ) throw new IllegalArgumentException("big cannot be null"); + if ( suffix == null ) throw new IllegalArgumentException("suffix cannot be null"); + return new String(big).endsWith(new String(suffix)); + } + + /** + * Get the length of the longest common prefix of seq1 and seq2 + * @param seq1 non-null byte array + * @param seq2 non-null byte array + * @param maxLength the maximum allowed length to return + * @return the length of the longest common prefix of seq1 and seq2, >= 0 + */ + public static int longestCommonPrefix(final byte[] seq1, final byte[] seq2, final int maxLength) { + if ( seq1 == null ) throw new IllegalArgumentException("seq1 is null"); + if ( seq2 == null ) throw new IllegalArgumentException("seq2 is null"); + if ( maxLength < 0 ) throw new IllegalArgumentException("maxLength < 0 " + maxLength); + + final int end = Math.min(seq1.length, Math.min(seq2.length, maxLength)); + for ( int i = 0; i < end; i++ ) { + if ( seq1[i] != seq2[i] ) + return i; + } + return end; + } + + /** + * Get the length of the longest common suffix of seq1 and seq2 + * @param seq1 non-null byte array + * @param seq2 non-null byte array + * @param maxLength the maximum allowed length to return + * @return 
the length of the longest common suffix of seq1 and seq2, >= 0 + */ + public static int longestCommonSuffix(final byte[] seq1, final byte[] seq2, final int maxLength) { + if ( seq1 == null ) throw new IllegalArgumentException("seq1 is null"); + if ( seq2 == null ) throw new IllegalArgumentException("seq2 is null"); + if ( maxLength < 0 ) throw new IllegalArgumentException("maxLength < 0 " + maxLength); + + final int end = Math.min(seq1.length, Math.min(seq2.length, maxLength)); + for ( int i = 0; i < end; i++ ) { + if ( seq1[seq1.length - i - 1] != seq2[seq2.length - i - 1] ) + return i; + } + return end; + } + + /** + * Trim any number of bases from the front and/or back of an array + * + * @param seq the sequence to trim + * @param trimFromFront how much to trim from the front + * @param trimFromBack how much to trim from the back + * @return a non-null array; can be the original array (i.e. not a copy) + */ + public static byte[] trimArray(final byte[] seq, final int trimFromFront, final int trimFromBack) { + if ( trimFromFront + trimFromBack > seq.length ) + throw new IllegalArgumentException("trimming total is larger than the original array"); + + // don't perform array copies if we need to copy everything anyways + return ( trimFromFront == 0 && trimFromBack == 0 ) ? seq : Arrays.copyOfRange(seq, trimFromFront, seq.length - trimFromBack); + } + + /** + * Simple wrapper for sticking elements of a int[] array into a List + * @param ar - the array whose elements should be listified + * @return - a List where each element has the same value as the corresponding index in @ar + */ + public static List listFromPrimitives(final int[] ar) { + final ArrayList lst = new ArrayList<>(ar.length); + for ( final int d : ar ) { + lst.add(d); + } + + return lst; + } + + /** + * Compares sections from to byte arrays to verify whether they contain the same values. + * + * @param left first array to compare. + * @param leftOffset first position of the first array to compare. 
+ * @param right second array to compare. + * @param rightOffset first position of the second array to compare. + * @param length number of positions to compare. + * + * @throws IllegalArgumentException if

 + * <ul>
 + *     <li>either {@code left} or {@code right} is {@code null} or</li>
 + *     <li>any of the offset or length combinations point outside either of the two arrays</li>
 + * </ul>
+ * @return {@code true} iff {@code length} is 0 or all the bytes in both ranges are the same two-by-two. + */ + public static boolean equalRange(final byte[] left, final int leftOffset, byte[] right, final int rightOffset, final int length) { + if (left == null) throw new IllegalArgumentException("left cannot be null"); + if (right == null) throw new IllegalArgumentException("right cannot be null"); + if (length < 0) throw new IllegalArgumentException("the length cannot be negative"); + if (leftOffset < 0) throw new IllegalArgumentException("left offset cannot be negative"); + if (leftOffset + length > left.length) throw new IllegalArgumentException("length goes beyond end of left array"); + if (rightOffset < 0) throw new IllegalArgumentException("right offset cannot be negative"); + if (rightOffset + length > right.length) throw new IllegalArgumentException("length goes beyond end of right array"); + + for (int i = 0; i < length; i++) + if (left[leftOffset + i] != right[rightOffset + i]) + return false; + return true; + } + + /** + * Skims out positions of an array returning a shorter one with the remaning positions in the same order. + * @param original the original array to splice. + * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), + * or retained ({@code false}) + * + * @param the array type. + * + * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, + * or {@code remove length is different to {@code original}'s}, or {@code original} is not in + * fact an array. + * + * @return never {@code null}. + */ + public static T skimArray(final T original, final boolean[] remove) { + return skimArray(original,0,null,0,remove,0); + } + + /** + * Skims out positions of an array returning a shorter one with the remaning positions in the same order. + * + *

+ * If the {@code dest} array provide is not long enough a new one will be created and returned with the + * same component type. All elements before {@code destOffset} will be copied from the input to the + * result array. If {@code dest} is {@code null}, a brand-new array large enough will be created where + * the position preceding {@code destOffset} will be left with the default value. The component type + * Will match the one of the {@code source} array. + *

+ * + * @param source the original array to splice. + * @param sourceOffset the first position to skim. + * @param dest the destination array. + * @param destOffset the first position where to copy the skimed array values. + * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), + * or retained ({@code false}) + * @param removeOffset the first position in the remove index array to consider. + * + * @param the array type. + * + * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, + * or {@code remove length is different to {@code original}'s}, or {@code original} is not in + * fact an array. + * + * @return never {@code null}. + */ + public static T skimArray(final T source, final int sourceOffset, final T dest, final int destOffset, final boolean[] remove, final int removeOffset) { + if (source == null) + throw new IllegalArgumentException("the source array cannot be null"); + @SuppressWarnings("unchecked") + final Class sourceClazz = (Class) source.getClass(); + + if (!sourceClazz.isArray()) + throw new IllegalArgumentException("the source array is not in fact an array instance"); + final int length = Array.getLength(source) - sourceOffset; + if (length < 0) + throw new IllegalArgumentException("the source offset goes beyond the source array length"); + return skimArray(source,sourceOffset,dest,destOffset,remove,removeOffset,length); + } + + /** + * Skims out positions of an array returning a shorter one with the remaning positions in the same order. + * + *

+ * If the {@code dest} array provide is not long enough a new one will be created and returned with the + * same component type. All elements before {@code destOffset} will be copied from the input to the + * result array. If {@code dest} is {@code null}, a brand-new array large enough will be created where + * the position preceding {@code destOffset} will be left with the default value. The component type + * Will match the one of the {@code source} array. + *

+ * + * @param source the original array to splice. + * @param sourceOffset the first position to skim. + * @param dest the destination array. + * @param destOffset the first position where to copy the skimed array values. + * @param remove for each position in {@code original} indicates whether it should be spliced away ({@code true}), + * or retained ({@code false}) + * @param removeOffset the first position in the remove index array to consider. + * @param length the total number of position in {@code source} to consider. Thus only the {@code sourceOffset} to + * {@code sourceOffset + length - 1} region will be skimmed. + * + * @param the array type. + * + * @throws IllegalArgumentException if either {@code original} or {@code remove} is {@code null}, + * or {@code remove length is different to {@code original}'s}, or {@code original} is not in + * fact an array. + * + * @return never {@code null}. + */ + public static T skimArray(final T source, final int sourceOffset, final T dest, final int destOffset, + final boolean[] remove, final int removeOffset, final int length) { + if (source == null) + throw new IllegalArgumentException("the source array cannot be null"); + if (remove == null) + throw new IllegalArgumentException("the remove array cannot be null"); + if (sourceOffset < 0) + throw new IllegalArgumentException("the source array offset cannot be negative"); + if (destOffset < 0) + throw new IllegalArgumentException("the destination array offset cannot be negative"); + if (removeOffset < 0) + throw new IllegalArgumentException("the remove array offset cannot be negative"); + if (length < 0) + throw new IllegalArgumentException("the length provided cannot be negative"); + + final int removeLength = Math.min(remove.length - removeOffset,length); + + if (removeLength < 0) + throw new IllegalArgumentException("the remove offset provided falls beyond the remove array end"); + + + @SuppressWarnings("unchecked") + final Class sourceClazz = (Class) 
source.getClass(); + + if (!sourceClazz.isArray()) + throw new IllegalArgumentException("the source array is not in fact an array instance"); + + final Class destClazz = skimArrayDetermineDestArrayClass(dest, sourceClazz); + + final int sourceLength = Array.getLength(source); + + if (sourceLength < length + sourceOffset) + throw new IllegalArgumentException("the source array is too small considering length and offset"); + + // count how many positions are to be removed. + + int removeCount = 0; + + final int removeEnd = removeLength + removeOffset; + for (int i = removeOffset; i < removeEnd; i++) + if (remove[i]) removeCount++; + + + final int newLength = length - removeCount; + + + @SuppressWarnings("unchecked") + final T result = skimArrayBuildResultArray(dest, destOffset, destClazz, newLength); + // No removals, just copy the whole thing. + + if (removeCount == 0) + System.arraycopy(source,sourceOffset,result,destOffset,length); + else if (length > 0) { // if length == 0 nothing to do. + int nextOriginalIndex = 0; + int nextNewIndex = 0; + int nextRemoveIndex = removeOffset; + while (nextOriginalIndex < length && nextNewIndex < newLength) { + while (nextRemoveIndex < removeEnd && remove[nextRemoveIndex++]) { nextOriginalIndex++; } // skip positions to be spliced. + // Since we make the nextNewIndex < newLength check in the while condition + // there is no need to include the following break, as is guaranteed not to be true: + // if (nextOriginalIndex >= length) break; // we reach the final (last positions are to be spliced. 
+ final int copyStart = nextOriginalIndex; + while (++nextOriginalIndex < length && (nextRemoveIndex >= removeEnd || !remove[nextRemoveIndex])) { nextRemoveIndex++; } + final int copyEnd = nextOriginalIndex; + final int copyLength = copyEnd - copyStart; + System.arraycopy(source, sourceOffset + copyStart, result, destOffset + nextNewIndex, copyLength); + nextNewIndex += copyLength; + } + } + return result; + } + + private static T skimArrayBuildResultArray(final T dest, final int destOffset, final Class destClazz, final int newLength) { + @SuppressWarnings("unchecked") + final T result; + + if (dest == null) + result = (T) Array.newInstance(destClazz.getComponentType(), newLength + destOffset); + else if (Array.getLength(dest) < newLength + destOffset) { + result = (T) Array.newInstance(destClazz.getComponentType(),newLength + destOffset); + if (destOffset > 0) System.arraycopy(dest,0,result,0,destOffset); + } else + result = dest; + return result; + } + + private static Class skimArrayDetermineDestArrayClass(final T dest, Class sourceClazz) { + final Class destClazz; + if (dest == null) + destClazz = sourceClazz; + else { + destClazz = (Class) dest.getClass(); + if (destClazz != sourceClazz) { + if (!destClazz.isArray()) + throw new IllegalArgumentException("the destination array class must be an array"); + if (sourceClazz.getComponentType().isAssignableFrom(destClazz.getComponentType())) + throw new IllegalArgumentException("the provided destination array class cannot contain values from the source due to type incompatibility"); + } + } + return destClazz; + } + + /** + * Makes a deep clone of the array provided. + * + *

+ * When you can use {@link Arrays#copyOf} or an array {@link Object#clone()} to create a copy of itself, + * if it is multi-dimentional each sub array or matrix would be cloned. + *

+ * + *

+ * Notice however that if the base type is an Object type, the base elements themselves wont be cloned. + *

+ * + * @param array the array to deep-clone. + * @param type of the array. + * + * @throws IllegalArgumentException if {@code array} is {@code null} or is not an array. + */ + public static T deepCloneArray(final T array) { + + if (array == null) + throw new IllegalArgumentException(""); + + @SuppressWarnings("unchecked") + final Class clazz = (Class) array.getClass(); + + + if (!clazz.isArray()) + throw new IllegalArgumentException("the input is not an array"); + + final int dimension = calculateArrayDimensions(clazz); + + return deepCloneArrayUnchecked(array,clazz, dimension); + } + + private static int calculateArrayDimensions(final Class clazz) { + if (clazz.isArray()) + return calculateArrayDimensions(clazz.getComponentType()) + 1; + else + return 0; + } + + private static T deepCloneArrayUnchecked(final T array, final Class clazz, final int dimension) { + + + final int length = Array.getLength(array); + + final Class componentClass = clazz.getComponentType(); + + final T result = (T) Array.newInstance(componentClass,length); + + if (dimension <= 1) { + System.arraycopy(array, 0, result, 0, length); + return result; + } + + + final int dimensionMinus1 = dimension - 1; + + for (int i = 0; i < length; i++) + Array.set(result,i,deepCloneArrayUnchecked(Array.get(array,i),componentClass,dimensionMinus1)); + + return result; + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ValidationExclusion.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ValidationExclusion.java new file mode 100644 index 000000000..0dfce0e99 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/ValidationExclusion.java @@ -0,0 +1,71 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation 
the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils; + +import org.broadinstitute.gatk.utils.commandline.EnumerationArgumentDefault; + +import java.util.ArrayList; +import java.util.List; + + +public class ValidationExclusion { + // our validation options + + public enum TYPE { + ALLOW_N_CIGAR_READS, // ignore the presence of N operators in CIGARs: do not blow up and process reads that contain one or more N operators. + // This exclusion does not have effect on reads that get filtered {@see MalformedReadFilter}. + ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard + ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set + NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file + ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities + LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc. 
+ @EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL + ALL // do not check for all of the above conditions, DEFAULT + } + + // a storage for the passed in exclusions + List exclusions = new ArrayList(); + + public ValidationExclusion(List exclusionsList) { + exclusions.addAll(exclusionsList); + } + + public ValidationExclusion() {} + + /** + * do we contain the exclusion specified, or were we set to ALL + * @param t the exclusion case to test for + * @return true if we contain the exclusion or if we're set to ALL, false otherwise + */ + public boolean contains(TYPE t) { + return (exclusions.contains(TYPE.ALL) || exclusions.contains(t)); + } + + public static boolean lenientVCFProcessing(final TYPE val) { + return val == TYPE.ALL + || val == TYPE.LENIENT_VCF_PROCESSING; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegion.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionReadState.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionReadState.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionReadState.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionReadState.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfile.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfile.java 
similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfile.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfile.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileState.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileState.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileState.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileState.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfile.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfile.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfile.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfile.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcid.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcid.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcid.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcid.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcidTable.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcidTable.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcidTable.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcidTable.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcidUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcidUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcidUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/analysis/AminoAcidUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/baq/BAQ.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/classloader/JVMUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/classloader/JVMUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/classloader/JVMUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/classloader/JVMUtils.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/classloader/PluginManager.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/classloader/PluginManager.java new file mode 100644 index 000000000..4b69968e3 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/classloader/PluginManager.java @@ -0,0 +1,356 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights 
to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.classloader; + +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.reflections.Reflections; +import org.reflections.scanners.SubTypesScanner; +import org.reflections.util.ConfigurationBuilder; + +import java.io.File; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLClassLoader; +import java.util.*; + +/** + * Manage plugins and plugin configuration. + * @author mhanna + * @version 0.1 + */ +public class PluginManager { + /** + * A reference into our introspection utility. 
+ */ + private static final Reflections defaultReflections; + + static { + // turn off logging in the reflections library - they talk too much + Reflections.log = null; + + Set classPathUrls = new LinkedHashSet(); + + URL cwd; + try { + cwd = new File(".").getAbsoluteFile().toURI().toURL(); + } catch (MalformedURLException e) { + throw new RuntimeException(e); + } + + // NOTE: Reflections also scans directories for classes. + // Meanwhile some of the jar MANIFEST.MF Bundle-ClassPath properties contain "." + // Do NOT let reflections scan the CWD where it often picks up test classes when + // they weren't explicitly in the classpath, for example the UninstantiableWalker + for (URL url: JVMUtils.getClasspathURLs()) + if (!url.equals(cwd)) + classPathUrls.add(url); + + defaultReflections = new Reflections( new ConfigurationBuilder() + .setUrls(classPathUrls) + .setScanners(new SubTypesScanner())); + } + + /** + * Defines the category of plugin defined by the subclass. + */ + protected final String pluginCategory; + + /** + * Define common strings to trim off the end of the name. + */ + protected final String pluginSuffix; + + /** + * Plugins stored based on their name. + */ + private final SortedMap> pluginsByName; + + private final List> plugins; + private final List> interfaces; + + /** + * Create a new plugin manager. + * @param pluginType Core type for a plugin. + */ + public PluginManager(Class pluginType) { + this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), null); + } + + /** + * Create a new plugin manager. + * @param pluginType Core type for a plugin. + * @param classpath Custom class path to search for classes. + */ + public PluginManager(Class pluginType, List classpath) { + this(pluginType, pluginType.getSimpleName().toLowerCase(), pluginType.getSimpleName(), classpath); + } + + /** + * Create a new plugin manager. + * @param pluginType Core type for a plugin. 
+ * @param pluginCategory Provides a category name to the plugin. Must not be null. + * @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null. + */ + public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix) { + this(pluginType, pluginCategory, pluginSuffix, null); + } + + /** + * Create a new plugin manager. + * @param pluginType Core type for a plugin. + * @param pluginCategory Provides a category name to the plugin. Must not be null. + * @param pluginSuffix Provides a suffix that will be trimmed off when converting to a plugin name. Can be null. + * @param classpath Custom class path to search for classes. + */ + public PluginManager(Class pluginType, String pluginCategory, String pluginSuffix, List classpath) { + this.pluginCategory = pluginCategory; + this.pluginSuffix = pluginSuffix; + + this.plugins = new ArrayList>(); + this.interfaces = new ArrayList>(); + + Reflections reflections; + if (classpath == null) { + reflections = defaultReflections; + } else { + addClasspath(classpath); + reflections = new Reflections( new ConfigurationBuilder() + .setUrls(classpath) + .setScanners(new SubTypesScanner())); + } + + // Load all classes types filtering them by concrete. + @SuppressWarnings("unchecked") + Set> allTypes = reflections.getSubTypesOf(pluginType); + for( Class type: allTypes ) { + // The plugin manager does not support anonymous classes; to be a plugin, a class must have a name. 
+ if(JVMUtils.isAnonymous(type)) + continue; + + if( JVMUtils.isConcrete(type) ) + plugins.add(type); + else + interfaces.add(type); + } + + pluginsByName = new TreeMap>(); + for (Class pluginClass : plugins) { + String pluginName = getName(pluginClass); + pluginsByName.put(pluginName, pluginClass); + } + + // sort the plugins so the order of elements is deterministic + sortPlugins(plugins); + sortPlugins(interfaces); + } + + /** + * Sorts, in place, the list of plugins according to getName() on each element + * + * @param unsortedPlugins unsorted plugins + */ + private void sortPlugins(final List> unsortedPlugins) { + Collections.sort(unsortedPlugins, new ComparePluginsByName()); + } + + private final class ComparePluginsByName implements Comparator> { + @Override + public int compare(final Class aClass, final Class aClass1) { + String pluginName1 = getName(aClass); + String pluginName2 = getName(aClass1); + return pluginName1.compareTo(pluginName2); + } + } + + /** + * Adds the URL to the system class loader classpath using reflection. + * HACK: Uses reflection to modify the class path, and assumes loader is a URLClassLoader. + * @param urls URLs to add to the system class loader classpath. + */ + private static void addClasspath(List urls) { + Set existing = JVMUtils.getClasspathURLs(); + for (URL url : urls) { + if (existing.contains(url)) + continue; + try { + Method method = URLClassLoader.class.getDeclaredMethod("addURL", URL.class); + if (!method.isAccessible()) + method.setAccessible(true); + method.invoke(ClassLoader.getSystemClassLoader(), url); + } catch (Exception e) { + throw new ReviewedGATKException("Error adding url to the current classloader.", e); + } + } + } + + public Map> getPluginsByName() { + return Collections.unmodifiableMap(pluginsByName); + } + + /** + * Does a plugin with the given name exist? + * + * @param pluginName Name of the plugin for which to search. + * @return True if the plugin exists, false otherwise. 
+ */ + public boolean exists(String pluginName) { + return pluginsByName.containsKey(pluginName); + } + + /** + * Does a plugin with the given name exist? + * + * @param plugin Name of the plugin for which to search. + * @return True if the plugin exists, false otherwise. + */ + public boolean exists(Class plugin) { + return pluginsByName.containsValue(plugin); + } + + /** + * Returns the plugin classes + * @return the plugin classes + */ + public List> getPlugins() { + return plugins; + } + + /** + * Returns the interface classes + * @return the interface classes + */ + public List> getInterfaces() { + return interfaces; + } + + /** + * Returns the plugin classes implementing interface or base clase + * @param type type of interface or base class + * @return the plugin classes implementing interface or base class + */ + public List> getPluginsImplementing(Class type) { + List> implementing = new ArrayList>(); + for (Class plugin: getPlugins()) + if (type.isAssignableFrom(plugin)) + implementing.add(plugin); + return implementing; + } + + + + /** + * Gets a plugin with the given name + * + * @param pluginName Name of the plugin to retrieve. + * @return The plugin object if found; null otherwise. + */ + public PluginType createByName(String pluginName) { + Class plugin = pluginsByName.get(pluginName); + if( plugin == null ) { + String errorMessage = formatErrorMessage(pluginCategory,pluginName); + throw createMalformedArgumentException(errorMessage); + } + try { + return plugin.newInstance(); + } catch (Exception e) { + throw new DynamicClassResolutionException(plugin, e); + } + } + + /** + * create a plugin with the given type + * + * @param pluginType type of the plugin to create. + * @return The plugin object if created; null otherwise. 
+ */ + public PluginType createByType(Class pluginType) { + Logger logger = Logger.getLogger(PluginManager.class); + logger.setLevel(Level.ERROR); + try { + Constructor noArgsConstructor = pluginType.getDeclaredConstructor((Class[])null); + noArgsConstructor.setAccessible(true); + return noArgsConstructor.newInstance(); + } catch (Exception e) { + logger.error("Couldn't initialize the plugin. Typically this is because of wrong global class variable initializations."); + throw new DynamicClassResolutionException(pluginType, e); + } + } + + /** + * Returns concrete instances of the plugins + * @return concrete instances of the plugins + */ + public List createAllTypes() { + List instances = new ArrayList(); + for ( Class c : getPlugins() ) { + instances.add(createByType(c)); + } + return instances; + } + + /** + * Create a name for this type of plugin. + * + * @param pluginType The type of plugin. + * @return A name for this type of plugin. + */ + public String getName(Class pluginType) { + String pluginName = ""; + + if (pluginName.length() == 0) { + pluginName = pluginType.getSimpleName(); + if (pluginSuffix != null && pluginName.endsWith(pluginSuffix)) + pluginName = pluginName.substring(0, pluginName.lastIndexOf(pluginSuffix)); + } + + return pluginName; + } + + /** + * Generate the error message for the plugin manager. The message is allowed to depend on the class. + * @param pluginCategory - string, the category of the plugin (e.g. read filter) + * @param pluginName - string, what we were trying to match (but failed to) + * @return error message text describing the error + */ + protected String formatErrorMessage(String pluginCategory, String pluginName ) { + return String.format("Could not find %s with name: %s", pluginCategory,pluginName); + } + + /** + * Creates a UserException with the appropriate message for this instance. + * @param errorMessage formatted error message from formatErrorMessage(). + * @return A UserException with the error message. 
+ */ + protected UserException createMalformedArgumentException(final String errorMessage) { + throw new UserException.CommandLineException(errorMessage); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/classloader/ProtectedPackageSource.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/classloader/ProtectedPackageSource.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/classloader/ProtectedPackageSource.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/classloader/ProtectedPackageSource.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/classloader/PublicPackageSource.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/classloader/PublicPackageSource.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/classloader/PublicPackageSource.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/classloader/PublicPackageSource.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingOp.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingOp.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingOp.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingOp.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingRepresentation.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingRepresentation.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingRepresentation.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/clipping/ClippingRepresentation.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/clipping/ReadClipper.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/clipping/ReadClipper.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/clipping/ReadClipper.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/clipping/ReadClipper.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/beagle/BeagleCodec.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/beagle/BeagleCodec.java new file mode 100644 index 000000000..a8ac99def --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/beagle/BeagleCodec.java @@ -0,0 +1,276 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.codecs.beagle; +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +import htsjdk.tribble.AsciiFeatureCodec; +import htsjdk.tribble.exception.CodecLineParsingException; +import htsjdk.tribble.readers.LineIterator; +import org.broadinstitute.gatk.utils.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; + +import java.io.IOException; +import java.util.*; +import java.util.regex.Pattern; + +/** + * Codec for Beagle imputation engine + * + *

+ * Reads in tabular files with site markers and genotype posteriors, genotypes and phasing that Beagle produced + *

+ * + *

+ * See also: @see BEAGLE home page
+ *

+ + *

+ * + *

File format example for phased genotypes file

+ *
+ *     dummy header
+ *      20:60251 T T T T T T
+ *      20:60321 G G G G G G
+ *      20:60467 G G G G G G
+ * 
+ * + *

File format example for genotype posteriors

+ *
+ *     marker alleleA alleleB NA07056 NA07056 NA07056
+ *     20:60251 T C 0.9962 0.0038 0 0.99245 0.00755 0 0.99245 0.00755 0
+ *     20:60321 G T 0.98747 0.01253 0 0.99922 0.00078 0 0.99368 0.00632 0
+ *     20:60467 G C 0.97475 0.02525 0 0.98718 0.01282 0 0.98718 0.01282 0
+ * 
+ * + *

File format example for r2 file + *
+ *      20:60251        0.747
+ *      20:60321        0.763
+ *      20:60467        0.524
+ * 
+ *

+ * @author Mark DePristo + * @since 2010 + */ +public class BeagleCodec extends AsciiFeatureCodec implements ReferenceDependentFeatureCodec { + private String[] header; + public enum BeagleReaderType {PROBLIKELIHOOD, GENOTYPES, R2}; + private BeagleReaderType readerType; + private int valuesPerSample; + private int initialSampleIndex; + private int markerPosition; + private ArrayList sampleNames; + private int expectedTokensPerLine; + private final static Set HEADER_IDs = new HashSet(Arrays.asList("marker", "I")); + + private static final String delimiterRegex = "\\s+"; + + /** + * The parser to use when resolving genome-wide locations. + */ + private GenomeLocParser genomeLocParser; + + public BeagleCodec() { + super(BeagleFeature.class); + } + + /** + * Set the parser to use when resolving genetic data. + * @param genomeLocParser The supplied parser. + */ + public void setGenomeLocParser(GenomeLocParser genomeLocParser) { + this.genomeLocParser = genomeLocParser; + } + + @Override + public Object readActualHeader(LineIterator reader) { + int[] lineCounter = new int[1]; + try { + header = readHeader(reader, lineCounter); + + Boolean getSamples = true; + markerPosition = 0; //default value for all readers + + if (header[0].matches("I")) { + // Phased genotype Beagle files start with "I" + readerType = BeagleReaderType.GENOTYPES; + valuesPerSample = 2; + initialSampleIndex = 2; + markerPosition = 1; + } + else if (header[0].matches("marker")) { + readerType = BeagleReaderType.PROBLIKELIHOOD; + valuesPerSample = 3; + initialSampleIndex = 3; + } + else { + readerType = BeagleReaderType.R2; + getSamples = false; + // signal we don't have a header + lineCounter[0] = 0; + // not needed, but for consistency: + valuesPerSample = 0; + initialSampleIndex = 0; + } + + sampleNames = new ArrayList(); + + if (getSamples) { + for (int k = initialSampleIndex; k < header.length; k += valuesPerSample) + sampleNames.add(header[k]); + + expectedTokensPerLine = 
sampleNames.size()*valuesPerSample+initialSampleIndex; + + } else { + expectedTokensPerLine = 2; + } + + + } catch(IOException e) { + throw new IllegalArgumentException("Unable to read from file.", e); + } + return header; + } + + private static String[] readHeader(final LineIterator source, int[] lineCounter) throws IOException { + + String[] header = null; + int numLines = 0; + + //find the 1st line that's non-empty and not a comment + while(source.hasNext()) { + final String line = source.next(); + numLines++; + if ( line.trim().isEmpty() ) { + continue; + } + + //parse the header + header = line.split(delimiterRegex); + break; + } + + // check that we found the header + if ( header == null ) { + throw new IllegalArgumentException("No header in " + source); + } + + if(lineCounter != null) { + lineCounter[0] = numLines; + } + + return header; + } + + private static Pattern MARKER_PATTERN = Pattern.compile("(.+):([0-9]+)"); + + public BeagleFeature decode(String line) { + String[] tokens; + + // split the line + tokens = line.split(delimiterRegex); + if (tokens.length != expectedTokensPerLine) + throw new CodecLineParsingException("Incorrect number of fields in Beagle input on line "+line); + + if ( HEADER_IDs.contains(tokens[0]) ) + return null; + + BeagleFeature bglFeature = new BeagleFeature(); + + final GenomeLoc loc = genomeLocParser.parseGenomeLoc(tokens[markerPosition]); //GenomeLocParser.parseGenomeLoc(values.get(0)); - TODO switch to this + + //parse the location: common to all readers + bglFeature.setChr(loc.getContig()); + bglFeature.setStart((int) loc.getStart()); + bglFeature.setEnd((int) loc.getStop()); + + // Parse R2 if needed + if (readerType == BeagleReaderType.R2) { + bglFeature.setR2value(Double.valueOf(tokens[1])); + } + else if (readerType == BeagleReaderType.GENOTYPES) { + // read phased Genotype pairs + HashMap> sampleGenotypes = new HashMap>(); + + for ( int i = 2; i < tokens.length; i+=2 ) { + String sampleName = sampleNames.get(i/2-1); + 
if ( ! sampleGenotypes.containsKey(sampleName) ) { + sampleGenotypes.put(sampleName, new ArrayList()); + } + + sampleGenotypes.get(sampleName).add(tokens[i]); + sampleGenotypes.get(sampleName).add(tokens[i+1]); + } + + bglFeature.setGenotypes(sampleGenotypes); + } + else { + // read probabilities/likelihood trios and alleles + bglFeature.setAlleleA(tokens[1], true); + bglFeature.setAlleleB(tokens[2], false); + HashMap> sampleProbLikelihoods = new HashMap>(); + + for ( int i = 3; i < tokens.length; i+=3 ) { + String sampleName = sampleNames.get(i/3-1); + if ( ! sampleProbLikelihoods.containsKey(sampleName) ) { + sampleProbLikelihoods.put(sampleName, new ArrayList()); + } + + sampleProbLikelihoods.get(sampleName).add(tokens[i]); + sampleProbLikelihoods.get(sampleName).add(tokens[i+1]); + sampleProbLikelihoods.get(sampleName).add(tokens[i+2]); + } + bglFeature.setProbLikelihoods(sampleProbLikelihoods); + } + + return bglFeature; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/beagle/BeagleFeature.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/beagle/BeagleFeature.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/beagle/BeagleFeature.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/beagle/BeagleFeature.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/hapmap/RawHapMapCodec.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/hapmap/RawHapMapCodec.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/hapmap/RawHapMapCodec.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/hapmap/RawHapMapCodec.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/hapmap/RawHapMapFeature.java 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/hapmap/RawHapMapFeature.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/hapmap/RawHapMapFeature.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/hapmap/RawHapMapFeature.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqCodec.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqCodec.java new file mode 100644 index 000000000..379fba036 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqCodec.java @@ -0,0 +1,171 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.codecs.refseq; + +import htsjdk.tribble.AsciiFeatureCodec; +import htsjdk.tribble.Feature; +import htsjdk.tribble.TribbleException; +import htsjdk.tribble.readers.LineIterator; +import org.broadinstitute.gatk.utils.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.util.ArrayList; + +/** + * Allows for reading in RefSeq information + * + *

+ * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop, + * strandedness of transcription. + *

+ * + *

+ * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the documentation guide here + * http://www.broadinstitute.org/gatk/guide/article?id=1329 + *

+ *

Usage

+ * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example + *
+ * -refSeqBinding:REFSEQ /path/to/refSeq.txt
+ * 
+ * + * You will need to consult individual walkers for the binding name ("refSeqBinding", above) + * + *

File format example

+ * If you want to define your own file for use, the format is (tab delimited): + * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete) + * and exon frames, for example: + *
+ * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
+ * 
+ * for more information see here + *

+ * + *

+ * + * @author Mark DePristo + * @since 2010 + */ +public class RefSeqCodec extends AsciiFeatureCodec implements ReferenceDependentFeatureCodec { + + /** + * The parser to use when resolving genome-wide locations. + */ + private GenomeLocParser genomeLocParser; + private boolean zero_coding_length_user_warned = false; + + public RefSeqCodec() { + super(RefSeqFeature.class); + } + + /** + * Set the parser to use when resolving genetic data. + * @param genomeLocParser The supplied parser. + */ + @Override + public void setGenomeLocParser(GenomeLocParser genomeLocParser) { + this.genomeLocParser = genomeLocParser; + } + + @Override + public Feature decodeLoc(final LineIterator lineIterator) { + final String line = lineIterator.next(); + if (line.startsWith("#")) return null; + String fields[] = line.split("\t"); + if (fields.length < 3) throw new TribbleException("RefSeq (decodeLoc) : Unable to parse line -> " + line + ", we expected at least 3 columns, we saw " + fields.length); + String contig_name = fields[2]; + try { + return new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + } catch ( UserException.MalformedGenomeLoc e ) { + Utils.warnUser("RefSeq file is potentially incorrect, as some transcripts or exons have a negative length ("+fields[2]+")"); + return null; + } catch ( NumberFormatException e ) { + throw new UserException.MalformedFile("Could not parse location from line: " + line); + } + } + + /** Fills this object from a text line in RefSeq (UCSC) text dump file */ + @Override + public RefSeqFeature decode(String line) { + if (line.startsWith("#")) return null; + String fields[] = line.split("\t"); + + // we reference postion 15 in the split array below, make sure we have at least that many columns + if (fields.length < 16) throw new TribbleException("RefSeq (decode) : Unable to parse line -> " + line + ", we expected at least 16 columns, we saw " + fields.length); + String 
contig_name = fields[2]; + RefSeqFeature feature = new RefSeqFeature(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + + feature.setTranscript_id(fields[1]); + if ( fields[3].length()==1 && fields[3].charAt(0)=='+') feature.setStrand(1); + else if ( fields[3].length()==1 && fields[3].charAt(0)=='-') feature.setStrand(-1); + else throw new UserException.MalformedFile("Expected strand symbol (+/-), found: "+fields[3] + " for line=" + line); + + int coding_start = Integer.parseInt(fields[6])+1; + int coding_stop = Integer.parseInt(fields[7]); + + if ( coding_start > coding_stop ) { + if ( ! zero_coding_length_user_warned ) { + Utils.warnUser("RefSeq file contains transcripts with zero coding length. "+ + "Such transcripts will be ignored (this warning is printed only once)"); + zero_coding_length_user_warned = true; + } + return null; + } + + feature.setTranscript_interval(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(fields[4])+1, Integer.parseInt(fields[5]))); + feature.setTranscript_coding_interval(genomeLocParser.createGenomeLoc(contig_name, coding_start, coding_stop)); + feature.setGene_name(fields[12]); + String[] exon_starts = fields[9].split(","); + String[] exon_stops = fields[10].split(","); + String[] eframes = fields[15].split(","); + + if ( exon_starts.length != exon_stops.length ) + throw new UserException.MalformedFile("Data format error: numbers of exon start and stop positions differ for line=" + line); + if ( exon_starts.length != eframes.length ) + throw new UserException.MalformedFile("Data format error: numbers of exons and exon frameshifts differ for line=" + line); + + ArrayList exons = new ArrayList(exon_starts.length); + ArrayList exon_frames = new ArrayList(eframes.length); + + for ( int i = 0 ; i < exon_starts.length ; i++ ) { + exons.add(genomeLocParser.createGenomeLoc(contig_name, Integer.parseInt(exon_starts[i])+1, Integer.parseInt(exon_stops[i]) ) ); + 
exon_frames.add(Integer.decode(eframes[i])); + } + + feature.setExons(exons); + feature.setExon_frames(exon_frames); + return feature; + } + + @Override + public Object readActualHeader(LineIterator lineIterator) { + // No header for this format + return null; + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqFeature.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqFeature.java new file mode 100644 index 000000000..97ff961d9 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/RefSeqFeature.java @@ -0,0 +1,323 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.codecs.refseq; + +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.*; + +/** + * the ref seq feature + */ +public class RefSeqFeature implements Transcript, Feature { + + private String transcript_id; + private int strand; + private GenomeLoc transcript_interval; + private GenomeLoc transcript_coding_interval; + private List exons; + private String gene_name; + private List exon_frames; + private String name; + + public RefSeqFeature(GenomeLoc genomeLoc) { + this.transcript_interval = genomeLoc; + } + + /** Returns id of the transcript (RefSeq NM_* id) */ + public String getTranscriptId() { return transcript_id; } + + /** Returns coding strand of the transcript, 1 or -1 for positive or negative strand, respectively */ + public int getStrand() { return strand; } + + /** Returns transcript's full genomic interval (includes all exons with UTRs) */ + public GenomeLoc getLocation() { + return transcript_interval; + } + + /** Returns genomic interval of the coding sequence (does not include UTRs, but still includes introns, since it's a single interval on the DNA) */ + public GenomeLoc getCodingLocation() { return transcript_coding_interval; } + + /** Name of the gene this transcript corresponds to (NOT gene id such as Entrez etc) */ + public String getGeneName() { return gene_name; } + + /** Number of exons in this transcript */ + public int getNumExons() { return exons.size(); } + + /** Genomic location of the n-th exon; throws an exception if n is out of bounds */ + public GenomeLoc getExonLocation(int n) { + if ( n >= exons.size() || n < 0 ) throw new ReviewedGATKException("Index out-of-bounds. 
Transcript has " + exons.size() +" exons; requested: "+n); + return exons.get(n); + } + + /** Returns the list of all exons in this transcript, as genomic intervals */ + public List getExons() { return exons; } + + /** Returns all exons falling ::entirely:: inside an interval **/ + public List getExonsInInterval( GenomeLoc interval ) { + List relevantExons = new ArrayList(exons.size()); + for ( GenomeLoc exon : getExons() ) { + if ( interval.containsP(exon) ) { + relevantExons.add(exon); + } + } + + return relevantExons; + } + + /** convenience method; returns the numbers of the exons in the interval **/ + public List getExonNumbersInInterval( GenomeLoc interval ) { + List numbers = new ArrayList(); + int iNo = 0; + for ( GenomeLoc exon : getExons() ) { + if ( interval.containsP(exon) ) { + numbers.add(iNo); + } + iNo++; + } + + return numbers; + } + + public String getTranscriptUniqueGeneName() { + return String.format("%s(%s)",getGeneName(),getTranscriptId()); + } + + public String getOverlapString(GenomeLoc position) { + boolean is_exon = false; + StringBuilder overlapString = new StringBuilder(); + int exonNo = 1; + + for ( GenomeLoc exon : exons ) { + if ( exon.containsP(position) ) { + overlapString.append(String.format("exon_%d",exonNo)); + is_exon = true; + break; + } + exonNo ++; + } + + if ( ! is_exon ) { + if ( overlapsCodingP(position) ) { + overlapString.append("Intron"); + } else { + overlapString.append("UTR"); + } + } + + return overlapString.toString(); + } + + ArrayList exonInRefOrderCache = null; + + public Integer getSortedOverlapInteger(GenomeLoc position) { + int exonNo = -1; + ArrayList exonsInReferenceOrder = exonInRefOrderCache != null ? 
exonInRefOrderCache : new ArrayList(exons); + if ( exonInRefOrderCache == null ) { + Collections.sort(exonsInReferenceOrder); + } + exonInRefOrderCache = exonsInReferenceOrder; + for ( GenomeLoc exon : exonsInReferenceOrder ) { + if ( exon.overlapsP(position) ) { + return ++exonNo; + } + ++exonNo; + } + + return -1; + } + + public GenomeLoc getSortedExonLoc(int offset) { + ArrayList exonsInReferenceOrder = exonInRefOrderCache != null ? exonInRefOrderCache : new ArrayList(exons); + if ( exonInRefOrderCache == null ) { + Collections.sort(exonsInReferenceOrder); + } + exonInRefOrderCache = exonsInReferenceOrder; + return exonsInReferenceOrder.get(offset); + } + + /** Returns true if the specified interval 'that' overlaps with the full genomic interval of this transcript */ + public boolean overlapsP (GenomeLoc that) { + return getLocation().overlapsP(that); + } + + /** Returns true if the specified interval 'that' overlaps with the coding genomic interval of this transcript. + * NOTE: since "coding interval" is still a single genomic interval, it will not contain UTRs of the outermost exons, + * but it will still contain introns and/or exons internal to this genomic locus that are not spliced into this transcript. 
+ * @see #overlapsExonP + */ + public boolean overlapsCodingP (GenomeLoc that) { + return transcript_coding_interval.overlapsP(that); + } + + /** Returns true if the specified interval 'that' overlaps with any of the exons actually spliced into this transcript */ + public boolean overlapsExonP (GenomeLoc that) { + for ( GenomeLoc e : exons ) { + if ( e.overlapsP(that) ) return true; + } + return false; + } + public String toString() { + StringBuilder b = new StringBuilder("000\t"); // first field is unused but required in th ecurrent format; just set to something + b.append(transcript_id); // #1 + b.append('\t'); + b.append(getLocation().getContig()); // #2 + b.append('\t'); + b.append( (strand==1?'+':'-') ); // #3 + b.append('\t'); + b.append( (getLocation().getStart() - 1) ); // #4 + b.append('\t'); + b.append( getLocation().getStop()); // #5 + b.append('\t'); + b.append( (transcript_coding_interval.getStart() - 1) ); // #6 + b.append('\t'); + b.append( transcript_coding_interval.getStop()); // #7 + b.append('\t'); + b.append(exons.size()); // #8 + b.append('\t'); + for ( GenomeLoc loc : exons ) { b.append( (loc.getStart()-1) ); b.append(','); } // #9 + b.append('\t'); + for ( GenomeLoc loc : exons ) { b.append( loc.getStop() ); b.append(','); } // #10 + b.append("\t0\t"); // # 11 - unused? + b.append(gene_name); // # 12 + b.append("\tcmpl\tcmpl\t"); // #13, #14 - unused? + for ( Integer f : exon_frames ) { b.append( f ); b.append(','); } // #15 + + + return b.toString(); + } + + /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to + * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether + * this position is fully whithin an exon of any of those transcripts. Passing null is safe (will return false). 
+ * NOTE: position can be still within a UTR, see #isCoding + * @return true if it's an exon + */ + public static boolean isExon(RODRecordList l) { + + if ( l == null ) return false; + + GenomeLoc loc = l.getLocation(); + + for ( GATKFeature t : l ) { + if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsExonP(loc) ) return true; + } + return false; + + } + + /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to + * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether + * this position is fully whithin a coding region of any of those transcripts. + * Passing null is safe (will return false). + * NOTE: "coding" interval is defined as a single genomic interval, so it + * does not include the UTRs of the outermost exons, but it includes introns between exons spliced into a + * transcript, or internal exons that are not spliced into a given transcript. To check that a position is + * indeed within an exon but not in UTR, use #isCodingExon(). + * @return + */ + public static boolean isCoding(RODRecordList l) { + + if ( l == null ) return false; + + GenomeLoc loc = l.getLocation(); + + for ( GATKFeature t : l ) { + if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsCodingP(loc) ) return true; + } + return false; + + } + + /** Convenience method, which is packaged here for a lack of better place; it is indeed closely related to + * rodRefSeq though: takes list of rods (transcripts) overlapping with a given position and determines whether + * this position is fully whithin a coding exon portion (i.e. true coding sequence) of any of those transcripts. + * Passing null is safe (will return false). In other words, this method returns true if the list contains a transcript, + * for which the current position is within an exon and within a coding interval simultaneously. 
+ * @return + */ + public static boolean isCodingExon(RODRecordList l) { + + if ( l == null ) return false; + + GenomeLoc loc = l.getLocation(); + + for ( GATKFeature t : l ) { + if ( ((RefSeqFeature)t.getUnderlyingObject()).overlapsCodingP(loc) && ((RefSeqFeature)t.getUnderlyingObject()).overlapsExonP(loc) ) return true; + } + return false; + + } + + + public void setTranscript_id(String transcript_id) { + this.transcript_id = transcript_id; + } + + public void setStrand(int strand) { + this.strand = strand; + } + + public void setTranscript_interval(GenomeLoc transcript_interval) { + this.transcript_interval = transcript_interval; + } + + public void setTranscript_coding_interval(GenomeLoc transcript_coding_interval) { + this.transcript_coding_interval = transcript_coding_interval; + } + + public void setExons(List exons) { + this.exons = exons; + } + + public void setGene_name(String gene_name) { + this.gene_name = gene_name; + } + + public void setExon_frames(List exon_frames) { + this.exon_frames = exon_frames; + } + + public void setName(String name) { + this.name = name; + } + + public String getChr() { + return transcript_interval.getContig(); + } + + public int getStart() { + return transcript_interval.getStart(); + } + + public int getEnd() { + return transcript_interval.getStop(); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/Transcript.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/Transcript.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/Transcript.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/refseq/Transcript.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/sampileup/SAMPileupCodec.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/sampileup/SAMPileupCodec.java similarity index 
100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/sampileup/SAMPileupCodec.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/sampileup/SAMPileupCodec.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/sampileup/SAMPileupFeature.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/sampileup/SAMPileupFeature.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/sampileup/SAMPileupFeature.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/sampileup/SAMPileupFeature.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/samread/SAMReadCodec.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/samread/SAMReadCodec.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/samread/SAMReadCodec.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/samread/SAMReadCodec.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/samread/SAMReadFeature.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/samread/SAMReadFeature.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/samread/SAMReadFeature.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/samread/SAMReadFeature.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/table/BedTableCodec.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/table/BedTableCodec.java new file mode 100644 index 000000000..cf6cefeb8 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/table/BedTableCodec.java @@ -0,0 
+1,58 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.codecs.table; + +import org.broadinstitute.gatk.utils.refdata.ReferenceDependentFeatureCodec; + +import java.util.Arrays; + +/** + * The standard table codec that expects loci as contig start stop, not contig:start-stop + * + *

+ * The standard table codec with a slightly different parsing convention + * (expects loci as contig start stop, not contig:start-stop) + *

+ * + *

+ * See also: TableCodec + *

+ * + * @author Chris Hartl + * @since 2010 + */ +public class BedTableCodec extends TableCodec implements ReferenceDependentFeatureCodec { + + @Override + public TableFeature decode(String line) { + if (line.startsWith(headerDelimiter) || line.startsWith(commentDelimiter) || line.startsWith(igvHeaderDelimiter)) + return null; + String[] split = line.split(delimiterRegex); + if (split.length < 1) + throw new IllegalArgumentException("TableCodec line = " + line + " doesn't appear to be a valid table format"); + return new TableFeature(genomeLocParser.createGenomeLoc(split[0],Integer.parseInt(split[1])-1,Integer.parseInt(split[2])), Arrays.asList(split),header); + } +} \ No newline at end of file diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/table/TableCodec.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/table/TableCodec.java new file mode 100644 index 000000000..09af2f7a5 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/table/TableCodec.java @@ -0,0 +1,126 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.codecs.table; + +import htsjdk.tribble.AsciiFeatureCodec; +import htsjdk.tribble.readers.LineIterator; +import org.broadinstitute.gatk.utils.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + +/** + * Reads tab deliminated tabular text files + * + *

+ *

    + *
  • Header: must begin with line HEADER or track (for IGV), followed by any number of column names, + * separated by whitespace.
  • + *
  • Comment lines starting with # are ignored
  • + *
  • Each non-header and non-comment line is split into parts by whitespace, + * and these parts are assigned as a map to their corresponding column name in the header. + * Note that the first element (corresponding to the HEADER column) must be a valid genome loc + * such as 1, 1:1 or 1:1-10, which is the position of the Table element on the genome. TableCodec + * requires that there be one value for each column in the header, and no more, on all lines.
  • + *
+ *

+ * + *

+ * + *

File format example

+ *
+ *     HEADER a b c
+ *     1:1  1   2   3
+ *     1:2  4   5   6
+ *     1:3  7   8   9
+ * 
+ * + * @author Mark DePristo + * @since 2009 + */ +public class TableCodec extends AsciiFeatureCodec implements ReferenceDependentFeatureCodec { + final static protected String delimiterRegex = "\\s+"; + final static protected String headerDelimiter = "HEADER"; + final static protected String igvHeaderDelimiter = "track"; + final static protected String commentDelimiter = "#"; + + protected ArrayList header = new ArrayList(); + + /** + * The parser to use when resolving genome-wide locations. + */ + protected GenomeLocParser genomeLocParser; + + public TableCodec() { + super(TableFeature.class); + } + + /** + * Set the parser to use when resolving genetic data. + * @param genomeLocParser The supplied parser. + */ + @Override + public void setGenomeLocParser(GenomeLocParser genomeLocParser) { + this.genomeLocParser = genomeLocParser; + } + + @Override + public TableFeature decode(String line) { + if (line.startsWith(headerDelimiter) || line.startsWith(commentDelimiter) || line.startsWith(igvHeaderDelimiter)) + return null; + String[] split = line.split(delimiterRegex); + if (split.length < 1) + throw new IllegalArgumentException("TableCodec line = " + line + " doesn't appear to be a valid table format"); + return new TableFeature(genomeLocParser.parseGenomeLoc(split[0]),Arrays.asList(split), header); + } + + @Override + public Object readActualHeader(final LineIterator reader) { + boolean isFirst = true; + while (reader.hasNext()) { + final String line = reader.peek(); // Peek to avoid reading non-header data + if ( isFirst && ! line.startsWith(headerDelimiter) && ! line.startsWith(commentDelimiter)) { + throw new UserException.MalformedFile("TableCodec file does not have a header"); + } + isFirst &= line.startsWith(commentDelimiter); + if (line.startsWith(headerDelimiter)) { + reader.next(); // "Commit" the peek + if (header.size() > 0) throw new IllegalStateException("Input table file seems to have two header lines. 
The second is = " + line); + final String spl[] = line.split(delimiterRegex); + Collections.addAll(header, spl); + return header; + } else if (line.startsWith(commentDelimiter)) { + reader.next(); // "Commit" the peek + } else { + break; + } + } + return header; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/table/TableFeature.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/table/TableFeature.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/codecs/table/TableFeature.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/codecs/table/TableFeature.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/DefaultHashMap.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/DefaultHashMap.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/DefaultHashMap.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/DefaultHashMap.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/ExpandingArrayList.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/ExpandingArrayList.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/ExpandingArrayList.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/ExpandingArrayList.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/IndexedSet.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/IndexedSet.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/IndexedSet.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/IndexedSet.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/LoggingNestedIntegerArray.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/LoggingNestedIntegerArray.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/LoggingNestedIntegerArray.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/LoggingNestedIntegerArray.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/NestedIntegerArray.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/NestedIntegerArray.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/NestedIntegerArray.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/NestedIntegerArray.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/Pair.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/Pair.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/Pair.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/Pair.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/Permutation.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/Permutation.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/Permutation.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/Permutation.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/PrimitivePair.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/PrimitivePair.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/collections/PrimitivePair.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/PrimitivePair.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/RODMergingIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/RODMergingIterator.java new file mode 100644 index 000000000..e5c7fad4e --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/collections/RODMergingIterator.java @@ -0,0 +1,160 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.collections; + +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.Collection; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.PriorityQueue; + +public class RODMergingIterator implements Iterator, Iterable { + PriorityQueue queue = new PriorityQueue(); + + private class Element implements Comparable { + public LocationAwareSeekableRODIterator it = null; + public GenomeLoc nextLoc = null; + + public Element(Iterator it) { + if ( it instanceof LocationAwareSeekableRODIterator) { + this.it = (LocationAwareSeekableRODIterator)it; + if ( ! it.hasNext() ) throw new ReviewedGATKException("Iterator is empty"); + update(); + } else { + throw new ReviewedGATKException("Iterator passed to RODMergingIterator is not LocationAwareSeekableRODIterator"); + } + } + + public Element update() { + // E prev = value; + nextLoc = it.peekNextLocation(); // will return null if there is no next location + return this; + } + + public int compareTo(Element other) { + if ( nextLoc == null ) { + if ( other.nextLoc != null ) return 1; // null means no more data available, so its after any non-null position + return 0; + } + if ( other.nextLoc == null ) return -1; // we can get to this point only if this.nextLoc != null + + return nextLoc.compareTo(other.nextLoc); + } + + public RODRecordList next() { + RODRecordList value = it.next(); + update(); + return value; + } + } + + public Iterator iterator() { + return this; + } + + public RODMergingIterator() { + ; + } + + public RODMergingIterator(Iterator it) { + add(it); + } + + public RODMergingIterator(Collection> its) { + for ( Iterator it : its ) { + add(it); + } + } + + /** If the iterator is non-empty (hasNext() is true), put it 
into the queue. The next location the iterator + * will be after a call to next() is peeked into and cached as queue's priority value. + * @param it + */ + public void add(Iterator it) { + if ( it.hasNext() ) + queue.add(new Element(it)); + } + + public boolean hasNext() { + return ! queue.isEmpty(); + } + + public RODRecordList next() { + Element e = queue.poll(); + RODRecordList value = e.next(); // next() will also update next location cached by the Element + + if ( e.nextLoc != null ) // we have more data in the track + queue.add(e); // add the element back to queue (note: its next location, on which priority is based, was updated + + //System.out.printf("Element is %s%n", e.value); + return value; + } + + /** Peeks into the genomic location of the record this iterator will return next. + * + * @return + */ + public GenomeLoc peekLocation() { + return queue.peek().nextLoc; + } + + public Collection allElementsLTE(RODRecordList elt) { + return allElementsLTE(elt, true); + } + + public Collection allElementsLTE(RODRecordList elt, boolean includeElt) { + LinkedList all = new LinkedList(); + + if ( includeElt ) all.add(elt); + + while ( hasNext() ) { + Element x = queue.peek(); + //System.out.printf("elt.compareTo(x) == %d%n", elt.compareTo(x)); + //System.out.printf("In allElementLTE%n"); + int cmp = elt.getLocation().compareTo(x.nextLoc); + //System.out.printf("x=%s%n elt=%s%n => elt.compareTo(x) == %d%n", x, elt, cmp); + if ( cmp >= 0 ) { + //System.out.printf(" Adding element x=%s, size = %d%n", x, all.size()); + all.add(next()); + //System.out.printf(" Added size = %d%n", all.size()); + } + else { + //System.out.printf("breaking...%n"); + break; + } + } + + return all; + } + + public void remove() { + throw new UnsupportedOperationException(); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Advanced.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Advanced.java similarity 
index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Advanced.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Advanced.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Argument.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Argument.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Argument.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Argument.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentCollection.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentCollection.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentCollection.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentCollection.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinition.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinition.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinition.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinition.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinitionGroup.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinitionGroup.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinitionGroup.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinitionGroup.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinitions.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinitions.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinitions.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentDefinitions.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentException.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentException.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentException.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentException.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentIOType.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentIOType.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentIOType.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentIOType.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatch.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatch.java new file mode 100644 index 000000000..e372b4e02 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatch.java @@ -0,0 +1,292 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of 
this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.commandline; + +import java.util.*; + +/** + * A mapping of all the sites where an argument definition maps to a site on the command line. + */ +public class ArgumentMatch implements Iterable { + /** + * The argument definition that's been matched. + */ + public final ArgumentDefinition definition; + + /** + * The text that's been matched, as it appears in the command line arguments. + */ + public final String label; + + /** + * Maps indices of command line arguments to values paired with that argument. + */ + public final SortedMap> sites = new TreeMap>(); + + /** + * An ordered, freeform collection of tags. + */ + public final Tags tags; + + /** + * Create a new argument match, defining its properties later. Used to create invalid arguments. + */ + public ArgumentMatch() { + this(null,null); + } + + /** + * Minimal constructor for transform function. + * @param label Label of the argument match. Must not be null. 
+ * @param definition The associated definition, if one exists. May be null. + */ + private ArgumentMatch(final String label, final ArgumentDefinition definition) { + this.label = label; + this.definition = definition; + this.tags = new Tags(); + } + + /** + * A simple way of indicating that an argument with the given label and definition exists at this site. + * @param label Label of the argument match. Must not be null. + * @param definition The associated definition, if one exists. May be null. + * @param site Position of the argument. Must not be null. + * @param tags ordered freeform text tags associated with this argument. + */ + public ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final Tags tags) { + this( label, definition, site, null, tags ); + } + + /** + * A simple way of indicating that an argument with the given label and definition exists at this site. + * @param label Label of the argument match. Must not be null. + * @param definition The associated definition, if one exists. May be null. + * @param site Position of the argument. Must not be null. + * @param value Value for the argument at this position. + * @param tags ordered freeform text tags associated with this argument. + */ + private ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final ArgumentMatchValue value, final Tags tags) { + this.label = label; + this.definition = definition; + + ArrayList values = new ArrayList(); + if( value != null ) + values.add(value); + sites.put(site,values ); + + this.tags = tags; + } + + /** + * Check to see whether two ArgumentMatch objects are equal. + * @param other Object to which this should be compared. + * @return True if objects are equal, false if objects are not equal or incomparable. + */ + @Override + public boolean equals(Object other) { + // this clearly isn't null, since this.equals() when this == null would result in an NPE. 
+ if(other == null) + return false; + if(!(other instanceof ArgumentMatch)) + return false; + ArgumentMatch otherArgumentMatch = (ArgumentMatch)other; + return this.definition.equals(otherArgumentMatch.definition) && + this.label.equals(otherArgumentMatch.label) && + this.sites.equals(otherArgumentMatch.sites) && + this.tags.equals(otherArgumentMatch.tags); + } + + + /** + * Reformat the given entries with the given multiplexer and key. + * TODO: Generify this. + * @param multiplexer Multiplexer that controls the transformation process. + * @param key Key which specifies the transform. + * @return A variant of this ArgumentMatch with all keys transformed. + */ + @SuppressWarnings("unchecked") + ArgumentMatch transform(Multiplexer multiplexer, Object key) { + SortedMap> newIndices = new TreeMap>(); + for(Map.Entry> site: sites.entrySet()) { + List newEntries = new ArrayList(); + for(ArgumentMatchValue entry: site.getValue()) + newEntries.add(new ArgumentMatchStringValue(multiplexer.transformArgument(key,entry.asString()))); + newIndices.put(site.getKey(),newEntries); + } + ArgumentMatch newArgumentMatch = new ArgumentMatch(label,definition); + newArgumentMatch.sites.putAll(newIndices); + return newArgumentMatch; + } + + /** + * Return a string representation of the given argument match, for debugging purposes. + * @return String representation of the match. + */ + public String toString() { + return label; + } + + /** + * Creates an iterator that walks over each individual match at each position of a given argument. + * @return An iterator over the individual matches in this argument. Will not be null. + */ + public Iterator iterator() { + return new Iterator() { + /** + * Iterate over each the available site. + */ + private Iterator siteIterator = null; + + /** + * Iterate over each available token. + */ + private Iterator tokenIterator = null; + + /** + * The next site to return. Null if none remain. 
+ */ + ArgumentMatchSite nextSite = null; + + /** + * The next token to return. Null if none remain. + */ + ArgumentMatchValue nextToken = null; + + { + siteIterator = sites.keySet().iterator(); + prepareNext(); + } + + /** + * Is there a nextToken available to return? + * @return True if there's another token waiting in the wings. False otherwise. + */ + public boolean hasNext() { + return nextSite != null; + } + + /** + * Get the next token, if one exists. If not, throw an IllegalStateException. + * @return The next ArgumentMatch in the series. Should never be null. + */ + public ArgumentMatch next() { + if( nextSite == null ) + throw new IllegalStateException( "No more ArgumentMatches are available" ); + + ArgumentMatch match = new ArgumentMatch( label, definition, nextSite, nextToken, tags ); + prepareNext(); + return match; + } + + /** + * Initialize the next ArgumentMatch to return. If no ArgumentMatches are available, + * initialize nextSite / nextToken to null. + */ + private void prepareNext() { + if( tokenIterator != null && tokenIterator.hasNext() ) { + nextToken = tokenIterator.next(); + } + else { + nextSite = null; + nextToken = null; + + // Do a nested loop. While more data is present in the inner loop, grab that data. + // Otherwise, troll the outer iterator looking for more data. + while( siteIterator.hasNext() ) { + nextSite = siteIterator.next(); + if( sites.get(nextSite) != null ) { + tokenIterator = sites.get(nextSite).iterator(); + nextToken = tokenIterator.hasNext() ? tokenIterator.next() : null; + break; + } + } + } + + } + + /** + * Remove is unsupported in this context. + */ + public void remove() { + throw new UnsupportedOperationException("Cannot remove an argument match from the collection while iterating."); + } + }; + } + + /** + * Merge two ArgumentMatches, so that the values for all arguments go into the + * same data structure. + * @param other The other match to merge into. 
+ */ + public void mergeInto( ArgumentMatch other ) { + sites.putAll(other.sites); + } + + /** + * Associate a value with this merge maapping. + * @param site site of the command-line argument to which this value is mated. + * @param value Text representation of value to add. + */ + public void addValue( ArgumentMatchSite site, ArgumentMatchValue value ) { + if( !sites.containsKey(site) || sites.get(site) == null ) + sites.put(site, new ArrayList() ); + sites.get(site).add(value); + } + + /** + * Does this argument already have a value at the given site? + * Arguments are only allowed to be single-valued per site, and + * flags aren't allowed a value at all. + * @param site Site at which to check for values. + * @return True if the argument has a value at the given site. False otherwise. + */ + public boolean hasValueAtSite( ArgumentMatchSite site ) { + return (sites.get(site) != null && sites.get(site).size() >= 1) || isArgumentFlag(); + } + + /** + * Return the values associated with this argument match. + * @return A collection of the string representation of these value. + */ + public List values() { + final List values = new ArrayList(); + for ( final List siteValue : sites.values() ) { + if ( siteValue != null ) + values.addAll(siteValue); + else + values.add(null); + } + return values; + } + + /** + * Convenience method returning true if the definition is a flag. + * @return True if definition is known to be a flag; false if not known to be a flag. 
+ */ + private boolean isArgumentFlag() { + return definition != null && definition.isFlag; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchFileValue.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchFileValue.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchFileValue.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchFileValue.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSite.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSite.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSite.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSite.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSource.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSource.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSource.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSource.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSourceType.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSourceType.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSourceType.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSourceType.java diff 
--git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchStringValue.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchStringValue.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchStringValue.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchStringValue.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchValue.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchValue.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchValue.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchValue.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatches.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatches.java new file mode 100644 index 000000000..e58d8888f --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatches.java @@ -0,0 +1,209 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.commandline; + +import java.util.*; +/** + * Represents a list of potential matches between the arguments defined + * by the argument sources and the arguments passed in via the command line. + */ +public class ArgumentMatches implements Iterable { + /** + * Collection matches from argument definition to argument value. + * Package protected access is deliberate. + */ + Map argumentMatches = new TreeMap(); + + /** + * Provide a place to put command-line argument values that don't seem to belong to + * any particular command-line option. + */ + ArgumentMatch MissingArgument = new ArgumentMatch(); + + /** + * Get an iterator cycling through *unique* command-line argument <-> definition matches. + * @return Iterator over all argument matches. + */ + public Iterator iterator() { + return getUniqueMatches().iterator(); + } + + /** + * Create an empty ArgumentMatches object. + */ + public ArgumentMatches() { + } + + /** + * Create a singleton ArgumentMatches object. + * @param match Match to incorporate. + */ + public ArgumentMatches( ArgumentMatch match ) { + mergeInto( match ); + } + + /** + * Returns the number of matches in this structure. + * @return Count of the matches in this structure. + */ + public int size() { + return argumentMatches.size(); + } + + /** + * Indicates whether the site contains a matched argument. + * @param site Site at which to check. + * @return True if the site has a match. False otherwise. 
+ */ + boolean hasMatch( ArgumentMatchSite site ) { + return argumentMatches.containsKey( site ); + } + + /** + * Gets the match at a given site. + * @param site Site at which to look for a match. + * @return The match present at the given site. + * @throws IllegalArgumentException if site does not contain a match. + */ + ArgumentMatch getMatch( ArgumentMatchSite site ) { + if( !argumentMatches.containsKey(site) ) + throw new IllegalArgumentException( "Site does not contain an argument: " + site ); + return argumentMatches.get(site); + } + + /** + * Does the match collection have a match for this argument definition. + * @param definition Definition to match. + * @return True if a match exists; false otherwise. + */ + boolean hasMatch( ArgumentDefinition definition ) { + return findMatches( definition ).size() > 0; + } + + /** + * Return all argument matches of this source. + * @param parsingEngine Parsing engine. + * @param argumentSource Argument source to match. + * @return List of all matches. + */ + + ArgumentMatches findMatches(ParsingEngine parsingEngine, ArgumentSource argumentSource) { + List sourceDefinitions = parsingEngine.selectBestTypeDescriptor(argumentSource.field.getType()).createArgumentDefinitions(argumentSource); + + ArgumentMatches matches = new ArgumentMatches(); + for( ArgumentMatch argumentMatch: getUniqueMatches() ) { + if( sourceDefinitions.contains(argumentMatch.definition) ) + matches.mergeInto( argumentMatch ); + } + return matches; + } + + /** + * Return all argument matches of this definition. + * @param definition Argument definition to match. + * @return List of all matches. 
+ */ + ArgumentMatches findMatches( ArgumentDefinition definition ) { + ArgumentMatches matches = new ArgumentMatches(); + for( ArgumentMatch argumentMatch: argumentMatches.values() ) { + if( argumentMatch.definition == definition ) + matches.mergeInto( argumentMatch ); + } + return matches; + } + + /** + * Find all successful matches (a 'successful' match is one paired with a definition). + * @return All successful matches. + */ + ArgumentMatches findSuccessfulMatches() { + ArgumentMatches matches = new ArgumentMatches(); + for( ArgumentMatch argumentMatch: argumentMatches.values() ) { + if( argumentMatch.definition != null ) + matches.mergeInto( argumentMatch ); + } + return matches; + } + + /** + * Find arguments that are unmatched to any definition. + * @return Set of matches that have no associated definition. + */ + ArgumentMatches findUnmatched() { + ArgumentMatches matches = new ArgumentMatches(); + for( ArgumentMatch argumentMatch: argumentMatches.values() ) { + if( argumentMatch.definition == null ) + matches.mergeInto( argumentMatch ); + } + return matches; + } + + /** + * Reformat the given entries with the given multiplexer and key. + * TODO: Generify this. + * @param multiplexer Multiplexer that controls the transformation process. + * @param key Key which specifies the transform. + * @return new argument matches. + */ + ArgumentMatches transform(Multiplexer multiplexer, Object key) { + ArgumentMatches newArgumentMatches = new ArgumentMatches(); + for(ArgumentMatch match: argumentMatches.values()) + newArgumentMatches.mergeInto(match.transform(multiplexer,key)); + return newArgumentMatches; + } + + /** + * Merges the given argument match into the set of existing argument matches. + * If multiple arguments are present, those arguments will end up grouped. + * @param match The match to merge into. 
+ */ + void mergeInto( ArgumentMatch match ) { + boolean definitionExists = false; + + // Clone the list of argument matches to avoid ConcurrentModificationExceptions. + for( ArgumentMatch argumentMatch: getUniqueMatches() ) { + if( argumentMatch.definition == match.definition && argumentMatch.tags.equals(match.tags) ) { + argumentMatch.mergeInto( match ); + for( ArgumentMatchSite site: match.sites.keySet() ) + argumentMatches.put( site, argumentMatch ); + definitionExists = true; + } + } + + if( !definitionExists ) { + for( ArgumentMatchSite site: match.sites.keySet() ) + argumentMatches.put( site, match ); + } + } + + /** + * Determines, of the argument matches by position, which are unique and returns that list. + * @return A unique set of matches. + */ + private Set getUniqueMatches() { + return new LinkedHashSet( argumentMatches.values() ); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentSource.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentSource.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentSource.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentSource.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentTypeDescriptor.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentTypeDescriptor.java new file mode 100644 index 000000000..a1052261e --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ArgumentTypeDescriptor.java @@ -0,0 +1,1038 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without 
limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.commandline; + +import htsjdk.tribble.AbstractFeatureReader; +import org.apache.log4j.Logger; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.refdata.tracks.FeatureManager; +import org.broadinstitute.gatk.utils.classloader.JVMUtils; +import org.broadinstitute.gatk.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.text.XReadLines; + +import java.io.File; +import java.io.IOException; +import java.lang.annotation.Annotation; +import java.lang.reflect.*; +import java.util.*; + +/** + * An descriptor capable of providing parsers that can parse any type + * of supported command-line argument. 
+ * + * @author mhanna + * @version 0.1 + */ +public abstract class ArgumentTypeDescriptor { + private static Class[] ARGUMENT_ANNOTATIONS = {Input.class, Output.class, Argument.class}; + + /** + * our log, which we want to capture anything from org.broadinstitute.gatk + */ + protected static final Logger logger = Logger.getLogger(ArgumentTypeDescriptor.class); + + /** + * Fetch the given descriptor from the descriptor repository. + * @param descriptors the descriptors from which to select a good match. + * @param type Class for which to specify a descriptor. + * @return descriptor for the given type. + */ + public static ArgumentTypeDescriptor selectBest( Collection descriptors, Class type ) { + for( ArgumentTypeDescriptor descriptor: descriptors ) { + if( descriptor.supports(type) ) + return descriptor; + } + throw new ReviewedGATKException("Can't process command-line arguments of type: " + type.getName()); + } + + /** + * Returns true if the file will be compressed. + * @param writerFileName Name of the file + * @return true if the file will be compressed. + */ + public static boolean isCompressed(String writerFileName) { + return writerFileName != null && AbstractFeatureReader.hasBlockCompressedExtension(writerFileName); + } + + /** + * Does this descriptor support classes of the given type? + * @param type The type to check. + * @return true if this descriptor supports the given type, false otherwise. + */ + public abstract boolean supports( Class type ); + + /** + * Returns false if a type-specific default can be employed. + * @param source Source of the command-line argument. + * @return True to throw in a type specific default. False otherwise. + */ + public boolean createsTypeDefault(ArgumentSource source) { return false; } + + /** + * Returns a documentation-friendly value for the default of a type descriptor. + * Must be overridden if createsTypeDefault return true. cannot be called otherwise + * @param source Source of the command-line argument. 
+ * @return Friendly string of the default value, for documentation. If doesn't create a default, throws + * and UnsupportedOperationException + */ + public String typeDefaultDocString(ArgumentSource source) { + throw new UnsupportedOperationException(); + } + + /** + * Generates a default for the given type. + * + * @param parsingEngine the parsing engine used to validate this argument type descriptor. + * @param source Source of the command-line argument. + * @param type Type of value to create, in case the command-line argument system wants influence. + * @return A default value for the given type. + */ + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { throw new UnsupportedOperationException("Unable to create default for type " + getClass()); } + + /** + * Given the given argument source and attributes, synthesize argument definitions for command-line arguments. + * @param source Source class and field for the given argument. + * @return A list of command-line argument definitions supporting this field. + */ + public List createArgumentDefinitions( ArgumentSource source ) { + return Collections.singletonList(createDefaultArgumentDefinition(source)); + } + + /** + * Parses an argument source to an object. + * WARNING! Mandatory side effect of parsing! Each parse routine should register the tags it finds with the proper CommandLineProgram. + * TODO: Fix this, perhaps with an event model indicating that a new argument has been created. + * + * @param parsingEngine The engine responsible for parsing. + * @param source The source used to find the matches. + * @param matches The matches for the source. + * @return The parsed object. + */ + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, ArgumentMatches matches) { + return parse(parsingEngine, source, source.field.getGenericType(), matches); + } + + /** + * Returns true if the field is a collection or an array. 
+ * @param source The argument source to check. + * @return true if the field is a collection or an array. + */ + public boolean isMultiValued( ArgumentSource source ) { + Class argumentType = source.field.getType(); + return Collection.class.isAssignableFrom(argumentType) || argumentType.isArray(); + } + + /** + * By default, argument sources create argument definitions with a set of default values. + * Use this method to create the one simple argument definition. + * @param source argument source for which to create a default definition. + * @return The default definition for this argument source. + */ + protected ArgumentDefinition createDefaultArgumentDefinition( ArgumentSource source ) { + Annotation argumentAnnotation = getArgumentAnnotation(source); + return new ArgumentDefinition( ArgumentIOType.getIOType(argumentAnnotation), + source.field.getType(), + ArgumentDefinition.getFullName(argumentAnnotation, source.field.getName()), + ArgumentDefinition.getShortName(argumentAnnotation), + ArgumentDefinition.getDoc(argumentAnnotation), + source.isRequired() && !createsTypeDefault(source) && !source.isFlag() && !source.isDeprecated(), + source.isFlag(), + source.isMultiValued(), + source.isHidden(), + makeRawTypeIfNecessary(getCollectionComponentType(source.field)), + ArgumentDefinition.getExclusiveOf(argumentAnnotation), + ArgumentDefinition.getValidationRegex(argumentAnnotation), + getValidOptions(source) ); + } + + /** + * Return the component type of a field, or String.class if the type cannot be found. + * @param field The reflected field to inspect. + * @return The parameterized component type, or String.class if the parameterized type could not be found. + * @throws IllegalArgumentException If more than one parameterized type is found on the field. + */ + protected Type getCollectionComponentType( Field field ) { + return null; + } + + /** + * Parses the argument matches for a class type into an object. 
+ * @param source The original argument source used to find the matches. + * @param type The current class type being inspected. May not match the argument source.field.getType() if this as a collection for example. + * @param matches The argument matches for the argument source, or the individual argument match for a scalar if this is being called to help parse a collection. + * @return The individual parsed object matching the argument match with Class type. + */ + public abstract Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ); + + /** + * If the argument source only accepts a small set of options, populate the returned list with + * those options. Otherwise, leave the list empty. + * @param source Original field specifying command-line arguments. + * @return A list of valid options. + */ + protected List getValidOptions( ArgumentSource source ) { + if(!source.field.getType().isEnum()) + return null; + List validOptions = new ArrayList(); + for(Object constant: source.field.getType().getEnumConstants()) + validOptions.add(constant.toString()); + return validOptions; + } + + /** + * Returns true if the argument with the given full name exists in the collection of ArgumentMatches. + * @param definition Definition of the argument for which to find matches. + * @param matches The matches for the given argument. + * @return true if the argument is present, or false if not present. + */ + protected boolean argumentIsPresent( ArgumentDefinition definition, ArgumentMatches matches ) { + for( ArgumentMatch match: matches ) { + if( match.definition.equals(definition) ) + return true; + } + return false; + } + + /** + * Gets the value of an argument with the given full name, from the collection of ArgumentMatches. + * If the argument matches multiple values, an exception will be thrown. + * @param definition Definition of the argument for which to find matches. + * @param matches The matches for the given argument. 
+ * @return The value of the argument if available, or null if not present. + */ + protected ArgumentMatchValue getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) { + Collection argumentValues = getArgumentValues( definition, matches ); + if( argumentValues.size() > 1 ) + throw new UserException.CommandLineException("Multiple values associated with given definition, but this argument expects only one: " + definition.fullName); + return argumentValues.size() > 0 ? argumentValues.iterator().next() : null; + } + + /** + * Gets the tags associated with a given command-line argument. + * If the argument matches multiple values, an exception will be thrown. + * @param matches The matches for the given argument. + * @return The value of the argument if available, or null if not present. + */ + protected Tags getArgumentTags(ArgumentMatches matches) { + Tags tags = new Tags(); + for(ArgumentMatch match: matches) { + if(!tags.isEmpty() && !match.tags.isEmpty()) + throw new ReviewedGATKException("BUG: multiple conflicting sets of tags are available, and the type descriptor specifies no way of resolving the conflict."); + tags = match.tags; + } + return tags; + } + + /** + * Gets the values of an argument with the given full name, from the collection of ArgumentMatches. + * @param definition Definition of the argument for which to find matches. + * @param matches The matches for the given argument. + * @return The value of the argument if available, or an empty collection if not present. + */ + protected Collection getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) { + Collection values = new ArrayList(); + for( ArgumentMatch match: matches ) { + if( match.definition.equals(definition) ) + values.addAll(match.values()); + } + return values; + } + + /** + * Retrieves the argument description from the given argument source. Will throw an exception if + * the given ArgumentSource + * @param source source of the argument. 
+ * @return Argument description annotation associated with the given field. + */ + @SuppressWarnings("unchecked") + protected static Annotation getArgumentAnnotation( ArgumentSource source ) { + for (Class annotation: ARGUMENT_ANNOTATIONS) + if (source.field.isAnnotationPresent(annotation)) + return source.field.getAnnotation(annotation); + throw new ReviewedGATKException("ArgumentAnnotation is not present for the argument field: " + source.field.getName()); + } + + /** + * Returns true if an argument annotation is present + * @param field The field to check for an annotation. + * @return True if an argument annotation is present on the field. + */ + @SuppressWarnings("unchecked") + public static boolean isArgumentAnnotationPresent(Field field) { + for (Class annotation: ARGUMENT_ANNOTATIONS) + if (field.isAnnotationPresent(annotation)) + return true; + return false; + } + + /** + * Returns true if the given annotation is hidden from the help system. + * @param field Field to test. + * @return True if argument should be hidden. False otherwise. + */ + public static boolean isArgumentHidden(Field field) { + return field.isAnnotationPresent(Hidden.class); + } + + public static Class makeRawTypeIfNecessary(Type t) { + if ( t == null ) + return null; + else if ( t instanceof ParameterizedType ) + return (Class)((ParameterizedType) t).getRawType(); + else if ( t instanceof Class ) { + return (Class)t; + } else { + throw new IllegalArgumentException("Unable to determine Class-derived component type of field: " + t); + } + } + + /** + * The actual argument parsing method. + * @param source source + * @param type type to check + * @param matches matches + * @param tags argument tags + * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding. 
+ */ + protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) { + ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); + @SuppressWarnings("unchecked") + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + String name = defaultDefinition.fullName; + + return parseBinding(value, parameterType, type, name, tags, source.field.getName()); + } + + /** + * + * @param value The source of the binding + * @param parameterType The Tribble Feature parameter type + * @param bindingClass The class type for the binding (ex: RodBinding, IntervalBinding, etc.) Must have the correct constructor for creating the binding. + * @param bindingName The name of the binding passed to the constructor. + * @param tags Tags for the binding used for parsing and passed to the constructor. + * @param fieldName The name of the field that was parsed. Used for error reporting. + * @return The newly created binding object of type bindingClass. + */ + public static Object parseBinding(ArgumentMatchValue value, Class parameterType, Type bindingClass, + String bindingName, Tags tags, String fieldName) { + try { + String tribbleType = null; + // must have one or two tag values here + if ( tags.getPositionalTags().size() > 2 ) { + throw new UserException.CommandLineException( + String.format("Unexpected number of positional tags for argument %s : %s. 
" + + "Rod bindings only support -X:type and -X:name,type argument styles", + value.asString(), fieldName)); + } else if ( tags.getPositionalTags().size() == 2 ) { + // -X:name,type style + bindingName = tags.getPositionalTags().get(0); + tribbleType = tags.getPositionalTags().get(1); + + FeatureManager manager = new FeatureManager(); + if ( manager.getByName(tribbleType) == null ) + throw new UserException.UnknownTribbleType( + tribbleType, + String.format("Unable to find tribble type '%s' provided on the command line. " + + "Please select a correct type from among the supported types:%n%s", + tribbleType, manager.userFriendlyListOfAvailableFeatures(parameterType))); + + } else { + // case with 0 or 1 positional tags + FeatureManager manager = new FeatureManager(); + + // -X:type style is a type when we cannot determine the type dynamically + String tag1 = tags.getPositionalTags().size() == 1 ? tags.getPositionalTags().get(0) : null; + if ( tag1 != null ) { + if ( manager.getByName(tag1) != null ) // this a type + tribbleType = tag1; + else + bindingName = tag1; + } + + if ( tribbleType == null ) { + // try to determine the file type dynamically + File file = value.asFile(); + if ( file.canRead() && file.isFile() ) { + FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); + if ( featureDescriptor != null ) { + tribbleType = featureDescriptor.getName(); + logger.debug("Dynamically determined type of " + file + " to be " + tribbleType); + } + } + + if ( tribbleType == null ) { + // IntervalBinding can be created from a normal String + Class rawType = (makeRawTypeIfNecessary(bindingClass)); + try { + return rawType.getConstructor(String.class).newInstance(value.asString()); + } catch (NoSuchMethodException e) { + /* ignore */ + } + + if ( ! file.exists() ) { + throw new UserException.CouldNotReadInputFile(file, "file does not exist"); + } else if ( ! file.canRead() || ! 
file.isFile() ) { + throw new UserException.CouldNotReadInputFile(file, "file could not be read"); + } else { + throw new UserException.CommandLineException( + String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " + + "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", + manager.userFriendlyListOfAvailableFeatures(parameterType))); + } + } + } + } + + Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); + return ctor.newInstance(parameterType, bindingName, value.asString(), tribbleType, tags); + } catch (Exception e) { + if ( e instanceof UserException ) + throw ((UserException)e); + else + throw new UserException.CommandLineException( + String.format("Failed to parse value %s for argument %s. Message: %s", + value, fieldName, e.getMessage())); + } + } + + /** + * Parse the source of a RodBindingCollection, which can be either a file of RodBindings or an actual RodBinding. 
+ * + * @param parsingEngine the parsing engine used to validate this argument type descriptor + * @param source source + * @param type type + * @param matches matches + * @param tags argument tags + * @return the newly created binding object + */ + public Object parseRodBindingCollectionSource(final ParsingEngine parsingEngine, + final ArgumentSource source, + final Type type, + final ArgumentMatches matches, + final Tags tags) { + + final ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + final ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); + @SuppressWarnings("unchecked") + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + String name = defaultDefinition.fullName; + + // if this a list of files, get those bindings + final File file = value.asFile(); + try { + if (file.getAbsolutePath().endsWith(".list")) { + return getRodBindingsCollection(file, parsingEngine, parameterType, name, tags, source.field.getName()); + } + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + + // otherwise, treat this as an individual binding + final RodBinding binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, name, tags, source.field.getName()); + parsingEngine.addTags(binding, tags); + parsingEngine.addRodBinding(binding); + return RodBindingCollection.createRodBindingCollectionOfType(parameterType, Arrays.asList(binding)); + } + + /** + * Retrieve and parse a collection of RodBindings from the given file. + * + * If the file contains duplicate entries or is empty, an exception will be thrown. + * + * @param file the source file + * @param parsingEngine the engine responsible for parsing + * @param parameterType the Tribble Feature parameter type + * @param bindingName the name of the binding passed to the constructor. + * @param defaultTags general tags for the binding used for parsing and passed to the constructor. 
+ * @param fieldName the name of the field that was parsed. Used for error reporting. + * @return the newly created collection of binding objects. + */ + public static Object getRodBindingsCollection(final File file, + final ParsingEngine parsingEngine, + final Class parameterType, + final String bindingName, + final Tags defaultTags, + final String fieldName) throws IOException { + final List bindings = new ArrayList<>(); + + // Keep track of the files in this list so that we can check for duplicates and empty files + final Set fileValues = new HashSet<>(); + + // parse each line separately using the given Tags if none are provided on each line + for ( final String line: new XReadLines(file) ) { + final String[] tokens = line.split("\\s+"); + final RodBinding binding; + + if ( tokens.length == 0 ) { + continue; // empty line, so do nothing + } + // use the default tags if none are provided for this binding + else if ( tokens.length == 1 ) { + final ArgumentMatchValue value = parseAndValidateArgumentMatchValue(tokens[0], fileValues, fieldName, file.getName()); + binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, defaultTags, fieldName); + parsingEngine.addTags(binding, defaultTags); + + } + // use the new tags if provided + else if ( tokens.length == 2 ) { + final Tags tags = ParsingMethod.parseTags(fieldName, tokens[0]); + final ArgumentMatchValue value = parseAndValidateArgumentMatchValue(tokens[1], fileValues, fieldName, file.getName()); + binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, tags, fieldName); + parsingEngine.addTags(binding, tags); + } else { + throw new UserException.BadArgumentValue(fieldName, "data lines should consist of an optional set of tags along with a path to a file; too many tokens are present for line: " + line); + } + + bindings.add(binding); + parsingEngine.addRodBinding(binding); + } + + if (fileValues.isEmpty()) { + throw new 
UserException.BadArgumentValue(fieldName, "The input list " + file.getName() + " is empty."); + } + + return RodBindingCollection.createRodBindingCollectionOfType(parameterType, bindings); + } + + /** + * Validates the resource file name and constructs an ArgumentMatchValue from it. + * + * If the list name has already been processed in the current list, throws a UserException, otherwise + * creates an ArgumentMatchValue to represent the list. + * + * @param token Name of the ROD resource file. + * @param fileValues Set of names of ROD files that have already been processed. + * @param fieldName Name of the argument field being populated. + * @param listFileName Name of the list file being processed. + * @return + */ + private static ArgumentMatchValue parseAndValidateArgumentMatchValue(final String token, final Set fileValues, final String fieldName, + final String listFileName) { + checkForDuplicateFileName(token, fileValues, fieldName, listFileName); + return new ArgumentMatchStringValue(token); + } + + /** + * Checks to make sure that the current file name to be processed has not already been processed. + * + * Checks the name of the current file against the names that have already been processed, throwing + * an informative BadArgumentValue exception if it has already been seen. As a side effect adds the + * current file name to the set of filenames that have already been processed. 
+ * + * @param currentFile Name of the current file to process + * @param processedFiles Set of file names that have already been processed + * @param fieldName Name of the argument that is being populated + * @param listName Filename of the list that is being processed + */ + protected static void checkForDuplicateFileName(final String currentFile, final Set processedFiles, + final String fieldName, final String listName) { + if (processedFiles.contains(currentFile)) { + throw new UserException.BadArgumentValue(fieldName, "The input list " + listName + " contains file " + currentFile + + " multiple times, which isn't allowed. If you are intentionally trying to " + + "include the same file more than once, you will need to specify it in separate file lists."); + } + processedFiles.add(currentFile); + } +} + +/** + * Parser for RodBinding objects + */ +class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * We only want RodBinding class objects + * @param type The type to check. + * @return true if the provided class is a RodBinding.class + */ + @Override + public boolean supports( Class type ) { + return isRodBinding(type); + } + + public static boolean isRodBinding( Class type ) { + return RodBinding.class.isAssignableFrom(type); + } + + @Override + public boolean createsTypeDefault(ArgumentSource source) { return ! 
source.isRequired(); } + + @Override + @SuppressWarnings("unchecked") + public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) { + Class parameterType = JVMUtils.getParameterizedTypeClass(type); + return RodBinding.makeUnbound((Class)parameterType); + } + + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "none"; + } + + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { + Tags tags = getArgumentTags(matches); + RodBinding rbind = (RodBinding)parseBinding(source, type, matches, tags); + parsingEngine.addTags(rbind, tags); + parsingEngine.addRodBinding(rbind); + return rbind; + } +} + +/** + * Parser for IntervalBinding objects + */ +class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * We only want IntervalBinding class objects + * @param type The type to check. + * @return true if the provided class is an IntervalBinding.class + */ + @Override + public boolean supports( Class type ) { + return isIntervalBinding(type); + } + + public static boolean isIntervalBinding( Class type ) { + return IntervalBinding.class.isAssignableFrom(type); + } + + /** + * See note from RodBindingArgumentTypeDescriptor.parse(). + * + * @param parsingEngine parsing engine + * @param source source + * @param type type to check + * @param matches matches + * @return the IntervalBinding object. + */ + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { + return parseBinding(source, type, matches, getArgumentTags(matches)); + } +} + +/** + * Parser for RodBindingCollection objects + */ +class RodBindingCollectionArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * We only want RodBindingCollection class objects + * @param type The type to check. 
+ * @return true if the provided class is an RodBindingCollection.class + */ + @Override + public boolean supports( final Class type ) { + return isRodBindingCollection(type); + } + + public static boolean isRodBindingCollection( final Class type ) { + return RodBindingCollection.class.isAssignableFrom(type); + } + + /** + * See note from RodBindingArgumentTypeDescriptor.parse(). + * + * @param parsingEngine parsing engine + * @param source source + * @param type type to check + * @param matches matches + * @return the IntervalBinding object. + */ + @Override + public Object parse(final ParsingEngine parsingEngine, final ArgumentSource source, final Type type, final ArgumentMatches matches) { + final Tags tags = getArgumentTags(matches); + return parseRodBindingCollectionSource(parsingEngine, source, type, matches, tags); + } +} + +/** + * Parse simple argument types: java primitives, wrapper classes, and anything that has + * a simple String constructor. + */ +class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { + + /** + * @param type the class type + * @return true if this class is a binding type, false otherwise + */ + private boolean isBinding(final Class type) { + return RodBindingArgumentTypeDescriptor.isRodBinding(type) || + IntervalBindingArgumentTypeDescriptor.isIntervalBinding(type) || + RodBindingCollectionArgumentTypeDescriptor.isRodBindingCollection(type); + } + + + @Override + public boolean supports( Class type ) { + if ( isBinding(type) ) return false; + if ( type.isPrimitive() ) return true; + if ( type.isEnum() ) return true; + if ( primitiveToWrapperMap.containsValue(type) ) return true; + + try { + type.getConstructor(String.class); + return true; + } + catch( Exception ex ) { + // An exception thrown above means that the String constructor either doesn't + // exist or can't be accessed. In either case, this descriptor doesn't support this type. 
+ return false; + } + } + + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type fulltype, ArgumentMatches matches) { + Class type = makeRawTypeIfNecessary(fulltype); + if (source.isFlag()) + return true; + + ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); + ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches); + Object result; + Tags tags = getArgumentTags(matches); + + // lets go through the types we support + try { + if (type.isPrimitive()) { + Method valueOf = primitiveToWrapperMap.get(type).getMethod("valueOf",String.class); + if(value == null) + throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); + result = valueOf.invoke(null,value.asString().trim()); + } else if (type.isEnum()) { + Object[] vals = type.getEnumConstants(); + Object defaultEnumeration = null; // as we look at options, record the default option if it exists + for (Object val : vals) { + if (String.valueOf(val).equalsIgnoreCase(value == null ? null : value.asString())) return val; + try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; } + catch (NoSuchFieldException e) { throw new ReviewedGATKException("parsing " + type.toString() + "doesn't contain the field " + val.toString()); } + } + // if their argument has no value (null), and there's a default, return that default for the enum value + if (defaultEnumeration != null && value == null) + result = defaultEnumeration; + // if their argument has no value and there's no default, throw a missing argument value exception. + // TODO: Clean this up so that null values never make it to this point. To fix this, we'll have to clean up the implementation of -U. 
+ else if (value == null) + throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); + else + throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString()); + } else if (type.equals(File.class)) { + result = value == null ? null : value.asFile(); + } else { + if (value == null) + throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); + Constructor ctor = type.getConstructor(String.class); + result = ctor.newInstance(value.asString()); + } + } catch (UserException e) { + throw e; + } catch (InvocationTargetException e) { + throw new UserException.CommandLineException(String.format("Failed to parse value %s for argument %s. This is most commonly caused by providing an incorrect data type (e.g. a double when an int is required)", + value, source.field.getName())); + } catch (Exception e) { + throw new DynamicClassResolutionException(String.class, e); + } + + // TODO FIXME! + + // WARNING: Side effect! + parsingEngine.addTags(result,tags); + + return result; + } + + + /** + * A mapping of the primitive types to their associated wrapper classes. Is there really no way to infer + * this association available in the JRE? + */ + private static Map primitiveToWrapperMap = new HashMap() { + { + put( Boolean.TYPE, Boolean.class ); + put( Character.TYPE, Character.class ); + put( Byte.TYPE, Byte.class ); + put( Short.TYPE, Short.class ); + put( Integer.TYPE, Integer.class ); + put( Long.TYPE, Long.class ); + put( Float.TYPE, Float.class ); + put( Double.TYPE, Double.class ); + } + }; +} + +/** + * Process compound argument types: arrays, and typed and untyped collections. 
+ */ +class CompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { + @Override + public boolean supports( Class type ) { + return ( Collection.class.isAssignableFrom(type) || type.isArray() ); + } + + @Override + @SuppressWarnings("unchecked") + public Object parse(ParsingEngine parsingEngine,ArgumentSource source, Type fulltype, ArgumentMatches matches) { + Class type = makeRawTypeIfNecessary(fulltype); + Type componentType; + Object result; + + if( Collection.class.isAssignableFrom(type) ) { + + // If this is a generic interface, pick a concrete implementation to create and pass back. + // Because of type erasure, don't worry about creating one of exactly the correct type. + if( Modifier.isInterface(type.getModifiers()) || Modifier.isAbstract(type.getModifiers()) ) + { + if( java.util.List.class.isAssignableFrom(type) ) type = ArrayList.class; + else if( java.util.Queue.class.isAssignableFrom(type) ) type = java.util.ArrayDeque.class; + else if( java.util.Set.class.isAssignableFrom(type) ) type = java.util.TreeSet.class; + } + + componentType = getCollectionComponentType( source.field ); + ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); + + Collection collection; + try { + collection = (Collection)type.newInstance(); + } + catch (InstantiationException e) { + logger.fatal("ArgumentParser: InstantiationException: cannot convert field " + source.field.getName()); + throw new ReviewedGATKException("constructFromString:InstantiationException: Failed conversion " + e.getMessage()); + } + catch (IllegalAccessException e) { + logger.fatal("ArgumentParser: IllegalAccessException: cannot convert field " + source.field.getName()); + throw new ReviewedGATKException("constructFromString:IllegalAccessException: Failed conversion " + e.getMessage()); + } + + for( ArgumentMatch match: matches ) { + for( ArgumentMatch value: match ) { + Object object = 
componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value)); + collection.add( object ); + // WARNING: Side effect! + parsingEngine.addTags(object,value.tags); + } + } + + result = collection; + + } + else if( type.isArray() ) { + componentType = type.getComponentType(); + ArgumentTypeDescriptor componentArgumentParser = parsingEngine.selectBestTypeDescriptor(makeRawTypeIfNecessary(componentType)); + + // Assemble a collection of individual values used in this computation. + Collection values = new ArrayList(); + for( ArgumentMatch match: matches ) + for( ArgumentMatch value: match ) + values.add(value); + + result = Array.newInstance(makeRawTypeIfNecessary(componentType),values.size()); + + int i = 0; + for( ArgumentMatch value: values ) { + Object object = componentArgumentParser.parse(parsingEngine,source,componentType,new ArgumentMatches(value)); + Array.set(result,i++,object); + // WARNING: Side effect! + parsingEngine.addTags(object,value.tags); + } + } + else + throw new ReviewedGATKException("Unsupported compound argument type: " + type); + + return result; + } + + /** + * Return the component type of a field, or String.class if the type cannot be found. + * @param field The reflected field to inspect. + * @return The parameterized component type, or String.class if the parameterized type could not be found. + * @throws IllegalArgumentException If more than one parameterized type is found on the field. + */ + @Override + protected Type getCollectionComponentType( Field field ) { + // If this is a parameterized collection, find the contained type. If blow up if more than one type exists. 
+ if( field.getGenericType() instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); + if( parameterizedType.getActualTypeArguments().length > 1 ) + throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); + return parameterizedType.getActualTypeArguments()[0]; + } + else + return String.class; + } +} + +class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * The multiplexer controlling how data is split. + */ + private final Multiplexer multiplexer; + + /** + * The set of identifiers for the multiplexed entries. + */ + private final Collection multiplexedIds; + + public MultiplexArgumentTypeDescriptor() { + this.multiplexer = null; + this.multiplexedIds = null; + } + + /** + * Private constructor to use in creating a closure of the MultiplexArgumentTypeDescriptor specific to the + * given set of multiplexed ids. + * @param multiplexedIds The collection of multiplexed entries + */ + private MultiplexArgumentTypeDescriptor(final Multiplexer multiplexer, final Collection multiplexedIds) { + this.multiplexer = multiplexer; + this.multiplexedIds = multiplexedIds; + } + + @Override + public boolean supports( Class type ) { + return ( Map.class.isAssignableFrom(type) ); + } + + @Override + public boolean createsTypeDefault(ArgumentSource source) { + // Multiplexing always creates a type default. 
+ return true; + } + + @Override + public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) { + if(multiplexer == null || multiplexedIds == null) + throw new ReviewedGATKException("No multiplexed ids available"); + + Map multiplexedMapping = new HashMap(); + Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); + ArgumentTypeDescriptor componentTypeDescriptor = parsingEngine.selectBestTypeDescriptor(componentType); + + for(Object id: multiplexedIds) { + Object value = null; + if(componentTypeDescriptor.createsTypeDefault(source)) + value = componentTypeDescriptor.createTypeDefault(parsingEngine,source,componentType); + multiplexedMapping.put(id,value); + } + return multiplexedMapping; + } + + @Override + public String typeDefaultDocString(ArgumentSource source) { + return "None"; + } + + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { + if(multiplexedIds == null) + throw new ReviewedGATKException("Cannot directly parse a MultiplexArgumentTypeDescriptor; must create a derivative type descriptor first."); + + Map multiplexedMapping = new HashMap(); + + Class componentType = makeRawTypeIfNecessary(getCollectionComponentType(source.field)); + + + for(Object id: multiplexedIds) { + Object value = parsingEngine.selectBestTypeDescriptor(componentType).parse(parsingEngine,source,componentType,matches.transform(multiplexer,id)); + multiplexedMapping.put(id,value); + } + + parsingEngine.addTags(multiplexedMapping,getArgumentTags(matches)); + + return multiplexedMapping; + } + + public MultiplexArgumentTypeDescriptor createCustomTypeDescriptor(ParsingEngine parsingEngine,ArgumentSource dependentArgument,Object containingObject) { + String[] sourceFields = dependentArgument.field.getAnnotation(Multiplex.class).arguments(); + + List allSources = parsingEngine.extractArgumentSources(containingObject.getClass()); + Class[] sourceTypes = 
new Class[sourceFields.length]; + Object[] sourceValues = new Object[sourceFields.length]; + int currentField = 0; + + for(String sourceField: sourceFields) { + boolean fieldFound = false; + for(ArgumentSource source: allSources) { + if(!source.field.getName().equals(sourceField)) + continue; + if(source.field.isAnnotationPresent(Multiplex.class)) + throw new ReviewedGATKException("Command-line arguments can only depend on independent fields"); + sourceTypes[currentField] = source.field.getType(); + sourceValues[currentField] = JVMUtils.getFieldValue(source.field,containingObject); + currentField++; + fieldFound = true; + } + if(!fieldFound) + throw new ReviewedGATKException(String.format("Unable to find source field %s, referred to by dependent field %s",sourceField,dependentArgument.field.getName())); + } + + Class multiplexerType = dependentArgument.field.getAnnotation(Multiplex.class).value(); + Constructor multiplexerConstructor; + try { + multiplexerConstructor = multiplexerType.getConstructor(sourceTypes); + multiplexerConstructor.setAccessible(true); + } + catch(NoSuchMethodException ex) { + throw new ReviewedGATKException(String.format("Unable to find constructor for class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + + Multiplexer multiplexer; + try { + multiplexer = multiplexerConstructor.newInstance(sourceValues); + } + catch(IllegalAccessException ex) { + throw new ReviewedGATKException(String.format("Constructor for class %s with parameters %s is inaccessible",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + catch(InstantiationException ex) { + throw new ReviewedGATKException(String.format("Can't create class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + catch(InvocationTargetException ex) { + throw new ReviewedGATKException(String.format("Can't invoke constructor of class %s with parameters 
%s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex); + } + + return new MultiplexArgumentTypeDescriptor(multiplexer,multiplexer.multiplex()); + } + + /** + * Return the component type of a field, or String.class if the type cannot be found. + * @param field The reflected field to inspect. + * @return The parameterized component type, or String.class if the parameterized type could not be found. + * @throws IllegalArgumentException If more than one parameterized type is found on the field. + */ + @Override + protected Type getCollectionComponentType( Field field ) { + // Multiplex arguments must resolve to maps from which the clp should extract the second type. + if( field.getGenericType() instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType)field.getGenericType(); + if( parameterizedType.getActualTypeArguments().length != 2 ) + throw new IllegalArgumentException("Unable to determine collection type of field: " + field.toString()); + return (Class)parameterizedType.getActualTypeArguments()[1]; + } + else + return String.class; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ClassType.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ClassType.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ClassType.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ClassType.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java new file mode 100644 index 000000000..0c3cbecc7 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineProgram.java @@ -0,0 +1,458 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is 
hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.commandline; + +import org.apache.log4j.FileAppender; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.help.ApplicationDetails; +import org.broadinstitute.gatk.utils.help.HelpConstants; +import org.broadinstitute.gatk.utils.help.HelpFormatter; +import org.broadinstitute.gatk.utils.text.TextFormattingUtils; + +import java.io.IOException; +import java.util.*; + +public abstract class CommandLineProgram { + + /** The command-line program and the arguments it returned. */ + public ParsingEngine parser = null; + + /** + * Setting INFO gets you INFO up to FATAL, setting ERROR gets you ERROR and FATAL level logging, and so on. 
+ */ + @Argument(fullName = "logging_level", shortName = "l", doc = "Set the minimum level of logging", required = false) + protected String logging_level = "INFO"; + + /** + * File to save the logging output. + */ + @Output(fullName = "log_to_file", shortName = "log", doc = "Set the logging location", required = false) + protected String toFile = null; + + /** + * This will produce a help message in the terminal with general usage information, listing available arguments + * as well as tool-specific information if applicable. + */ + @Argument(fullName = "help", shortName = "h", doc = "Generate the help message", required = false) + public Boolean help = false; + + /** + * Use this to check the version number of the GATK executable you are invoking. Note that the version number is + * always included in the output at the start of every run as well as any error message. + */ + @Argument(fullName = "version", shortName = "version", doc ="Output version information", required = false) + public Boolean version = false; + + + /** our logging output patterns */ + private static final String patternString = "%-5p %d{HH:mm:ss,SSS} %C{1} - %m %n"; + + static { + /** + * The very first thing that any GATK application does is forces the JVM locale into US English, so that we don't have + * to think about number formatting issues. + */ + forceJVMLocaleToUSEnglish(); + // setup a basic log configuration + CommandLineUtils.configureConsoleLogging(); + } + + + /** + * Allows a given application to return a brief description of itself. + * + * @return An ApplicationDetails object describing the current application. Should not be null. 
+ */ + protected ApplicationDetails getApplicationDetails() { + return new ApplicationDetails(ApplicationDetails.createDefaultHeader(getClass()), + Collections.emptyList(), + ApplicationDetails.createDefaultRunningInstructions(getClass()), + null); + } + + /** + * Subclasses of CommandLinePrograms can provide their own types of command-line arguments. + * @return A collection of type descriptors generating implementation-dependent placeholders. + */ + protected Collection getArgumentTypeDescriptors() { + return Collections.emptyList(); + } + + /** + * Will this application want to vary its argument list dynamically? + * If so, parse the command-line options and then prompt the subclass to return + * a list of argument providers. + * + * @return Whether the application should vary command-line arguments dynamically. + */ + protected boolean canAddArgumentsDynamically() { return false; } + + /** + * Provide a list of object to inspect, looking for additional command-line arguments. + * + * @return A list of objects to inspect. + */ + protected Class[] getArgumentSources() { + return new Class[]{}; + } + + /** + * Name this argument source. Provides the (full) class name as a default. + * + * @param source The argument source. + * + * @return a name for the argument source. + */ + protected String getArgumentSourceName( Class source ) { return source.toString(); } + + /** + * Sets the command-line parsing engine. Necessary for unit testing purposes. 
+ * @param parser the new command-line parsing engine + */ + public void setParser( ParsingEngine parser ) { + this.parser = parser; + } + + /** + * this is the function that the inheriting class can expect to have called + * when all the argument processing is done + * + * @return the return code to exit the program with + * @throws Exception when an exception occurs + */ + protected abstract int execute() throws Exception; + + public static int result = -1; + + @SuppressWarnings("unchecked") + public static void start(CommandLineProgram clp, String[] args) throws Exception { + start(clp, args, false); + } + + /** + * This function is called to start processing the command line, and kick + * off the execute message of the program. + * + * @param clp the command line program to execute + * @param args the command line arguments passed in + * @param dryRun dry run + * @throws Exception when an exception occurs + */ + @SuppressWarnings("unchecked") + public static void start(CommandLineProgram clp, String[] args, boolean dryRun) throws Exception { + + try { + // setup our log layout + PatternLayout layout = new PatternLayout(); + + Logger logger = CommandLineUtils.getStingLogger(); + + // now set the layout of all the loggers to our layout + CommandLineUtils.setLayout(logger, layout); + + // Initialize the logger using the defaults. + clp.setupLoggerLevel(layout); + + // setup the parser + ParsingEngine parser = clp.parser = new ParsingEngine(clp); + parser.addArgumentSource(clp.getClass()); + + Map parsedArgs; + + // process the args + if (clp.canAddArgumentsDynamically()) { + // if the command-line program can toss in extra args, fetch them and reparse the arguments. + parser.parse(args); + + // Allow invalid and missing required arguments to pass this validation step. + // - InvalidArgument in case these arguments are specified by plugins. + // - MissingRequiredArgument in case the user requested help. 
Handle that later, once we've + // determined the full complement of arguments. + if ( ! dryRun ) + parser.validate(EnumSet.of(ParsingEngine.ValidationType.MissingRequiredArgument, + ParsingEngine.ValidationType.InvalidArgument)); + parser.loadArgumentsIntoObject(clp); + + // Initialize the logger using the loaded command line. + clp.setupLoggerLevel(layout); + + Class[] argumentSources = clp.getArgumentSources(); + for (Class argumentSource : argumentSources) + parser.addArgumentSource(clp.getArgumentSourceName(argumentSource), argumentSource); + parsedArgs = parser.parse(args); + + if (isVersionPresent(parser)) + printVersionAndExit(); + + if (isHelpPresent(parser)) + printHelpAndExit(clp, parser); + + if ( ! dryRun ) parser.validate(); + } else { + parsedArgs = parser.parse(args); + + if ( ! dryRun ) { + if (isHelpPresent(parser)) + printHelpAndExit(clp, parser); + + parser.validate(); + } + parser.loadArgumentsIntoObject(clp); + + // Initialize the logger using the loaded command line. + clp.setupLoggerLevel(layout); + } + + if ( ! dryRun ) { + // if they specify a log location, output our data there + if (clp.toFile != null) { + FileAppender appender; + try { + appender = new FileAppender(layout, clp.toFile, false); + logger.addAppender(appender); + } catch (IOException e) { + throw new RuntimeException("Unable to re-route log output to " + clp.toFile + " make sure the destination exists"); + } + } + + // regardless of what happens next, generate the header information + HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), parsedArgs); + + // call the execute + CommandLineProgram.result = clp.execute(); + } + } + catch (ArgumentException e) { + //clp.parser.printHelp(clp.getApplicationDetails()); + // Rethrow the exception to exit with an error. + throw e; + } + } + + /** + * Find fields in the object obj that look like command-line arguments, and put command-line + * arguments into them. 
+ * + * @param obj Object to inspect for command line arguments. + */ + public void loadArgumentsIntoObject(Object obj) { + parser.loadArgumentsIntoObject(obj); + } + + /** + * this function checks the logger level passed in on the command line, taking the lowest + * level that was provided. + * @param layout Pattern layout to format based on the logger level. + */ + private void setupLoggerLevel(PatternLayout layout) { + layout.setConversionPattern(patternString); + + // set the default logger level + Level par; + if (logging_level.toUpperCase().equals("DEBUG")) { + par = Level.DEBUG; + } else if (logging_level.toUpperCase().equals("INFO")) { + par = Level.INFO; + } else if (logging_level.toUpperCase().equals("WARN")) { + par = Level.WARN; + } else if (logging_level.toUpperCase().equals("ERROR")) { + par = Level.ERROR; + } else if (logging_level.toUpperCase().equals("FATAL")) { + par = Level.FATAL; + } else if (logging_level.toUpperCase().equals("OFF")) { + par = Level.OFF; + } else { + // we don't understand the logging level, let's get out of here + throw new ArgumentException("Unable to match: " + logging_level + " to a logging level, make sure it's a valid level (DEBUG, INFO, WARN, ERROR, FATAL, OFF)"); + } + + Logger.getRootLogger().setLevel(par); + } + + public static String getVersionNumber() { + // TODO: Confirm that version is available elsewhere not on tools. + ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("GATKText"); + return headerInfo.containsKey("org.broadinstitute.gatk.utils.version") ? headerInfo.getString("org.broadinstitute.gatk.utils.version") : ""; + } + + public static String getBuildTime() { + ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("GATKText"); + return headerInfo.containsKey("build.timestamp") ? 
headerInfo.getString("build.timestamp") : ""; + } + + /** + * a function used to indicate an error occurred in the command line tool + */ + private static void printDocumentationReference() { + errorPrintf("Visit our website and forum for extensive documentation and answers to %n"); + errorPrintf("commonly asked questions " + HelpConstants.BASE_GATK_URL + "%n"); + } + + + /** + * Do a cursory search for the given argument. + * + * @param parser Parser + * + * @return True if help is present; false otherwise. + */ + private static boolean isHelpPresent(ParsingEngine parser) { + return parser.isArgumentPresent("help"); + } + + /** + * Print help and exit. + * + * @param clp Instance of the command-line program. + * @param parser True if help is present; false otherwise. + */ + private static void printHelpAndExit(CommandLineProgram clp, ParsingEngine parser) { + parser.printHelp(clp.getApplicationDetails()); + System.exit(0); + } + + /** + * Do a cursory search for the argument "version". + * + * @param parser Parser + * + * @return True if version is present; false otherwise. + */ + private static boolean isVersionPresent(ParsingEngine parser) { + return parser.isArgumentPresent("version"); + } + + /** + * Print help and exit. + */ + private static void printVersionAndExit() { + System.out.println(getVersionNumber().toString()); + System.exit(0); + } + + + private static void errorPrintf(String format, Object... 
s) { + String formatted = String.format(format, s); + + if ( formatted.trim().equals("") ) + System.err.println("##### ERROR"); + else { + for ( String part : formatted.split("\n") ) { + System.err.println("##### ERROR " + part); + } + } + } + + + /** + * used to indicate an error occured + * + * @param msg the message + * @param t the error + */ + public static void exitSystemWithError(String msg, final Throwable t) { + errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("stack trace %n"); + t.printStackTrace(); + + errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("A GATK RUNTIME ERROR has occurred (version %s):%n", getVersionNumber()); + errorPrintf("%n"); + errorPrintf("This might be a bug. Please check the documentation guide to see if this is a known problem.%n"); + errorPrintf("If not, please post the error message, with stack trace, to the GATK forum.%n"); + printDocumentationReference(); + if ( msg == null ) // some exceptions don't have detailed messages + msg = "Code exception (see stack trace for error itself)"; + errorPrintf("%n"); + errorPrintf("MESSAGE: %s%n", msg.trim()); + errorPrintf("------------------------------------------------------------------------------------------%n"); + System.exit(1); + } + + public static void exitSystemWithUserError(final Exception e) { + if ( e.getMessage() == null ) + throw new ReviewedGATKException("UserException found with no message!", e); + + errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("A USER ERROR has occurred (version %s): %n", getVersionNumber()); + errorPrintf("%n"); + errorPrintf("This means that one or more arguments or inputs in your command are incorrect.%n"); + errorPrintf("The error message below tells you what is the problem.%n"); + errorPrintf("%n"); + errorPrintf("If the problem 
is an invalid argument, please check the online documentation guide%n"); + errorPrintf("(or rerun your command with --help) to view allowable command-line arguments for this tool.%n"); + errorPrintf("%n"); + printDocumentationReference(); + errorPrintf("%n"); + errorPrintf("Please do NOT post this error to the GATK forum unless you have really tried to fix it yourself.%n"); + errorPrintf("%n"); + errorPrintf("MESSAGE: %s%n", e.getMessage().trim()); + errorPrintf("------------------------------------------------------------------------------------------%n"); + System.exit(1); + } + + public static void exitSystemWithSamError(final Throwable t) { + if ( t.getMessage() == null ) + throw new ReviewedGATKException("SamException found with no message!", t); + + errorPrintf("------------------------------------------------------------------------------------------%n"); + errorPrintf("A BAM ERROR has occurred (version %s): %n", getVersionNumber()); + errorPrintf("%n"); + errorPrintf("This means that there is something wrong with the BAM file(s) you provided.%n"); + errorPrintf("The error message below tells you what is the problem.%n"); + errorPrintf("%n"); + printDocumentationReference(); + errorPrintf("%n"); + errorPrintf("Please do NOT post this error to the GATK forum until you have followed these instructions:%n"); + errorPrintf("- Make sure that your BAM file is well-formed by running Picard's validator on it%n"); + errorPrintf("(see http://picard.sourceforge.net/command-line-overview.shtml#ValidateSamFile for details)%n"); + errorPrintf("- Ensure that your BAM index is not corrupted: delete the current one and regenerate it with 'samtools index'%n"); + errorPrintf("%n"); + errorPrintf("MESSAGE: %s%n", t.getMessage().trim()); + errorPrintf("------------------------------------------------------------------------------------------%n"); + System.exit(1); + } + + + /** + * used to indicate an error occured + * + * @param t the exception that occurred + */ + public 
static void exitSystemWithError(Throwable t) { + exitSystemWithError(t.getMessage(), t); + } + + /** + * A hack to ensure that numbers are always formatted in the US style. + */ + protected static void forceJVMLocaleToUSEnglish() { + Locale.setDefault(Locale.US); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/CommandLineUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/EnumerationArgumentDefault.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/EnumerationArgumentDefault.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/EnumerationArgumentDefault.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/EnumerationArgumentDefault.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Gather.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Gather.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Gather.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Gather.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Gatherer.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Gatherer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Gatherer.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Gatherer.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Hidden.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Hidden.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Hidden.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Hidden.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Input.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Input.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Input.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Input.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/IntervalArgumentCollection.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/IntervalArgumentCollection.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/IntervalArgumentCollection.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/IntervalArgumentCollection.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/IntervalBinding.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/IntervalBinding.java new file mode 100644 index 000000000..815b02d6b --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/IntervalBinding.java @@ -0,0 +1,101 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the 
"Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.commandline; + +import com.google.java.contract.Requires; +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.Feature; +import htsjdk.tribble.FeatureCodec; +import htsjdk.tribble.FeatureReader; +import org.broadinstitute.gatk.utils.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.gatk.utils.refdata.tracks.FeatureManager; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.interval.IntervalUtils; + +import java.util.*; + +/** + * An IntervalBinding representing a walker argument that gets bound to either a ROD track or interval string. + * + * The IntervalBinding is a formal GATK argument that bridges between a walker and + * the engine to construct intervals for traversal at runtime. The IntervalBinding can + * either be a RodBinding, a string of one interval, or a file with interval strings. 
+ * The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it. + * + * Note that this class is immutable. + */ +public final class IntervalBinding { + + private RodBinding featureIntervals; + private String stringIntervals; + + @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) + public IntervalBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { + featureIntervals = new RodBinding<>(type, rawName, source, tribbleType, tags); + } + + @Requires({"intervalArgument != null"}) + public IntervalBinding(String intervalArgument) { + stringIntervals = intervalArgument; + } + + public String getSource() { + return ( featureIntervals != null ? featureIntervals.getSource() : stringIntervals ); + } + + public List getIntervals(final GenomeLocParser genomeLocParser) { + List intervals; + + if ( featureIntervals != null ) { + intervals = new ArrayList<>(); + + // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files + + final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); + if ( codec instanceof ReferenceDependentFeatureCodec ) + ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(genomeLocParser); + try { + FeatureReader reader = AbstractFeatureReader.getFeatureReader(featureIntervals.getSource(), codec, false); + for ( Feature feature : reader.iterator() ) + intervals.add(genomeLocParser.createGenomeLoc(feature)); + } catch (Exception e) { + throw new UserException.MalformedFile(featureIntervals.getSource(), "Problem reading the interval file", e); + } + + } else { + intervals = IntervalUtils.parseIntervalArguments(genomeLocParser, stringIntervals); + } + + Collections.sort(intervals); + return intervals; + } + + public String toString() { + return getSource(); + } +} diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/MissingArgumentValueException.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/MissingArgumentValueException.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/MissingArgumentValueException.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/MissingArgumentValueException.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Multiplex.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Multiplex.java new file mode 100644 index 000000000..d00206b90 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Multiplex.java @@ -0,0 +1,44 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.commandline; + +import java.lang.annotation.*; + +/** + * Indicates that the class should be multiplexed according to the rules + * specified in the multiplexer. + * + * @author mhanna + * @version 0.1 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.FIELD}) +public @interface Multiplex { + public Class value(); + public String[] arguments() default {}; +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Multiplexer.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Multiplexer.java new file mode 100644 index 000000000..3a6fb2f71 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Multiplexer.java @@ -0,0 +1,52 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.commandline; + +import java.util.Collection; + +/** + * An interface for multiplexing output streams. + * + * @author mhanna + * @version 0.1 + */ +public interface Multiplexer { + /** + * Generate a list of the potential outputs that can be created as a function of the other + * command-line arguments in this class. + * @return A collection of unique identifiers for the file multiplex. + */ + public Collection multiplex(); + + /** + * Transform the given command-line argument into a suitable form specific to this filename. + * @param multiplexedEntry Identifies the individual component of the multiplex. Will be a value in the collection + * passed back by multiplex(). + * @param argument The actual command-line argument, supplied for transformation. + * @return A transformed representation of the command-line argument. + */ + public String transformArgument(final T multiplexedEntry, final String argument); +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Output.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Output.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Output.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Output.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsedArgs.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsedArgs.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsedArgs.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsedArgs.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsedListArgs.java 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsedListArgs.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsedListArgs.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsedListArgs.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngine.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngine.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngine.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngine.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineArgumentFiles.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineArgumentFiles.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineArgumentFiles.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineArgumentFiles.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineArgumentProvider.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineArgumentProvider.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineArgumentProvider.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineArgumentProvider.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingMethod.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingMethod.java 
similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingMethod.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/ParsingMethod.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/RodBinding.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/RodBinding.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/RodBinding.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/RodBinding.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/RodBindingCollection.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/RodBindingCollection.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/RodBindingCollection.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/RodBindingCollection.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Tags.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Tags.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/Tags.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/Tags.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/package-info.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/commandline/package-info.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/commandline/package-info.java diff 
--git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/contexts/AlignmentContext.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/contexts/AlignmentContext.java new file mode 100644 index 000000000..bbbb61778 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/contexts/AlignmentContext.java @@ -0,0 +1,154 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.contexts; + +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.HasGenomeLocation; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.List; + +/** + * Useful class for forwarding on locusContext data from this iterator + * + * Created by IntelliJ IDEA. 
+ * User: mdepristo + * Date: Feb 22, 2009 + * Time: 3:01:34 PM + * To change this template use File | Settings | File Templates. + */ +public class AlignmentContext implements HasGenomeLocation { + protected GenomeLoc loc = null; + protected ReadBackedPileup basePileup = null; + protected boolean hasPileupBeenDownsampled; + + /** + * The number of bases we've skipped over in the reference since the last map invocation. + * Only filled in by RodTraversals right now. By default, nothing is being skipped, so skippedBases == 0. + */ + private long skippedBases = 0; + + public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup) { + this(loc, basePileup, 0, false); + } + + public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup, boolean hasPileupBeenDownsampled) { + this(loc, basePileup, 0, hasPileupBeenDownsampled); + } + + public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup, long skippedBases) { + this(loc, basePileup, skippedBases, false); + } + + public AlignmentContext(GenomeLoc loc, ReadBackedPileup basePileup, long skippedBases,boolean hasPileupBeenDownsampled ) { + if ( loc == null ) throw new ReviewedGATKException("BUG: GenomeLoc in Alignment context is null"); + if ( basePileup == null ) throw new ReviewedGATKException("BUG: ReadBackedPileup in Alignment context is null"); + if ( skippedBases < 0 ) throw new ReviewedGATKException("BUG: skippedBases is -1 in Alignment context"); + + this.loc = loc; + this.basePileup = basePileup; + this.skippedBases = skippedBases; + this.hasPileupBeenDownsampled = hasPileupBeenDownsampled; + } + + /** Returns base pileup over the current genomic location. Deprectated. Use getBasePileup() to make your intentions + * clear. + * @return + */ + @Deprecated + public ReadBackedPileup getPileup() { return basePileup; } + + /** Returns base pileup over the current genomic location. May return null if this context keeps only + * extended event (indel) pileup. 
+ * @return + */ + public ReadBackedPileup getBasePileup() { + return basePileup; + } + + /** + * Returns true if any reads have been filtered out of the pileup due to excess DoC. + * @return True if reads have been filtered out. False otherwise. + */ + public boolean hasPileupBeenDownsampled() { return hasPileupBeenDownsampled; } + + /** + * get all of the reads within this context + * + * @return + */ + @Deprecated + //todo: unsafe and tailored for current usage only; both pileups can be null or worse, bot can be not null in theory + public List getReads() { return ( basePileup.getReads() ); } + + /** + * Are there any reads associated with this locus? + * + * @return + */ + public boolean hasReads() { + return basePileup != null && basePileup.getNumberOfElements() > 0 ; + } + + /** + * How many reads cover this locus? + * @return + */ + public int size() { + return basePileup.getNumberOfElements(); + } + + /** + * get a list of the equivalent positions within in the reads at Pos + * + * @return + */ + @Deprecated + public List getOffsets() { + return basePileup.getOffsets(); + } + + public String getContig() { return getLocation().getContig(); } + public long getPosition() { return getLocation().getStart(); } + public GenomeLoc getLocation() { return loc; } + + public void downsampleToCoverage(int coverage) { + basePileup = basePileup.getDownsampledPileup(coverage); + hasPileupBeenDownsampled = true; + } + + /** + * Returns the number of bases we've skipped over in the reference since the last map invocation. + * Only filled in by RodTraversals right now. A value of 0 indicates that no bases were skipped. 
+ * + * @return the number of skipped bases + */ + public long getSkippedBases() { + return skippedBases; + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/contexts/AlignmentContextUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/contexts/AlignmentContextUtils.java new file mode 100644 index 000000000..82e1b0eb0 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/contexts/AlignmentContextUtils.java @@ -0,0 +1,150 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.contexts; + +import htsjdk.samtools.SAMReadGroupRecord; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.pileup.*; + +import java.util.*; + +/** + * Useful utilities for storing different AlignmentContexts + * User: ebanks + */ +public class AlignmentContextUtils { + + // Definitions: + // COMPLETE = full alignment context + // FORWARD = reads on forward strand + // REVERSE = reads on forward strand + // + public enum ReadOrientation { COMPLETE, FORWARD, REVERSE } + + private AlignmentContextUtils() { + // cannot be instantiated + } + + /** + * Returns a potentially derived subcontext containing only forward, reverse, or in fact all reads + * in alignment context context. + * + * @param context + * @param type + * @return + */ + public static AlignmentContext stratify(AlignmentContext context, ReadOrientation type) { + switch(type) { + case COMPLETE: + return context; + case FORWARD: + return new AlignmentContext(context.getLocation(),context.getPileup().getPositiveStrandPileup()); + case REVERSE: + return new AlignmentContext(context.getLocation(),context.getPileup().getNegativeStrandPileup()); + default: + throw new ReviewedGATKException("Unable to get alignment context for type = " + type); + } + } + + public static Map splitContextBySampleName(AlignmentContext context) { + return splitContextBySampleName(context, null); + } + + /** + * Splits the given AlignmentContext into a StratifiedAlignmentContext per sample, but referencd by sample name instead + * of sample object. 
+ * + * @param context the original pileup + * + * @return a Map of sample name to StratifiedAlignmentContext + * + **/ + public static Map splitContextBySampleName(AlignmentContext context, String assumedSingleSample) { + GenomeLoc loc = context.getLocation(); + HashMap contexts = new HashMap(); + + for(String sample: context.getPileup().getSamples()) { + ReadBackedPileup pileupBySample = context.getPileup().getPileupForSample(sample); + + // Don't add empty pileups to the split context. + if(pileupBySample.getNumberOfElements() == 0) + continue; + + if(sample != null) + contexts.put(sample, new AlignmentContext(loc, pileupBySample)); + else { + if(assumedSingleSample == null) { + throw new UserException.ReadMissingReadGroup(pileupBySample.iterator().next().getRead()); + } + contexts.put(assumedSingleSample,new AlignmentContext(loc, pileupBySample)); + } + } + + return contexts; + } + + /** + * Splits the AlignmentContext into one context per read group + * + * @param context the original pileup + * @return a Map of ReadGroup to AlignmentContext, or an empty map if context has no base pileup + * + **/ + public static Map splitContextByReadGroup(AlignmentContext context, Collection readGroups) { + HashMap contexts = new HashMap(); + + for (SAMReadGroupRecord rg : readGroups) { + ReadBackedPileup rgPileup = context.getBasePileup().getPileupForReadGroup(rg.getReadGroupId()); + if ( rgPileup != null ) // there we some reads for RG + contexts.put(rg, new AlignmentContext(context.getLocation(), rgPileup)); + } + + return contexts; + } + + public static Map splitContextBySampleName(ReadBackedPileup pileup) { + return splitContextBySampleName(new AlignmentContext(pileup.getLocation(), pileup)); + } + + + public static AlignmentContext joinContexts(Collection contexts) { + // validation + GenomeLoc loc = contexts.iterator().next().getLocation(); + for(AlignmentContext context: contexts) { + if(!loc.equals(context.getLocation())) + throw new ReviewedGATKException("Illegal 
attempt to join contexts from different genomic locations"); + } + + List pe = new ArrayList(); + for(AlignmentContext context: contexts) { + for(PileupElement pileupElement: context.basePileup) + pe.add(pileupElement); + } + return new AlignmentContext(loc, new ReadBackedPileupImpl(loc,pe)); + } +} \ No newline at end of file diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/contexts/ReferenceContext.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/contexts/ReferenceContext.java new file mode 100644 index 000000000..ae70402de --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/contexts/ReferenceContext.java @@ -0,0 +1,217 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.contexts; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; + +/** + * The section of the reference that overlaps with the given + * read / locus. + * + * @author hanna + * @version 0.1 + */ +public class ReferenceContext { + /** + * Facilitates creation of new GenomeLocs. + */ + final private GenomeLocParser genomeLocParser; + + /** + * The locus. + */ + final private GenomeLoc locus; + + /** + * The window of reference information around the current locus. + */ + final private GenomeLoc window; + + /** + * The bases in the window around the current locus. If null, then bases haven't been fetched yet. + * Bases are always upper cased + */ + private byte[] basesCache = null; + + /** + * Lazy loader to fetch reference bases + */ + final private ReferenceContextRefProvider basesProvider; + + /** + * Interface to create byte[] contexts for lazy loading of the reference + */ + public static interface ReferenceContextRefProvider { + /** + * You must provide a routine that gets the byte[] bases that would have been passed into the + * ReferenceContext. The RC will handling caching. The value of this interface and routine is + * that it is only called when the bytes are actually requested by the walker, not up front. So + * if the walker doesn't need the refBases for whatever reason, there's no overhead to + * provide them. 
+ * + * @return + */ + @Ensures({"result != null"}) + public byte[] getBases(); + } + + private static class ForwardingProvider implements ReferenceContextRefProvider { + byte[] bases; + + public ForwardingProvider( byte base ) { + this(new byte[] { base }); + } + + public ForwardingProvider( byte[] bases ) { + this.bases = bases; + } + + public byte[] getBases() { return bases; } + } + + /** + * Contructor for a simple, windowless reference context. + * @param locus locus of interest. + * @param base reference base at that locus. + */ + @Requires({ + "genomeLocParser != null", + "locus != null", + "locus.size() > 0"}) + public ReferenceContext( GenomeLocParser genomeLocParser, GenomeLoc locus, byte base ) { + this( genomeLocParser, locus, locus, new ForwardingProvider(base) ); + } + + @Requires({ + "genomeLocParser != null", + "locus != null", + "locus.size() > 0", + "window != null", + "window.size() > 0", + "bases != null && bases.length > 0"}) + public ReferenceContext( GenomeLocParser genomeLocParser, GenomeLoc locus, GenomeLoc window, byte[] bases ) { + this( genomeLocParser, locus, window, new ForwardingProvider(bases) ); + } + + @Requires({ + "genomeLocParser != null", + "locus != null", + "locus.size() > 0", + "window != null", + "window.size() > 0", + "basesProvider != null"}) + public ReferenceContext( GenomeLocParser genomeLocParser, GenomeLoc locus, GenomeLoc window, ReferenceContextRefProvider basesProvider ) { + this.genomeLocParser = genomeLocParser; + this.locus = locus; + this.window = window; + this.basesProvider = basesProvider; + } + + /** + * Utility function to load bases from the provider to the cache, if necessary + */ + @Ensures({ + "basesCache != null", + "old(basesCache) == null || old(basesCache) == basesCache"}) + private void fetchBasesFromProvider() { + if ( basesCache == null ) { + basesCache = basesProvider.getBases(); + + // must be an assertion that only runs when the bases are fetch to run in a reasonable amount of time + assert 
BaseUtils.isUpperCase(basesCache); + } + } + + /** + * @return The genome loc parser associated with this reference context + */ + @Ensures("result != null") + public GenomeLocParser getGenomeLocParser() { + return genomeLocParser; + } + + /** + * The locus currently being examined. + * @return The current locus. + */ + @Ensures("result != null") + public GenomeLoc getLocus() { + return locus; + } + + @Ensures("result != null") + public GenomeLoc getWindow() { + return window; + } + + /** + * Get the base at the given locus. + * @return The base at the given locus from the reference. + */ + public byte getBase() { + return getBases()[(locus.getStart() - window.getStart())]; + } + + /** + * All the bases in the window currently being examined. + * @return All bases available. If the window is of size [0,0], the array will + * contain only the base at the given locus. + */ + @Ensures({"result != null", "result.length > 0"}) + public byte[] getBases() { + fetchBasesFromProvider(); + return basesCache; + } + + /** + * All the bases in the window from the current base forward to the end of the window. + */ + @Ensures({"result != null", "result.length > 0"}) + public byte[] getForwardBases() { + final byte[] bases = getBases(); + final int mid = locus.getStart() - window.getStart(); + // todo -- warning of performance problem, especially if this is called over and over + return new String(bases).substring(mid).getBytes(); + } + + @Deprecated + public char getBaseAsChar() { + return (char)getBase(); + } + + /** + * Get the base at the given locus. + * @return The base at the given locus from the reference. 
+ */ + @Deprecated() + public int getBaseIndex() { + return BaseUtils.simpleBaseToBaseIndex(getBase()); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/BAMDiffableReader.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/BAMDiffableReader.java new file mode 100644 index 000000000..29a08cc9e --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/BAMDiffableReader.java @@ -0,0 +1,119 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + +import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordIterator; +import htsjdk.samtools.ValidationStringency; +import htsjdk.samtools.util.BlockCompressedInputStream; + +import java.io.*; +import java.util.Arrays; + + +/** + * Created by IntelliJ IDEA. 
+ * User: depristo + * Date: 7/4/11 + * Time: 1:09 PM + * + * Class implementing diffnode reader for VCF + */ +public class BAMDiffableReader implements DiffableReader { + @Override + public String getName() { return "BAM"; } + + @Override + public DiffElement readFromFile(File file, int maxElementsToRead) { + final SAMFileReader reader = new SAMFileReader(file, null); // null because we don't want it to look for the index + reader.setValidationStringency(ValidationStringency.SILENT); + + DiffNode root = DiffNode.rooted(file.getName()); + SAMRecordIterator iterator = reader.iterator(); + + int count = 0; + while ( iterator.hasNext() ) { + final SAMRecord record = iterator.next(); + + // name is the read name + first of pair + String name = record.getReadName().replace('.', '_'); + if ( record.getReadPairedFlag() ) { + name += record.getFirstOfPairFlag() ? "_1" : "_2"; + } + + DiffNode readRoot = DiffNode.empty(name, root); + + // add fields + readRoot.add("NAME", record.getReadName()); + readRoot.add("FLAGS", record.getFlags()); + readRoot.add("RNAME", record.getReferenceName()); + readRoot.add("POS", record.getAlignmentStart()); + readRoot.add("MAPQ", record.getMappingQuality()); + readRoot.add("CIGAR", record.getCigarString()); + readRoot.add("RNEXT", record.getMateReferenceName()); + readRoot.add("PNEXT", record.getMateAlignmentStart()); + readRoot.add("TLEN", record.getInferredInsertSize()); + readRoot.add("SEQ", record.getReadString()); + readRoot.add("QUAL", record.getBaseQualityString()); + + for ( SAMRecord.SAMTagAndValue xt : record.getAttributes() ) { + readRoot.add(xt.tag, xt.value); + } + + // add record to root + if ( ! 
root.hasElement(name) ) + // protect ourselves from malformed files + root.add(readRoot); + count += readRoot.size(); + if ( count > maxElementsToRead && maxElementsToRead != -1) + break; + } + + reader.close(); + + return root.getBinding(); + } + + @Override + public boolean canRead(File file) { + final byte[] BAM_MAGIC = "BAM\1".getBytes(); + final byte[] buffer = new byte[BAM_MAGIC.length]; + try { + InputStream fstream = new BufferedInputStream(new FileInputStream(file)); + if ( !BlockCompressedInputStream.isValidFile(fstream) ) + return false; + final BlockCompressedInputStream BCIS = new BlockCompressedInputStream(fstream); + BCIS.read(buffer, 0, BAM_MAGIC.length); + BCIS.close(); + return Arrays.equals(buffer, BAM_MAGIC); + } catch ( IOException e ) { + return false; + } catch ( htsjdk.samtools.FileTruncatedException e ) { + return false; + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffElement.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffElement.java new file mode 100644 index 000000000..0e0b79741 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffElement.java @@ -0,0 +1,125 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 12:55 PM + * + * An interface that must be implemented to allow us to calculate differences + * between structured objects + */ +@Invariant({ + "name != null", + "value != null", + "parent != null || name.equals(\"ROOT\")", + "value == null || value.getBinding() == this"}) +public class DiffElement { + public final static DiffElement ROOT = new DiffElement(); + + final private String name; + final private DiffElement parent; + final private DiffValue value; + + /** + * For ROOT only + */ + private DiffElement() { + this.name = "ROOT"; + this.parent = null; + this.value = new DiffValue(this, "ROOT"); + } + + @Requires({"name != null", "parent != null", "value != null"}) + public DiffElement(String name, DiffElement parent, DiffValue value) { + if ( name.equals("ROOT") ) throw new IllegalArgumentException("Cannot use reserved name ROOT"); + this.name = name; + this.parent = parent; + this.value = value; + this.value.setBinding(this); + } + + @Ensures({"result != null"}) + public String getName() { + return name; + } + + public DiffElement getParent() { + return parent; + } + + @Ensures({"result != null"}) + 
public DiffValue getValue() { + return value; + } + + public boolean isRoot() { return this == ROOT; } + + @Ensures({"result != null"}) + @Override + public String toString() { + return getName() + "=" + getValue().toString(); + } + + public String toString(int offset) { + return (offset > 0 ? Utils.dupString(' ', offset) : 0) + getName() + "=" + getValue().toString(offset); + } + + @Ensures({"result != null"}) + public final String fullyQualifiedName() { + if ( isRoot() ) + return ""; + else if ( parent.isRoot() ) + return name; + else + return parent.fullyQualifiedName() + "." + name; + } + + @Ensures({"result != null"}) + public String toOneLineString() { + return getName() + "=" + getValue().toOneLineString(); + } + + @Ensures({"result != null"}) + public DiffNode getValueAsNode() { + if ( getValue().isCompound() ) + return (DiffNode)getValue(); + else + throw new ReviewedGATKException("Illegal request conversion of a DiffValue into a DiffNode: " + this); + } + + public int size() { + return 1 + getValue().size(); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffEngine.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffEngine.java new file mode 100644 index 000000000..4960e6bfa --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffEngine.java @@ -0,0 +1,437 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in 
all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.classloader.PluginManager; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.File; +import java.io.PrintStream; +import java.util.*; + +/** + * Created by IntelliJ IDEA. 
+ * User: depristo + * Date: 7/4/11 + * Time: 12:51 PM + * A generic engine for comparing tree-structured objects + * + */ +public class DiffEngine { + final protected static Logger logger = Logger.getLogger(DiffEngine.class); + + private final Map readers = new HashMap(); + + public DiffEngine() { + loadDiffableReaders(); + } + + // -------------------------------------------------------------------------------- + // + // difference calculation + // + // -------------------------------------------------------------------------------- + + public List diff(DiffElement master, DiffElement test) { + DiffValue masterValue = master.getValue(); + DiffValue testValue = test.getValue(); + + if ( masterValue.isCompound() && masterValue.isCompound() ) { + return diff(master.getValueAsNode(), test.getValueAsNode()); + } else if ( masterValue.isAtomic() && testValue.isAtomic() ) { + return diff(masterValue, testValue); + } else { + // structural difference in types. one is node, other is leaf + return Arrays.asList(new Difference(master, test)); + } + } + + public List diff(DiffNode master, DiffNode test) { + Set allNames = new HashSet(master.getElementNames()); + allNames.addAll(test.getElementNames()); + List diffs = new ArrayList(); + + for ( String name : allNames ) { + DiffElement masterElt = master.getElement(name); + DiffElement testElt = test.getElement(name); + if ( masterElt == null && testElt == null ) { + throw new ReviewedGATKException("BUG: unexpectedly got two null elements for field: " + name); + } else if ( masterElt == null || testElt == null ) { // if either is null, we are missing a value + // todo -- should one of these be a special MISSING item? 
+ diffs.add(new Difference(masterElt, testElt)); + } else { + diffs.addAll(diff(masterElt, testElt)); + } + } + + return diffs; + } + + public List diff(DiffValue master, DiffValue test) { + if ( master.getValue().equals(test.getValue()) ) { + return Collections.emptyList(); + } else { + return Arrays.asList(new Difference(master.getBinding(), test.getBinding())); + } + } + + // -------------------------------------------------------------------------------- + // + // Summarizing differences + // + // -------------------------------------------------------------------------------- + + /** + * Emits a summary of the diffs to out. Suppose you have the following three differences: + * + * A.X.Z:1!=2 + * A.Y.Z:3!=4 + * B.X.Z:5!=6 + * + * The above is the itemized list of the differences. The summary looks for common differences + * in the name hierarchy, counts those shared elements, and emits the differences that occur + * in order of decreasing counts. + * + * So, in the above example, what are the shared elements? + * + * A.X.Z and B.X.Z share X.Z, so there's a *.X.Z with count 2 + * A.X.Z, A.Y.Z, and B.X.Z all share *.*.Z, with count 3 + * Each of A.X.Z, A.Y.Z, and B.X.Z are individually unique, with count 1 + * + * So we would emit the following summary: + * + * *.*.Z: 3 + * *.X.Z: 2 + * A.X.Z: 1 [specific difference: 1!=2] + * A.Y.Z: 1 [specific difference: 3!=4] + * B.X.Z: 1 [specific difference: 5!=6] + * + * The algorithm to accomplish this calculation is relatively simple. Start with all of the + * concrete differences. 
For each pair of differences A1.A2....AN and B1.B2....BN: + * + * find the longest common subsequence Si.Si+1...SN where Ai = Bi = Si + * If i == 0, then there's no shared substructure + * If i > 0, then generate the summarized value X = *.*...Si.Si+1...SN + * if X is a known summary, increment it's count, otherwise set its count to 1 + * + * Not that only pairs of the same length are considered as potentially equivalent + * + * @param params determines how we display the items + * @param diffs the list of differences to summarize + */ + public void reportSummarizedDifferences(List diffs, SummaryReportParams params ) { + printSummaryReport(summarizedDifferencesOfPaths(diffs, params.doPairwise, params.maxRawDiffsToSummarize), params ); + } + + final protected static String[] diffNameToPath(String diffName) { + return diffName.split("\\."); + } + + protected List summarizedDifferencesOfPathsFromString(List singletonDiffs) { + List diffs = new ArrayList(); + + for ( String diff : singletonDiffs ) { + diffs.add(new Difference(diff)); + } + + return summarizedDifferencesOfPaths(diffs, true, -1); + } + + /** + * Computes a minimum set of potential differences between all singleton differences + * in singletonDiffs. Employs an expensive pairwise O(n^2) algorithm. 
+ * + * @param singletonDiffs + * @param maxRawDiffsToSummarize + * @return + */ + private Map initialPairwiseSummaries(final List singletonDiffs, + final int maxRawDiffsToSummarize) { + Map summaries = new HashMap(); + + // create the initial set of differences + for ( int i = 0; i < singletonDiffs.size(); i++ ) { + for ( int j = 0; j <= i; j++ ) { + Difference diffPath1 = singletonDiffs.get(i); + Difference diffPath2 = singletonDiffs.get(j); + if ( diffPath1.length() == diffPath2.length() ) { + int lcp = longestCommonPostfix(diffPath1.getParts(), diffPath2.getParts()); + String path = diffPath2.getPath(); + if ( lcp != 0 && lcp != diffPath1.length() ) + path = summarizedPath(diffPath2.getParts(), lcp); + Difference sumDiff = new Difference(path, diffPath2.getMaster(), diffPath2.getTest()); + sumDiff.setCount(0); + addSummaryIfMissing(summaries, sumDiff); + + if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize) + return summaries; + } + } + } + + return summaries; + } + + /** + * Computes the possible leaf differences among the singleton diffs. 
+ * + * The leaf differences are all of the form *.*...*.X where all internal + * differences are wildcards and the only summarized difference considered + * interesting to compute is + * + * @param singletonDiffs + * @param maxRawDiffsToSummarize + * @return + */ + private Map initialLeafSummaries(final List singletonDiffs, + final int maxRawDiffsToSummarize) { + Map summaries = new HashMap(); + + // create the initial set of differences + for ( final Difference d : singletonDiffs ) { + final String path = summarizedPath(d.getParts(), 1); + Difference sumDiff = new Difference(path, d.getMaster(), d.getTest()); + sumDiff.setCount(0); + addSummaryIfMissing(summaries, sumDiff); + + if ( maxRawDiffsToSummarize != -1 && summaries.size() > maxRawDiffsToSummarize) + return summaries; + } + + return summaries; + } + + protected List summarizedDifferencesOfPaths(final List singletonDiffs, + final boolean doPairwise, + final int maxRawDiffsToSummarize) { + final Map summaries = doPairwise + ? initialPairwiseSummaries(singletonDiffs, maxRawDiffsToSummarize) + : initialLeafSummaries(singletonDiffs, maxRawDiffsToSummarize); + + // count differences + for ( Difference diffPath : singletonDiffs ) { + for ( Difference sumDiff : summaries.values() ) { + if ( sumDiff.matches(diffPath.getParts()) ) + sumDiff.incCount(); + } + } + + List sortedSummaries = new ArrayList(summaries.values()); + Collections.sort(sortedSummaries); + return sortedSummaries; + } + + protected void addSummaryIfMissing(Map summaries, Difference diff) { + if ( ! 
summaries.containsKey(diff.getPath()) ) { + summaries.put(diff.getPath(), diff); + } + } + + protected void printSummaryReport(List sortedSummaries, SummaryReportParams params ) { + List toShow = new ArrayList(); + int count = 0, count1 = 0; + for ( Difference diff : sortedSummaries ) { + if ( diff.getCount() < params.minSumDiffToShow ) + // in order, so break as soon as the count is too low + break; + + if ( params.maxItemsToDisplay != 0 && count++ > params.maxItemsToDisplay ) + break; + + if ( diff.getCount() == 1 ) { + count1++; + if ( params.maxCountOneItems != 0 && count1 > params.maxCountOneItems ) + break; + } + + toShow.add(diff); + } + + // if we want it in descending order, reverse the list + if ( ! params.descending ) { + Collections.reverse(toShow); + } + + // now that we have a specific list of values we want to show, display them + GATKReport report = new GATKReport(); + final String tableName = "differences"; + report.addTable(tableName, "Summarized differences between the master and test files. See http://www.broadinstitute.org/gatk/guide/article?id=1299 for more information", 3); + final GATKReportTable table = report.getTable(tableName); + table.addColumn("Difference"); + table.addColumn("NumberOfOccurrences"); + table.addColumn("ExampleDifference"); + for ( final Difference diff : toShow ) { + final String key = diff.getPath(); + table.addRowID(key, true); + table.set(key, "NumberOfOccurrences", diff.getCount()); + table.set(key, "ExampleDifference", diff.valueDiffString()); + } + GATKReport output = new GATKReport(table); + output.print(params.out); + } + + protected static int longestCommonPostfix(String[] diffPath1, String[] diffPath2) { + int i = 0; + for ( ; i < diffPath1.length; i++ ) { + int j = diffPath1.length - i - 1; + if ( ! 
diffPath1[j].equals(diffPath2[j]) ) + break; + } + return i; + } + + /** + * parts is [A B C D] + * commonPostfixLength: how many parts are shared at the end, suppose its 2 + * We want to create a string *.*.C.D + * + * @param parts the separated path values [above without .] + * @param commonPostfixLength + * @return + */ + protected static String summarizedPath(String[] parts, int commonPostfixLength) { + int stop = parts.length - commonPostfixLength; + if ( stop > 0 ) parts = parts.clone(); + for ( int i = 0; i < stop; i++ ) { + parts[i] = "*"; + } + return Utils.join(".", parts); + } + + // -------------------------------------------------------------------------------- + // + // plugin manager + // + // -------------------------------------------------------------------------------- + + public void loadDiffableReaders() { + List> drClasses = new PluginManager( DiffableReader.class ).getPlugins(); + + logger.info("Loading diffable modules:"); + for (Class drClass : drClasses ) { + logger.info("\t" + drClass.getSimpleName()); + + try { + DiffableReader dr = drClass.newInstance(); + readers.put(dr.getName(), dr); + } catch (InstantiationException e) { + throw new ReviewedGATKException("Unable to instantiate module '" + drClass.getSimpleName() + "'"); + } catch (IllegalAccessException e) { + throw new ReviewedGATKException("Illegal access error when trying to instantiate '" + drClass.getSimpleName() + "'"); + } + } + } + + protected Map getReaders() { + return readers; + } + + protected DiffableReader getReader(String name) { + return readers.get(name); + } + + /** + * Returns a reader appropriate for this file, or null if no such reader exists + * @param file + * @return + */ + public DiffableReader findReaderForFile(File file) { + for ( DiffableReader reader : readers.values() ) + if (reader.canRead(file) ) + return reader; + + return null; + } + + /** + * Returns true if reader appropriate for this file, or false if no such reader exists + * @param file + * 
@return + */ + public boolean canRead(File file) { + return findReaderForFile(file) != null; + } + + + public DiffElement createDiffableFromFile(File file) { + return createDiffableFromFile(file, -1); + } + + public DiffElement createDiffableFromFile(File file, int maxElementsToRead) { + DiffableReader reader = findReaderForFile(file); + if ( reader == null ) + throw new UserException("Unsupported file type: " + file); + else + return reader.readFromFile(file, maxElementsToRead); + } + + public static boolean simpleDiffFiles(File masterFile, File testFile, int maxElementsToRead, DiffEngine.SummaryReportParams params) { + DiffEngine diffEngine = new DiffEngine(); + + if ( diffEngine.canRead(masterFile) && diffEngine.canRead(testFile) ) { + DiffElement master = diffEngine.createDiffableFromFile(masterFile, maxElementsToRead); + DiffElement test = diffEngine.createDiffableFromFile(testFile, maxElementsToRead); + List diffs = diffEngine.diff(master, test); + diffEngine.reportSummarizedDifferences(diffs, params); + return true; + } else { + return false; + } + } + + public static class SummaryReportParams { + final PrintStream out; + final int maxItemsToDisplay; + final int maxCountOneItems; + final int minSumDiffToShow; + final int maxRawDiffsToSummarize; + final boolean doPairwise; + boolean descending = true; + + public SummaryReportParams(PrintStream out, + int maxItemsToDisplay, + int maxCountOneItems, + int minSumDiffToShow, + int maxRawDiffsToSummarize, + final boolean doPairwise) { + this.out = out; + this.maxItemsToDisplay = maxItemsToDisplay; + this.maxCountOneItems = maxCountOneItems; + this.minSumDiffToShow = minSumDiffToShow; + this.maxRawDiffsToSummarize = maxRawDiffsToSummarize; + this.doPairwise = doPairwise; + } + + public void setDescending(boolean descending) { + this.descending = descending; + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffNode.java 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffNode.java new file mode 100644 index 000000000..651af07ba --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffNode.java @@ -0,0 +1,249 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + +import com.google.java.contract.Requires; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. 
+ * User: depristo + * Date: 7/4/11 + * Time: 12:55 PM + * + * An interface that must be implemented to allow us to calculate differences + * between structured objects + */ +public class DiffNode extends DiffValue { + private Map getElementMap() { + return (Map)super.getValue(); + } + private static Map emptyElements() { return new HashMap(); } + + private DiffNode(Map elements) { + super(elements); + } + + private DiffNode(DiffElement binding, Map elements) { + super(binding, elements); + } + + // --------------------------------------------------------------------------- + // + // constructors + // + // --------------------------------------------------------------------------- + + public static DiffNode rooted(String name) { + return empty(name, DiffElement.ROOT); + } + + public static DiffNode empty(String name, DiffElement parent) { + DiffNode df = new DiffNode(emptyElements()); + DiffElement elt = new DiffElement(name, parent, df); + df.setBinding(elt); + return df; + } + + public static DiffNode empty(String name, DiffValue parent) { + return empty(name, parent.getBinding()); + } + + // --------------------------------------------------------------------------- + // + // accessors + // + // --------------------------------------------------------------------------- + + @Override + public boolean isAtomic() { return false; } + + public Collection getElementNames() { + return getElementMap().keySet(); + } + + public Collection getElements() { + return getElementMap().values(); + } + + private Collection getElements(boolean atomicOnly) { + List elts = new ArrayList(); + for ( DiffElement elt : getElements() ) + if ( (atomicOnly && elt.getValue().isAtomic()) || (! 
atomicOnly && elt.getValue().isCompound())) + elts.add(elt); + return elts; + } + + public Collection getAtomicElements() { + return getElements(true); + } + + public Collection getCompoundElements() { + return getElements(false); + } + + /** + * Returns the element bound to name, or null if no such binding exists + * @param name + * @return + */ + public DiffElement getElement(String name) { + return getElementMap().get(name); + } + + /** + * Returns true if name is bound in this node + * @param name + * @return + */ + public boolean hasElement(String name) { + return getElement(name) != null; + } + + // --------------------------------------------------------------------------- + // + // add + // + // --------------------------------------------------------------------------- + + @Requires("elt != null") + public void add(DiffElement elt) { + if ( getElementMap().containsKey(elt.getName()) ) + throw new IllegalArgumentException("Attempting to rebind already existing binding: " + elt + " node=" + this); + getElementMap().put(elt.getName(), elt); + } + + @Requires("elt != null") + public void add(DiffValue elt) { + add(elt.getBinding()); + } + + @Requires("elts != null") + public void add(Collection elts) { + for ( DiffElement e : elts ) + add(e); + } + + public void add(String name, Object value) { + add(new DiffElement(name, this.getBinding(), new DiffValue(value))); + } + + public int size() { + int count = 0; + for ( DiffElement value : getElements() ) + count += value.size(); + return count; + } + + // --------------------------------------------------------------------------- + // + // toString + // + // --------------------------------------------------------------------------- + + @Override + public String toString() { + return toString(0); + } + + @Override + public String toString(int offset) { + String off = offset > 0 ? 
Utils.dupString(' ', offset) : ""; + StringBuilder b = new StringBuilder(); + + b.append("(").append("\n"); + Collection atomicElts = getAtomicElements(); + for ( DiffElement elt : atomicElts ) { + b.append(elt.toString(offset + 2)).append('\n'); + } + + for ( DiffElement elt : getCompoundElements() ) { + b.append(elt.toString(offset + 4)).append('\n'); + } + b.append(off).append(")").append("\n"); + + return b.toString(); + } + + @Override + public String toOneLineString() { + StringBuilder b = new StringBuilder(); + + b.append('('); + List parts = new ArrayList(); + for ( DiffElement elt : getElements() ) + parts.add(elt.toOneLineString()); + b.append(Utils.join(" ", parts)); + b.append(')'); + + return b.toString(); + } + + // -------------------------------------------------------------------------------- + // + // fromString and toOneLineString + // + // -------------------------------------------------------------------------------- + + public static DiffElement fromString(String tree) { + return fromString(tree, DiffElement.ROOT); + } + + /** + * Doesn't support full tree structure parsing + * @param tree + * @param parent + * @return + */ + private static DiffElement fromString(String tree, DiffElement parent) { + // X=(A=A B=B C=(D=D)) + String[] parts = tree.split("=", 2); + if ( parts.length != 2 ) + throw new ReviewedGATKException("Unexpected tree structure: " + tree); + String name = parts[0]; + String value = parts[1]; + + if ( value.length() == 0 ) + throw new ReviewedGATKException("Illegal tree structure: " + value + " at " + tree); + + if ( value.charAt(0) == '(' ) { + if ( ! value.endsWith(")") ) + throw new ReviewedGATKException("Illegal tree structure. 
Missing ): " + value + " at " + tree); + String subtree = value.substring(1, value.length()-1); + DiffNode rec = DiffNode.empty(name, parent); + String[] subParts = subtree.split(" "); + for ( String subPart : subParts ) { + rec.add(fromString(subPart, rec.getBinding())); + } + return rec.getBinding(); + } else { + return new DiffValue(name, parent, value).getBinding(); + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffValue.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffValue.java new file mode 100644 index 000000000..c84842dab --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffValue.java @@ -0,0 +1,90 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + +/** + * Created by IntelliJ IDEA. 
+ * User: depristo + * Date: 7/4/11 + * Time: 12:55 PM + * + * An interface that must be implemented to allow us to calculate differences + * between structured objects + */ +public class DiffValue { + private DiffElement binding = null; + final private Object value; + + public DiffValue(Object value) { + this.value = value; + } + + public DiffValue(DiffElement binding, Object value) { + this.binding = binding; + this.value = value; + } + + public DiffValue(DiffValue parent, Object value) { + this(parent.getBinding(), value); + } + + public DiffValue(String name, DiffElement parent, Object value) { + this.binding = new DiffElement(name, parent, this); + this.value = value; + } + + public DiffValue(String name, DiffValue parent, Object value) { + this(name, parent.getBinding(), value); + } + + public DiffElement getBinding() { + return binding; + } + + protected void setBinding(DiffElement binding) { + this.binding = binding; + } + + public Object getValue() { + return value; + } + + public String toString() { + return getValue().toString(); + } + + public String toString(int offset) { + return toString(); + } + + public String toOneLineString() { + return getValue().toString(); + } + + public boolean isAtomic() { return true; } + public boolean isCompound() { return ! 
isAtomic(); } + public int size() { return 1; } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffableReader.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffableReader.java new file mode 100644 index 000000000..43d947329 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/DiffableReader.java @@ -0,0 +1,66 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.io.File; + +/** + * Created by IntelliJ IDEA. 
+ * User: depristo + * Date: 7/4/11 + * Time: 1:09 PM + * + * Interface for readers creating diffable objects from a file + */ +public interface DiffableReader { + @Ensures("result != null") + /** + * Return the name of this DiffableReader type. For example, the VCF reader returns 'VCF' and the + * bam reader 'BAM' + */ + public String getName(); + + @Ensures("result != null") + @Requires("file != null") + /** + * Read up to maxElementsToRead DiffElements from file, and return them. + */ + public DiffElement readFromFile(File file, int maxElementsToRead); + + /** + * Return true if the file can be read into DiffElement objects with this reader. This should + * be uniquely true/false for all readers, as the system will use the first reader that can read the + * file. This routine should never throw an exception. The VCF reader, for example, looks at the + * first line of the file for the ##format=VCF4.1 header, and the BAM reader for the BAM_MAGIC value + * @param file + * @return + */ + @Requires("file != null") + public boolean canRead(File file); +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/Difference.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/Difference.java new file mode 100644 index 000000000..25ebc032e --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/Difference.java @@ -0,0 +1,137 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission 
notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.diffengine; + +public class Difference implements Comparable { + final String path; // X.Y.Z + final String[] parts; + int count = 1; + DiffElement master = null , test = null; + + public Difference(String path) { + this.path = path; + this.parts = DiffEngine.diffNameToPath(path); + } + + public Difference(DiffElement master, DiffElement test) { + this(createPath(master, test), master, test); + } + + public Difference(String path, DiffElement master, DiffElement test) { + this(path); + this.master = master; + this.test = test; + } + + public String[] getParts() { + return parts; + } + + public void incCount() { count++; } + + public int getCount() { + return count; + } + + public void setCount(int count) { + this.count = count; + } + + /** + * The fully qualified path object A.B.C etc + * @return + */ + public String getPath() { + return path; + } + + /** + * @return the length of the parts of this summary + */ + public int length() { + return this.parts.length; + } + + /** + * Returns true if the string parts matches this summary. Matches are + * must be equal() everywhere where this summary isn't *. 
+ * @param otherParts + * @return + */ + public boolean matches(String[] otherParts) { + if ( otherParts.length != length() ) + return false; + + // TODO optimization: can start at right most non-star element + for ( int i = 0; i < length(); i++ ) { + String part = parts[i]; + if ( ! part.equals("*") && ! part.equals(otherParts[i]) ) + return false; + } + + return true; + } + + @Override + public String toString() { + return String.format("%s:%d:%s", getPath(), getCount(), valueDiffString()); + } + + @Override + public int compareTo(Difference other) { + // sort first highest to lowest count, then by lowest to highest path + int countCmp = Integer.valueOf(count).compareTo(other.count); + return countCmp != 0 ? -1 * countCmp : path.compareTo(other.path); + } + + public String valueDiffString() { + if ( hasSpecificDifference() ) { + return String.format("%s!=%s", getOneLineString(master), getOneLineString(test)); + } else { + return "N/A"; + } + } + + private static String createPath(DiffElement master, DiffElement test) { + return (master == null ? test : master).fullyQualifiedName(); + } + + private static String getOneLineString(DiffElement elt) { + return elt == null ? 
"MISSING" : elt.getValue().toOneLineString(); + } + + public boolean hasSpecificDifference() { + return master != null || test != null; + } + + public DiffElement getMaster() { + return master; + } + + public DiffElement getTest() { + return test; + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/GATKReportDiffableReader.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/GATKReportDiffableReader.java new file mode 100644 index 000000000..9dc5e2e5e --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/GATKReportDiffableReader.java @@ -0,0 +1,104 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.diffengine; + +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportColumn; +import org.broadinstitute.gatk.utils.report.GATKReportTable; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; + + +/** + * Class implementing diffnode reader for GATKReports + */ + +// TODO Version check to be added at the report level + +public class GATKReportDiffableReader implements DiffableReader { + @Override + public String getName() { + return "GATKReport"; + } + + @Override + public DiffElement readFromFile(File file, int maxElementsToRead) { + DiffNode root = DiffNode.rooted(file.getName()); + try { + // one line reads the whole thing into memory + GATKReport report = new GATKReport(file); + + for (GATKReportTable table : report.getTables()) { + root.add(tableToNode(table, root)); + } + + return root.getBinding(); + } catch (Exception e) { + return null; + } + } + + private DiffNode tableToNode(GATKReportTable table, DiffNode root) { + DiffNode tableRoot = DiffNode.empty(table.getTableName(), root); + + tableRoot.add("Description", table.getTableDescription()); + tableRoot.add("NumberOfRows", table.getNumRows()); + + for ( GATKReportColumn column : table.getColumnInfo() ) { + DiffNode columnRoot = DiffNode.empty(column.getColumnName(), tableRoot); + + columnRoot.add("Width", column.getColumnFormat().getWidth()); + // NOTE: as the values are trimmed during parsing left/right alignment is not currently preserved + columnRoot.add("Displayable", true); + + for ( int i = 0; i < table.getNumRows(); i++ ) { + String name = column.getColumnName() + (i+1); + columnRoot.add(name, table.get(i, column.getColumnName()).toString()); + } + + tableRoot.add(columnRoot); + } + + return tableRoot; + } + + @Override + public boolean canRead(File file) { + try { + final String HEADER = GATKReport.GATKREPORT_HEADER_PREFIX; + final char[] buff = new 
char[HEADER.length()]; + final FileReader FR = new FileReader(file); + FR.read(buff, 0, HEADER.length()); + FR.close(); + String firstLine = new String(buff); + return firstLine.startsWith(HEADER); + } catch (IOException e) { + return false; + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/VCFDiffableReader.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/VCFDiffableReader.java new file mode 100644 index 000000000..a60209ad3 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/diffengine/VCFDiffableReader.java @@ -0,0 +1,145 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.diffengine; + +import org.apache.log4j.Logger; +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.FeatureReader; +import org.broadinstitute.gatk.utils.Utils; +import htsjdk.variant.vcf.*; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.VariantContext; + +import java.io.*; +import java.util.Iterator; +import java.util.Map; + + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/4/11 + * Time: 1:09 PM + * + * Class implementing diffnode reader for VCF + */ +public class VCFDiffableReader implements DiffableReader { + private static Logger logger = Logger.getLogger(VCFDiffableReader.class); + + @Override + public String getName() { return "VCF"; } + + @Override + public DiffElement readFromFile(File file, int maxElementsToRead) { + DiffNode root = DiffNode.rooted(file.getName()); + try { + // read the version line from the file + BufferedReader br = new BufferedReader(new FileReader(file)); + final String version = br.readLine(); + root.add("VERSION", version); + br.close(); + + final VCFCodec vcfCodec = new VCFCodec(); + vcfCodec.disableOnTheFlyModifications(); // must be read as state is stored in reader itself + + FeatureReader reader = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), vcfCodec, false); + VCFHeader header = (VCFHeader)reader.getHeader(); + for ( VCFHeaderLine headerLine : header.getMetaDataInInputOrder() ) { + String key = headerLine.getKey(); + if ( headerLine instanceof VCFIDHeaderLine) + key += "_" + ((VCFIDHeaderLine) headerLine).getID(); + if ( root.hasElement(key) ) + logger.warn("Skipping duplicate header line: file=" + file + " line=" + headerLine.toString()); + else + root.add(key, headerLine.toString()); + } + + int count = 0, nRecordsAtPos = 1; + String prevName = ""; + Iterator it = reader.iterator(); + while ( it.hasNext() ) { + VariantContext vc = it.next(); + String name = vc.getChr() + ":" + vc.getStart(); 
+ if ( name.equals(prevName) ) { + name += "_" + ++nRecordsAtPos; + } else { + prevName = name; + } + DiffNode vcRoot = DiffNode.empty(name, root); + + // add fields + vcRoot.add("CHROM", vc.getChr()); + vcRoot.add("POS", vc.getStart()); + vcRoot.add("ID", vc.getID()); + vcRoot.add("REF", vc.getReference()); + vcRoot.add("ALT", vc.getAlternateAlleles()); + vcRoot.add("QUAL", vc.hasLog10PError() ? vc.getLog10PError() * -10 : VCFConstants.MISSING_VALUE_v4); + vcRoot.add("FILTER", ! vc.filtersWereApplied() // needs null to differentiate between PASS and . + ? VCFConstants.MISSING_VALUE_v4 + : ( vc.getFilters().isEmpty() ? VCFConstants.PASSES_FILTERS_v4 : vc.getFilters()) ); + + // add info fields + for (Map.Entry attribute : vc.getAttributes().entrySet()) { + if ( ! attribute.getKey().startsWith("_") ) + vcRoot.add(attribute.getKey(), attribute.getValue()); + } + + for (Genotype g : vc.getGenotypes() ) { + DiffNode gRoot = DiffNode.empty(g.getSampleName(), vcRoot); + gRoot.add("GT", g.getGenotypeString()); + if ( g.hasGQ() ) gRoot.add("GQ", g.getGQ() ); + if ( g.hasDP() ) gRoot.add("DP", g.getDP() ); + if ( g.hasAD() ) gRoot.add("AD", Utils.join(",", g.getAD())); + if ( g.hasPL() ) gRoot.add("PL", Utils.join(",", g.getPL())); + if ( g.getFilters() != null ) gRoot.add("FT", g.getFilters()); + + for (Map.Entry attribute : g.getExtendedAttributes().entrySet()) { + if ( ! 
attribute.getKey().startsWith("_") ) + gRoot.add(attribute.getKey(), attribute.getValue()); + } + + vcRoot.add(gRoot); + } + + root.add(vcRoot); + count += vcRoot.size(); + if ( count > maxElementsToRead && maxElementsToRead != -1) + break; + } + + reader.close(); + } catch ( IOException e ) { + return null; + } + + return root.getBinding(); + } + + @Override + public boolean canRead(File file) { + return AbstractVCFCodec.canDecodeFile(file.getPath(), VCFCodec.VCF4_MAGIC_HEADER); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/AlleleBiasedDownsamplingUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/AlleleBiasedDownsamplingUtils.java new file mode 100644 index 000000000..fd81e4ba7 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/AlleleBiasedDownsamplingUtils.java @@ -0,0 +1,369 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.collections.DefaultHashMap; +import org.broadinstitute.gatk.utils.exceptions.GATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.pileup.PileupElement; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.text.XReadLines; +import htsjdk.variant.variantcontext.Allele; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +public class AlleleBiasedDownsamplingUtils { + + // define this class so that we can use Java generics below + private final static class PileupElementList extends ArrayList {} + + /** + * Computes an allele biased version of the given pileup + * + * @param pileup the original pileup + * @param downsamplingFraction the fraction of total reads to remove per allele + * @return allele biased pileup + */ + public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { + // special case removal of all or no reads + if ( downsamplingFraction <= 0.0 ) + return pileup; + if ( downsamplingFraction >= 1.0 ) + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); + + final PileupElementList[] alleleStratifiedElements = new PileupElementList[4]; + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i] = new PileupElementList(); + 
+ // start by stratifying the reads by the alleles they represent at this position + for ( final PileupElement pe : pileup ) { + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } + + // make a listing of allele counts and calculate the total count + final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements); + final int totalAlleleCount = (int)MathUtils.sum(alleleCounts); + + // do smart down-sampling + final int numReadsToRemove = (int)(totalAlleleCount * downsamplingFraction); // floor + final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); + + final HashSet readsToRemove = new HashSet(numReadsToRemove); + for ( int i = 0; i < 4; i++ ) { + final PileupElementList alleleList = alleleStratifiedElements[i]; + // if we don't need to remove any reads, then don't + if ( alleleCounts[i] > targetAlleleCounts[i] ) + readsToRemove.addAll(downsampleElements(alleleList, alleleCounts[i], alleleCounts[i] - targetAlleleCounts[i])); + } + + // we need to keep the reads sorted because the FragmentUtils code will expect them in coordinate order and will fail otherwise + final List readsToKeep = new ArrayList(totalAlleleCount - numReadsToRemove); + for ( final PileupElement pe : pileup ) { + if ( !readsToRemove.contains(pe) ) { + readsToKeep.add(pe); + } + } + + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(readsToKeep)); + } + + /** + * Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present) + * + * @param alleleStratifiedElements pileup elements stratified by allele + * @return non-null int array representing allele counts + */ + private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements) { + final int[] alleleCounts = new int[alleleStratifiedElements.length]; + for ( int i = 0; i < alleleStratifiedElements.length; 
i++ ) { + alleleCounts[i] = alleleStratifiedElements[i].size(); + } + return alleleCounts; + } + + private static int scoreAlleleCounts(final int[] alleleCounts) { + if ( alleleCounts.length < 2 ) + return 0; + + // sort the counts (in ascending order) + final int[] alleleCountsCopy = alleleCounts.clone(); + Arrays.sort(alleleCountsCopy); + + final int maxCount = alleleCountsCopy[alleleCounts.length - 1]; + final int nextBestCount = alleleCountsCopy[alleleCounts.length - 2]; + + int remainderCount = 0; + for ( int i = 0; i < alleleCounts.length - 2; i++ ) + remainderCount += alleleCountsCopy[i]; + + // try to get the best score: + // - in the het case the counts should be equal with nothing else + // - in the hom case the non-max should be zero + return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount)); + } + + /** + * Computes an allele biased version of the allele counts for a given pileup + * + * @param alleleCounts the allele counts for the original pileup + * @param numReadsToRemove number of total reads to remove per allele + * @return non-null array of new counts needed per allele + */ + protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) { + final int numAlleles = alleleCounts.length; + + int maxScore = scoreAlleleCounts(alleleCounts); + int[] alleleCountsOfMax = alleleCounts; + + final int numReadsToRemovePerAllele = numReadsToRemove / 2; + + for ( int i = 0; i < numAlleles; i++ ) { + for ( int j = i; j < numAlleles; j++ ) { + final int[] newCounts = alleleCounts.clone(); + + // split these cases so we don't lose on the floor (since we divided by 2) + if ( i == j ) { + newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove); + } else { + newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele); + newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele); + } + + final int score = scoreAlleleCounts(newCounts); + + if ( score < maxScore ) { + 
maxScore = score; + alleleCountsOfMax = newCounts; + } + } + } + + return alleleCountsOfMax; + } + + /** + * Performs allele biased down-sampling on a pileup and computes the list of elements to remove + * + * @param elements original list of pileup elements + * @param originalElementCount original count of elements (taking reduced reads into account) + * @param numElementsToRemove the number of records to remove + * @return the list of pileup elements TO REMOVE + */ + protected static List downsampleElements(final List elements, final int originalElementCount, final int numElementsToRemove) { + // are there no elements to remove? + if ( numElementsToRemove == 0 ) + return Collections.emptyList(); + + final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); + + // should we remove all of the elements? + if ( numElementsToRemove >= originalElementCount ) { + elementsToRemove.addAll(elements); + return elementsToRemove; + } + + // create a bitset describing which elements to remove + final BitSet itemsToRemove = new BitSet(originalElementCount); + for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + int currentBitSetIndex = 0; + for ( final PileupElement element : elements ) { + if ( itemsToRemove.get(currentBitSetIndex++) ) { + elementsToRemove.add(element); + } + } + + return elementsToRemove; + } + + /** + * Computes reads to remove based on an allele biased down-sampling + * + * @param alleleReadMap original list of records per allele + * @param downsamplingFraction the fraction of total reads to remove per allele + * @return list of reads TO REMOVE from allele biased down-sampling + */ + public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction) { + int totalReads = 0; + for ( final List reads : alleleReadMap.values() ) + totalReads += reads.size(); + + int numReadsToRemove = 
(int)(totalReads * downsamplingFraction); + + // make a listing of allele counts + final List alleles = new ArrayList(alleleReadMap.keySet()); + alleles.remove(Allele.NO_CALL); // ignore the no-call bin + final int numAlleles = alleles.size(); + + final int[] alleleCounts = new int[numAlleles]; + for ( int i = 0; i < numAlleles; i++ ) + alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size(); + + // do smart down-sampling + final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); + + final List readsToRemove = new ArrayList(numReadsToRemove); + for ( int i = 0; i < numAlleles; i++ ) { + if ( alleleCounts[i] > targetAlleleCounts[i] ) { + readsToRemove.addAll(downsampleElements(alleleReadMap.get(alleles.get(i)), alleleCounts[i] - targetAlleleCounts[i])); + } + } + + return readsToRemove; + } + + /** + * Performs allele biased down-sampling on a pileup and computes the list of elements to remove + * + * @param reads original list of records + * @param numElementsToRemove the number of records to remove + * @return the list of pileup elements TO REMOVE + */ + protected static List downsampleElements(final List reads, final int numElementsToRemove) { + // are there no elements to remove? + if ( numElementsToRemove == 0 ) + return Collections.emptyList(); + + final ArrayList elementsToRemove = new ArrayList(numElementsToRemove); + final int originalElementCount = reads.size(); + + // should we remove all of the elements? 
+ if ( numElementsToRemove >= originalElementCount ) { + elementsToRemove.addAll(reads); + return elementsToRemove; + } + + // create a bitset describing which elements to remove + final BitSet itemsToRemove = new BitSet(originalElementCount); + for ( final Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(originalElementCount, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + int currentBitSetIndex = 0; + for ( final GATKSAMRecord read : reads ) { + if ( itemsToRemove.get(currentBitSetIndex++) ) + elementsToRemove.add(read); + } + + return elementsToRemove; + } + + /** + * Create sample-contamination maps from file + * + * @param ContaminationFractionFile Filename containing two columns: SampleID and Contamination + * @param AvailableSampleIDs Set of Samples of interest (no reason to include every sample in file) or null to turn off checking + * @param logger for logging output + * @return sample-contamination Map + */ + + public static DefaultHashMap loadContaminationFile(File ContaminationFractionFile, final Double defaultContaminationFraction, final Set AvailableSampleIDs, Logger logger) throws GATKException { + DefaultHashMap sampleContamination = new DefaultHashMap(defaultContaminationFraction); + Set nonSamplesInContaminationFile = new HashSet(sampleContamination.keySet()); + try { + + XReadLines reader = new XReadLines(ContaminationFractionFile, true); + for (String line : reader) { + + if (line.length() == 0) { + continue; + } + + StringTokenizer st = new StringTokenizer(line,"\t"); + + String fields[] = new String[2]; + try { + fields[0] = st.nextToken(); + fields[1] = st.nextToken(); + } catch(NoSuchElementException e){ + throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. Offending line:\n" + line); + } + if(st.hasMoreTokens()) { + throw new UserException.MalformedFile("Contamination file must have exactly two, tab-delimited columns. 
Offending line:\n" + line); + } + + if (fields[0].length() == 0 || fields[1].length() == 0) { + throw new UserException.MalformedFile("Contamination file can not have empty strings in either column. Offending line:\n" + line); + } + + if (sampleContamination.containsKey(fields[0])) { + throw new UserException.MalformedFile("Contamination file contains duplicate entries for input name " + fields[0]); + } + + try { + final Double contamination = Double.valueOf(fields[1]); + if (contamination < 0 || contamination > 1){ + throw new UserException.MalformedFile("Contamination file contains unacceptable contamination value (must be 0<=x<=1): " + line); + } + if (AvailableSampleIDs==null || AvailableSampleIDs.contains(fields[0])) {// only add samples if they are in the sampleSet (or if it is null) + sampleContamination.put(fields[0], contamination); + } + else { + nonSamplesInContaminationFile.add(fields[0]); + } + } catch (NumberFormatException e) { + throw new UserException.MalformedFile("Contamination file contains unparsable double in the second field. 
Offending line: " + line); + } + } + + + //output to the user info lines telling which samples are in the Contamination File + if (sampleContamination.size() > 0) { + logger.info(String.format("The following samples were found in the Contamination file and will be processed at the contamination level therein: %s", sampleContamination.keySet().toString())); + + //output to the user info lines telling which samples are NOT in the Contamination File + if(AvailableSampleIDs!=null){ + Set samplesNotInContaminationFile = new HashSet(AvailableSampleIDs); + samplesNotInContaminationFile.removeAll(sampleContamination.keySet()); + if (samplesNotInContaminationFile.size() > 0) + logger.info(String.format("The following samples were NOT found in the Contamination file and will be processed at the default contamination level: %s", samplesNotInContaminationFile.toString())); + } + } + + //output to the user Samples that do not have lines in the Contamination File + if (nonSamplesInContaminationFile.size() > 0) { + logger.info(String.format("The following entries were found in the Contamination file but were not SAMPLEIDs. 
They will be ignored: %s", nonSamplesInContaminationFile.toString())); + } + + return sampleContamination; + + } catch (IOException e) { + throw new GATKException("I/O Error while reading sample-contamination file " + ContaminationFractionFile.getName() + ": " + e.getMessage()); + } + + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsampleType.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsampleType.java new file mode 100644 index 000000000..52ece95c1 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsampleType.java @@ -0,0 +1,39 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +/** + * Type of downsampling method to invoke. 
+ * + * @author hanna + * @version 0.1 + */ + +public enum DownsampleType { + NONE, + ALL_READS, + BY_SAMPLE +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/Downsampler.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/Downsampler.java new file mode 100644 index 000000000..cdaec016c --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/Downsampler.java @@ -0,0 +1,161 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import java.util.Collection; +import java.util.List; + +/** + * The basic downsampler API, with no reads-specific operations. 
+ * + * Downsamplers that extend this class rather than the ReadsDownsampler class can handle + * any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a + * PerSampleDownsamplingReadsIterator. + * + * @author David Roazen + */ +public abstract class Downsampler { + + /** + * Number of items discarded by this downsampler since the last call to resetStats() + */ + protected int numDiscardedItems = 0; + + /** + * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine + * immediately whether the item survives the downsampling process, while others will need to see + * more items before making that determination. + * + * @param item the individual item to submit to the downsampler for consideration + */ + public abstract void submit( final T item ); + + /** + * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling + * submit() on each individual item in the collection. + * + * @param items the collection of items to submit to the downsampler for consideration + */ + public void submit( final Collection items ) { + if ( items == null ) { + throw new IllegalArgumentException("submitted items must not be null"); + } + + for ( final T item : items ) { + submit(item); + } + } + + /** + * Are there items that have survived the downsampling process waiting to be retrieved? + * + * @return true if this downsampler has > 0 finalized items, otherwise false + */ + public abstract boolean hasFinalizedItems(); + + /** + * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved. + * + * @return a list of all finalized items this downsampler contains, or an empty list if there are none + */ + public abstract List consumeFinalizedItems(); + + /** + * Are there items stored in this downsampler that it doesn't yet know whether they will + * ultimately survive the downsampling process? 
+ * + * @return true if this downsampler has > 0 pending items, otherwise false + */ + public abstract boolean hasPendingItems(); + + /** + * Peek at the first finalized item stored in this downsampler (or null if there are no finalized items) + * + * @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call), + * or null if there are none + */ + public abstract T peekFinalized(); + + /** + * Peek at the first pending item stored in this downsampler (or null if there are no pending items) + * + * @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call), + * or null if there are none + */ + public abstract T peekPending(); + + /** + * Get the current number of items in this downsampler + * + * This should be the best estimate of the total number of elements that will come out of the downsampler + * were consumeFinalizedItems() to be called immediately after this call. In other words it should + * be number of finalized items + estimate of number of pending items that will ultimately be included as well. + * + * @return a positive integer + */ + public abstract int size(); + + /** + * Returns the number of items discarded (so far) during the downsampling process + * + * @return the number of items that have been submitted to this downsampler and discarded in the process of + * downsampling + */ + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + + /** + * Used to tell the downsampler that no more items will be submitted to it, and that it should + * finalize any pending items. 
+ */ + public abstract void signalEndOfInput(); + + /** + * Empty the downsampler of all finalized/pending items + */ + public abstract void clearItems(); + + /** + * Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items + */ + public void resetStats() { + numDiscardedItems = 0; + } + + /** + * Indicates whether an item should be excluded from elimination during downsampling. By default, + * all items representing reduced reads are excluded from downsampling, but individual downsamplers + * may override if they are able to handle reduced reads correctly. Downsamplers should check + * the return value of this method before discarding an item. + * + * @param item The item to test + * @return true if the item should not be subject to elimination during downsampling, otherwise false + */ + protected boolean doNotDiscardItem( final Object item ) { + return false; + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsamplingMethod.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsamplingMethod.java new file mode 100644 index 000000000..5cb32386d --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsamplingMethod.java @@ -0,0 +1,121 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import org.broadinstitute.gatk.utils.exceptions.UserException; + +/** + * Describes the method for downsampling reads at a given locus. + */ + +public class DownsamplingMethod { + /** + * Type of downsampling to perform. + */ + public final DownsampleType type; + + /** + * Actual downsampling target is specified as an integer number of reads. + */ + public final Integer toCoverage; + + /** + * Actual downsampling target is specified as a fraction of total available reads. + */ + public final Double toFraction; + + /** + * Expresses no downsampling applied at all. + */ + public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE, null, null); + + /** + * Default type to use if no type is specified + */ + public static final DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; + + /** + * Don't allow dcov values below this threshold for locus-based traversals (ie., Locus + * and ActiveRegion walkers), as they can result in problematic downsampling artifacts + */ + public static final int MINIMUM_SAFE_COVERAGE_TARGET_FOR_LOCUS_BASED_TRAVERSALS = 200; + + + public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction ) { + this.type = type != null ? 
type : DEFAULT_DOWNSAMPLING_TYPE; + + if ( type == DownsampleType.NONE ) { + this.toCoverage = null; + this.toFraction = null; + } + else { + this.toCoverage = toCoverage; + this.toFraction = toFraction; + } + + validate(); + } + + private void validate() { + // Can't leave toFraction and toCoverage null unless type is NONE + if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null ) + throw new UserException("Must specify either toFraction or toCoverage when downsampling."); + + // Fraction and coverage cannot both be specified. + if ( toFraction != null && toCoverage != null ) + throw new UserException("Downsampling coverage and fraction are both specified. Please choose only one."); + + // toCoverage must be > 0 when specified + if ( toCoverage != null && toCoverage <= 0 ) { + throw new UserException("toCoverage must be > 0 when downsampling to coverage"); + } + + // toFraction must be >= 0.0 and <= 1.0 when specified + if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { + throw new UserException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); + } + } + + public String toString() { + StringBuilder builder = new StringBuilder("Downsampling Settings: "); + + if ( type == DownsampleType.NONE ) { + builder.append("No downsampling"); + } + else { + builder.append(String.format("Method: %s, ", type)); + + if ( toCoverage != null ) { + builder.append(String.format("Target Coverage: %d", toCoverage)); + } + else { + builder.append(String.format("Target Fraction: %.2f", toFraction)); + } + } + + return builder.toString(); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsamplingReadsIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsamplingReadsIterator.java new file mode 100644 index 000000000..d7106b76a --- /dev/null +++ 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsamplingReadsIterator.java @@ -0,0 +1,116 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +import java.util.Collection; +import java.util.Iterator; +import java.util.NoSuchElementException; + + +/** + * GATKSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style + * downsampler interface to a pull model. 
+ * + * @author David Roazen + */ +public class DownsamplingReadsIterator implements GATKSAMIterator { + + private GATKSAMIterator nestedSAMIterator; + private ReadsDownsampler downsampler; + private Collection downsampledReadsCache; + private SAMRecord nextRead = null; + private Iterator downsampledReadsCacheIterator = null; + + /** + * @param iter wrapped iterator from which this iterator will pull reads + * @param downsampler downsampler through which the reads will be fed + */ + public DownsamplingReadsIterator( GATKSAMIterator iter, ReadsDownsampler downsampler ) { + nestedSAMIterator = iter; + this.downsampler = downsampler; + + advanceToNextRead(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if ( nextRead == null ) { + throw new NoSuchElementException("next() called when there are no more items"); + } + + SAMRecord toReturn = nextRead; + advanceToNextRead(); + + return toReturn; + } + + private void advanceToNextRead() { + if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) { + nextRead = null; + } + else { + nextRead = downsampledReadsCacheIterator.next(); + } + } + + private boolean readyToReleaseReads() { + return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext(); + } + + private boolean fillDownsampledReadsCache() { + while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) { + downsampler.submit(nestedSAMIterator.next()); + } + + if ( ! 
nestedSAMIterator.hasNext() ) { + downsampler.signalEndOfInput(); + } + + // use returned collection directly rather than make a copy, for speed + downsampledReadsCache = downsampler.consumeFinalizedItems(); + downsampledReadsCacheIterator = downsampledReadsCache.iterator(); + + return downsampledReadsCacheIterator.hasNext(); + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + public void close() { + nestedSAMIterator.close(); + } + + public Iterator iterator() { + return this; + } +} \ No newline at end of file diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsamplingUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsamplingUtils.java new file mode 100644 index 000000000..9bfc13a13 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/DownsamplingUtils.java @@ -0,0 +1,107 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Utilities for using the downsamplers for common tasks + * + * User: depristo + * Date: 3/6/13 + * Time: 4:26 PM + */ +public class DownsamplingUtils { + private DownsamplingUtils() { } + + /** + * Level the coverage of the reads in each sample to no more than downsampleTo reads, no reducing + * coverage at any read start to less than minReadsPerAlignmentStart + * + * This algorithm can be used to handle the situation where you have lots of coverage in some interval, and + * want to reduce the coverage of the big peak down without removing the many reads at the edge of this + * interval that are in fact good + * + * This algorithm separately operates on the reads for each sample independently. + * + * @param reads a sorted list of reads + * @param downsampleTo the targeted number of reads we want from reads per sample + * @param minReadsPerAlignmentStart don't reduce the number of reads starting at a specific alignment start + * to below this. 
That is, if this value is 2, we'll never reduce the number + * of reads starting at a specific start site to less than 2 + * @return a sorted list of reads + */ + public static List levelCoverageByPosition(final List reads, final int downsampleTo, final int minReadsPerAlignmentStart) { + if ( reads == null ) throw new IllegalArgumentException("reads must not be null"); + + final List downsampled = new ArrayList(reads.size()); + + final Map>> readsBySampleByStart = partitionReadsBySampleAndStart(reads); + for ( final Map> readsByPosMap : readsBySampleByStart.values() ) { + final LevelingDownsampler, GATKSAMRecord> downsampler = new LevelingDownsampler, GATKSAMRecord>(downsampleTo, minReadsPerAlignmentStart); + downsampler.submit(readsByPosMap.values()); + downsampler.signalEndOfInput(); + for ( final List downsampledReads : downsampler.consumeFinalizedItems()) + downsampled.addAll(downsampledReads); + } + + return ReadUtils.sortReadsByCoordinate(downsampled); + } + + /** + * Build the data structure mapping for each sample -> (position -> reads at position) + * + * Note that the map position -> reads isn't ordered in any meaningful way + * + * @param reads a list of sorted reads + * @return a map containing the list of reads at each start location, for each sample independently + */ + private static Map>> partitionReadsBySampleAndStart(final List reads) { + final Map>> readsBySampleByStart = new LinkedHashMap>>(); + + for ( final GATKSAMRecord read : reads ) { + Map> readsByStart = readsBySampleByStart.get(read.getReadGroup().getSample()); + + if ( readsByStart == null ) { + readsByStart = new LinkedHashMap>(); + readsBySampleByStart.put(read.getReadGroup().getSample(), readsByStart); + } + + List readsAtStart = readsByStart.get(read.getAlignmentStart()); + if ( readsAtStart == null ) { + readsAtStart = new LinkedList(); + readsByStart.put(read.getAlignmentStart(), readsAtStart); + } + + readsAtStart.add(read); + } + + return readsBySampleByStart; + } +} diff --git 
a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/FractionalDownsampler.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/FractionalDownsampler.java new file mode 100644 index 000000000..11d28c7d6 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/FractionalDownsampler.java @@ -0,0 +1,129 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.ArrayList; +import java.util.List; + +/** + * Fractional Downsampler: selects a specified fraction of the reads for inclusion. 
+ * + * Since the selection is done randomly, the actual fraction of reads retained may be slightly + * more or less than the requested fraction, depending on the total number of reads submitted. + * + * @author David Roazen + */ +public class FractionalDownsampler extends ReadsDownsampler { + + private ArrayList selectedReads; + + private final int cutoffForInclusion; + + private static final int RANDOM_POOL_SIZE = 10000; + + /** + * Construct a FractionalDownsampler + * + * @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive). + * Actual number of reads preserved may differ randomly. + */ + public FractionalDownsampler( final double fraction ) { + if ( fraction < 0.0 || fraction > 1.0 ) { + throw new ReviewedGATKException("Fraction of reads to include must be between 0.0 and 1.0, inclusive"); + } + + cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE); + clearItems(); + resetStats(); + } + + @Override + public void submit( final T newRead ) { + if ( Utils.getRandomGenerator().nextInt(10000) < cutoffForInclusion || doNotDiscardItem(newRead) ) { + selectedReads.add(newRead); + } + else { + numDiscardedItems++; + } + } + + @Override + public boolean hasFinalizedItems() { + return selectedReads.size() > 0; + } + + @Override + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed + List downsampledItems = selectedReads; + clearItems(); + return downsampledItems; + } + + @Override + public boolean hasPendingItems() { + return false; + } + + @Override + public T peekFinalized() { + return selectedReads.isEmpty() ? 
null : selectedReads.get(0); + } + + @Override + public T peekPending() { + return null; + } + + @Override + public int size() { + return selectedReads.size(); + } + + @Override + public void signalEndOfInput() { + // NO-OP + } + + @Override + public void clearItems() { + selectedReads = new ArrayList(); + } + + @Override + public boolean requiresCoordinateSortOrder() { + return false; + } + + @Override + public void signalNoMoreReadsBefore( final T read ) { + // NO-OP + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/FractionalDownsamplerFactory.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/FractionalDownsamplerFactory.java new file mode 100644 index 000000000..c2113c4a3 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/FractionalDownsamplerFactory.java @@ -0,0 +1,46 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.SAMRecord; + +/** + * Factory for creating FractionalDownsamplers on demand + * + * @author David Roazen + */ +public class FractionalDownsamplerFactory implements ReadsDownsamplerFactory { + + private double fraction; + + public FractionalDownsamplerFactory( double fraction ) { + this.fraction = fraction; + } + + public ReadsDownsampler newInstance() { + return new FractionalDownsampler(fraction); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/LevelingDownsampler.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/LevelingDownsampler.java new file mode 100644 index 000000000..537a9f8c0 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/LevelingDownsampler.java @@ -0,0 +1,242 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import org.broadinstitute.gatk.utils.MathUtils; + +import java.util.*; + +/** + * Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from + * the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling + * does not occur until all Lists have been submitted and signalEndOfInput() is called. + * + * The Lists should be LinkedLists for maximum efficiency during item removal, however other + * kinds of Lists are also accepted (albeit at a slight performance penalty). + * + * Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface, + * the Lists need not contain reads. 
However this downsampler may not be wrapped within one of the + * DownsamplingReadsIterators + * + * @param the List type representing the stacks to be leveled + * @param the type of the elements of each List + * + * @author David Roazen + */ +public class LevelingDownsampler, E> extends Downsampler { + private final int minElementsPerStack; + + private final int targetSize; + + private List groups; + + private boolean groupsAreFinalized; + + /** + * Construct a LevelingDownsampler + * + * Uses the default minElementsPerStack of 1 + * + * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed + * this value -- if it does, items are removed from Lists evenly until the total size + * is <= this value + */ + public LevelingDownsampler( final int targetSize ) { + this(targetSize, 1); + } + + /** + * Construct a LevelingDownsampler + * + * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed + * this value -- if it does, items are removed from Lists evenly until the total size + * is <= this value + * @param minElementsPerStack no stack will be reduced below this size during downsampling. That is, + * if a stack has only 3 elements and minElementsPerStack is 3, no matter what + * we'll not reduce this stack below 3. 
+ */ + public LevelingDownsampler( final int targetSize, final int minElementsPerStack ) { + if ( targetSize < 0 ) throw new IllegalArgumentException("targetSize must be >= 0 but got " + targetSize); + if ( minElementsPerStack < 0 ) throw new IllegalArgumentException("minElementsPerStack must be >= 0 but got " + minElementsPerStack); + + this.targetSize = targetSize; + this.minElementsPerStack = minElementsPerStack; + clearItems(); + resetStats(); + } + + @Override + public void submit( final T item ) { + groups.add(item); + } + + @Override + public void submit( final Collection items ){ + groups.addAll(items); + } + + @Override + public boolean hasFinalizedItems() { + return groupsAreFinalized && groups.size() > 0; + } + + @Override + public List consumeFinalizedItems() { + if ( ! hasFinalizedItems() ) { + return new ArrayList(); + } + + // pass by reference rather than make a copy, for speed + final List toReturn = groups; + clearItems(); + return toReturn; + } + + @Override + public boolean hasPendingItems() { + return ! groupsAreFinalized && groups.size() > 0; + } + + @Override + public T peekFinalized() { + return hasFinalizedItems() ? groups.get(0) : null; + } + + @Override + public T peekPending() { + return hasPendingItems() ? 
groups.get(0) : null; + } + + @Override + public int size() { + int s = 0; + for ( final List l : groups ) { + s += l.size(); + } + return s; + } + + @Override + public void signalEndOfInput() { + levelGroups(); + groupsAreFinalized = true; + } + + @Override + public void clearItems() { + groups = new ArrayList(); + groupsAreFinalized = false; + } + + private void levelGroups() { + final int[] groupSizes = new int[groups.size()]; + int totalSize = 0; + int currentGroupIndex = 0; + + for ( final T group : groups ) { + groupSizes[currentGroupIndex] = group.size(); + totalSize += groupSizes[currentGroupIndex]; + currentGroupIndex++; + } + + if ( totalSize <= targetSize ) { + return; // no need to eliminate any items + } + + // We will try to remove exactly this many items, however we will refuse to allow any + // one group to fall below size 1, and so might end up removing fewer items than this + int numItemsToRemove = totalSize - targetSize; + + currentGroupIndex = 0; + int numConsecutiveUmodifiableGroups = 0; + + // Continue until we've either removed all the items we wanted to, or we can't + // remove any more items without violating the constraint that all groups must + // be left with at least one item + while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { + if ( groupSizes[currentGroupIndex] > minElementsPerStack ) { + groupSizes[currentGroupIndex]--; + numItemsToRemove--; + numConsecutiveUmodifiableGroups = 0; + } + else { + numConsecutiveUmodifiableGroups++; + } + + currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length; + } + + // Now we actually go through and reduce each group to its new count as specified in groupSizes + currentGroupIndex = 0; + for ( final T group : groups ) { + downsampleOneGroup(group, groupSizes[currentGroupIndex]); + currentGroupIndex++; + } + } + + private void downsampleOneGroup( final T group, final int numItemsToKeep ) { + if ( numItemsToKeep >= group.size() ) { + return; + } + + final 
BitSet itemsToKeep = new BitSet(group.size()); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) { + itemsToKeep.set(selectedIndex); + } + + int currentIndex = 0; + + // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator + if ( group instanceof LinkedList ) { + final Iterator iter = group.iterator(); + while ( iter.hasNext() ) { + final E item = iter.next(); + + if ( ! itemsToKeep.get(currentIndex) && ! doNotDiscardItem(item) ) { + iter.remove(); + numDiscardedItems++; + } + + currentIndex++; + } + } + // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather + // than suffer O(n^2) of item shifting + else { + final List keptItems = new ArrayList(group.size()); + + for ( final E item : group ) { + if ( itemsToKeep.get(currentIndex) || doNotDiscardItem(item) ) { + keptItems.add(item); + } + currentIndex++; + } + numDiscardedItems += group.size() - keptItems.size(); + group.clear(); + group.addAll(keptItems); + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/PassThroughDownsampler.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/PassThroughDownsampler.java new file mode 100644 index 000000000..313a432d3 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/PassThroughDownsampler.java @@ -0,0 +1,111 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* 
conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.SAMRecord; + +import java.util.LinkedList; +import java.util.List; + +/** + * Pass-Through Downsampler: Implementation of the ReadsDownsampler interface that does no + * downsampling whatsoever, and instead simply "passes-through" all the reads it's given. + * Useful for situations where you want to disable downsampling, but still need to use + * the downsampler interface. + * + * @author David Roazen + */ +public class PassThroughDownsampler extends ReadsDownsampler { + + private LinkedList selectedReads; + + public PassThroughDownsampler() { + clearItems(); + } + + @Override + public void submit( T newRead ) { + // All reads pass-through, no reads get downsampled + selectedReads.add(newRead); + } + + @Override + public boolean hasFinalizedItems() { + return ! selectedReads.isEmpty(); + } + + /** + * Note that this list is a linked list and so doesn't support fast random access + * @return + */ + @Override + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed + final List downsampledItems = selectedReads; + clearItems(); + return downsampledItems; + } + + @Override + public boolean hasPendingItems() { + return false; + } + + @Override + public T peekFinalized() { + return selectedReads.isEmpty() ? 
null : selectedReads.getFirst(); + } + + @Override + public T peekPending() { + return null; + } + + @Override + public int size() { + return selectedReads.size(); + } + + @Override + public void signalEndOfInput() { + // NO-OP + } + + @Override + public void clearItems() { + selectedReads = new LinkedList(); + } + + @Override + public boolean requiresCoordinateSortOrder() { + return false; + } + + @Override + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/PerSampleDownsamplingReadsIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/PerSampleDownsamplingReadsIterator.java new file mode 100644 index 000000000..f289960ee --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/PerSampleDownsamplingReadsIterator.java @@ -0,0 +1,207 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordComparator; +import htsjdk.samtools.SAMRecordCoordinateComparator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +import java.util.*; + + +/** + * GATKSAMIterator wrapper around our generic reads downsampler interface + * that downsamples reads for each sample independently, and then re-assembles + * the reads back into a single merged stream. + * + * @author David Roazen + */ +public class PerSampleDownsamplingReadsIterator implements GATKSAMIterator { + + private GATKSAMIterator nestedSAMIterator; + private ReadsDownsamplerFactory downsamplerFactory; + private Map> perSampleDownsamplers; + private PriorityQueue orderedDownsampledReadsCache; + private SAMRecord nextRead = null; + private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator(); + private SAMRecord earliestPendingRead = null; + private ReadsDownsampler earliestPendingDownsampler = null; + + // Initial size of our cache of finalized reads + private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096; + + // The number of positional changes that can occur in the read stream before all downsamplers + // should be informed of the current position (guards against samples with relatively sparse reads + // getting stuck in a pending state): + private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3; // TODO: experiment with this value + + /** + * @param iter wrapped iterator from which this iterator will pull reads + * @param downsamplerFactory factory used to create new downsamplers as needed + */ + public PerSampleDownsamplingReadsIterator( 
GATKSAMIterator iter, ReadsDownsamplerFactory downsamplerFactory ) { + nestedSAMIterator = iter; + this.downsamplerFactory = downsamplerFactory; + perSampleDownsamplers = new HashMap>(); + orderedDownsampledReadsCache = new PriorityQueue(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator); + + advanceToNextRead(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if ( nextRead == null ) { + throw new NoSuchElementException("next() called when there are no more items"); + } + + SAMRecord toReturn = nextRead; + advanceToNextRead(); + + return toReturn; + } + + private void advanceToNextRead() { + if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) { + nextRead = null; + } + else { + nextRead = orderedDownsampledReadsCache.poll(); + } + } + + private boolean readyToReleaseReads() { + if ( orderedDownsampledReadsCache.isEmpty() ) { + return false; + } + + return earliestPendingRead == null || + readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0; + } + + private boolean fillDownsampledReadsCache() { + SAMRecord prevRead = null; + int numPositionalChanges = 0; + + // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue + // can be released without violating global sort order + while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) { + SAMRecord read = nestedSAMIterator.next(); + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + + ReadsDownsampler thisSampleDownsampler = perSampleDownsamplers.get(sampleName); + if ( thisSampleDownsampler == null ) { + thisSampleDownsampler = downsamplerFactory.newInstance(); + perSampleDownsamplers.put(sampleName, thisSampleDownsampler); + } + + thisSampleDownsampler.submit(read); + processFinalizedAndPendingItems(thisSampleDownsampler); + + if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) { + numPositionalChanges++; + } + + // Periodically inform all downsamplers of the current position in the read stream. This is + // to prevent downsamplers for samples with sparser reads than others from getting stuck too + // long in a pending state. + if ( numPositionalChanges > 0 && numPositionalChanges % DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL == 0 ) { + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + perSampleDownsampler.signalNoMoreReadsBefore(read); + processFinalizedAndPendingItems(perSampleDownsampler); + } + } + + prevRead = read; + } + + if ( ! 
nestedSAMIterator.hasNext() ) { + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + perSampleDownsampler.signalEndOfInput(); + if ( perSampleDownsampler.hasFinalizedItems() ) { + orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems()); + } + } + earliestPendingRead = null; + earliestPendingDownsampler = null; + } + + return readyToReleaseReads(); + } + + private void updateEarliestPendingRead( ReadsDownsampler currentDownsampler ) { + // If there is no recorded earliest pending read and this downsampler has pending items, + // then this downsampler's first pending item becomes the new earliest pending read: + if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) { + earliestPendingRead = currentDownsampler.peekPending(); + earliestPendingDownsampler = currentDownsampler; + } + // In all other cases, we only need to update the earliest pending read when the downsampler + // associated with it experiences a change in its pending reads, since by assuming a sorted + // read stream we're assured that each downsampler's earliest pending read will only increase + // in genomic position over time. + // + // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers + // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))), + // TODO: but need to verify this empirically. + else if ( currentDownsampler == earliestPendingDownsampler && + (! 
currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) { + + earliestPendingRead = null; + earliestPendingDownsampler = null; + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + if ( perSampleDownsampler.hasPendingItems() && + (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) { + + earliestPendingRead = perSampleDownsampler.peekPending(); + earliestPendingDownsampler = perSampleDownsampler; + } + } + } + } + + private void processFinalizedAndPendingItems( ReadsDownsampler currentDownsampler ) { + if ( currentDownsampler.hasFinalizedItems() ) { + orderedDownsampledReadsCache.addAll(currentDownsampler.consumeFinalizedItems()); + } + updateEarliestPendingRead(currentDownsampler); + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + public void close() { + nestedSAMIterator.close(); + } + + public Iterator iterator() { + return this; + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReadsDownsampler.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReadsDownsampler.java new file mode 100644 index 000000000..ed6b4394a --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReadsDownsampler.java @@ -0,0 +1,56 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* 
conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.SAMRecord; + +/** + * An extension of the basic downsampler API with reads-specific operations + * + * @author David Roazen + */ +public abstract class ReadsDownsampler extends Downsampler { + + /** + * Does this downsampler require that reads be fed to it in coordinate order? + * + * @return true if reads must be submitted to this downsampler in coordinate order, otherwise false + */ + public abstract boolean requiresCoordinateSortOrder(); + + /** + * Tell this downsampler that no more reads located before the provided read (according to + * the sort order of the read stream) will be fed to it. + * + * Allows position-aware downsamplers to finalize pending reads earlier than they would + * otherwise be able to, particularly when doing per-sample downsampling and reads for + * certain samples are sparser than average. 
+ * + * @param read the downsampler will assume that no reads located before this read will ever + * be submitted to it in the future + */ + public abstract void signalNoMoreReadsBefore( final T read ); +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReadsDownsamplerFactory.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReadsDownsamplerFactory.java new file mode 100644 index 000000000..7cae97be5 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReadsDownsamplerFactory.java @@ -0,0 +1,38 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.SAMRecord; + +/** + * A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular + * downsampler, all sharing the same construction parameters. + * + * @author David Roazen + */ +public interface ReadsDownsamplerFactory { + public ReadsDownsampler newInstance(); +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReservoirDownsampler.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReservoirDownsampler.java new file mode 100644 index 000000000..e4a6ee464 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReservoirDownsampler.java @@ -0,0 +1,219 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.*; + +/** + * Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with + * every read in the stream having an equal chance of being selected for inclusion. + * + * An implementation of "Algorithm R" from the paper "Random Sampling with a Reservoir" (Jeffrey Scott Vitter, 1985) + * + * @author David Roazen + */ +public class ReservoirDownsampler extends ReadsDownsampler { + + /** + * size of our reservoir -- ie., the maximum number of reads from the stream that will be retained + * (not including any undiscardable items) + */ + private final int targetSampleSize; + + /** + * if true, this downsampler will be optimized for the case + * where most of the time we won't fill up anything like the + * targetSampleSize elements. If this is false, we will allocate + * internal buffers to targetSampleSize initially, which minimizes + * the cost of allocation if we often use targetSampleSize or more + * elements. + */ + private final boolean expectFewOverflows; + + /** + * At times this can be a linked list or an array list, depending on how we're accessing the + * data and whether or not we're expecting few overflows + */ + private List reservoir; + + /** + * Certain items (eg., reduced reads) cannot be discarded at all during downsampling. We store + * these items separately so as not to impact the fair selection of items for inclusion in the + * reservoir. These items are returned (and cleared) along with any items in the reservoir in + * calls to consumeFinalizedItems(). + */ + private List undiscardableItems; + + /** + * Are we currently using a linked list for the reservoir? + */ + private boolean isLinkedList; + + /** + * Count of the number of reads seen that were actually eligible for discarding. 
Used by the reservoir downsampling + * algorithm to ensure that all discardable reads have an equal chance of making it into the reservoir. + */ + private int totalDiscardableReadsSeen; + + + /** + * Construct a ReservoirDownsampler + * + * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained + * after downsampling will be min(totalDiscardableReads, targetSampleSize) + any + * undiscardable reads (eg., reduced reads). + * + * @param expectFewOverflows if true, this downsampler will be optimized for the case + * where most of the time we won't fill up anything like the + * targetSampleSize elements. If this is false, we will allocate + * internal buffers to targetSampleSize initially, which minimizes + * the cost of allocation if we often use targetSampleSize or more + * elements. + */ + public ReservoirDownsampler ( final int targetSampleSize, final boolean expectFewOverflows ) { + if ( targetSampleSize <= 0 ) { + throw new ReviewedGATKException("Cannot do reservoir downsampling with a sample size <= 0"); + } + + this.targetSampleSize = targetSampleSize; + this.expectFewOverflows = expectFewOverflows; + clearItems(); + resetStats(); + } + + /** + * Construct a ReservoirDownsampler + * + * @param targetSampleSize Size of the reservoir used by this downsampler. 
Number of items retained + * after downsampling will be min(totalReads, targetSampleSize) + */ + public ReservoirDownsampler ( final int targetSampleSize ) { + this(targetSampleSize, false); + } + + @Override + public void submit ( final T newRead ) { + if ( doNotDiscardItem(newRead) ) { + undiscardableItems.add(newRead); + return; + } + + // Only count reads that are actually eligible for discarding for the purposes of the reservoir downsampling algorithm + totalDiscardableReadsSeen++; + + if ( totalDiscardableReadsSeen <= targetSampleSize ) { + reservoir.add(newRead); + } + else { + if ( isLinkedList ) { + reservoir = new ArrayList(reservoir); + isLinkedList = false; + } + + final int randomSlot = Utils.getRandomGenerator().nextInt(totalDiscardableReadsSeen); + if ( randomSlot < targetSampleSize ) { + reservoir.set(randomSlot, newRead); + } + numDiscardedItems++; + } + } + + @Override + public boolean hasFinalizedItems() { + return ! reservoir.isEmpty() || ! undiscardableItems.isEmpty(); + } + + @Override + public List consumeFinalizedItems() { + if ( ! hasFinalizedItems() ) { + // if there's nothing here, don't bother allocating a new list + return Collections.emptyList(); + } else { + // pass reservoir by reference rather than make a copy, for speed + final List downsampledItems = reservoir; + downsampledItems.addAll(undiscardableItems); + clearItems(); + return downsampledItems; + } + } + + @Override + public boolean hasPendingItems() { + return false; + } + + @Override + public T peekFinalized() { + return ! reservoir.isEmpty() ? reservoir.get(0) : (! undiscardableItems.isEmpty() ? 
undiscardableItems.get(0) : null); + } + + @Override + public T peekPending() { + return null; + } + + @Override + public int size() { + return reservoir.size() + undiscardableItems.size(); + } + + @Override + public void signalEndOfInput() { + // NO-OP + } + + /** + * Clear the data structures used to hold information + */ + @Override + public void clearItems() { + // if we aren't expecting many overflows, allocate a linked list not an arraylist + reservoir = expectFewOverflows ? new LinkedList() : new ArrayList(targetSampleSize); + + // there's no possibility of overflow with the undiscardable items, so we always use a linked list for them + undiscardableItems = new LinkedList<>(); + + // it's a linked list if we allocate one + isLinkedList = expectFewOverflows; + + // an internal stat used by the downsampling process, so not cleared by resetStats() below + totalDiscardableReadsSeen = 0; + } + + @Override + public boolean requiresCoordinateSortOrder() { + return false; + } + + @Override + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReservoirDownsamplerFactory.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReservoirDownsamplerFactory.java new file mode 100644 index 000000000..2e6207410 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/ReservoirDownsamplerFactory.java @@ -0,0 +1,46 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* 
conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.SAMRecord; + +/** + * Factory for creating ReservoirDownsamplers on demand + * + * @author David Roazen + */ +public class ReservoirDownsamplerFactory implements ReadsDownsamplerFactory { + + private int targetSampleSize; + + public ReservoirDownsamplerFactory( int targetSampleSize ) { + this.targetSampleSize = targetSampleSize; + } + + public ReadsDownsampler newInstance() { + return new ReservoirDownsampler(targetSampleSize); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/SimplePositionalDownsampler.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/SimplePositionalDownsampler.java new file mode 100644 index 000000000..f150636f1 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/SimplePositionalDownsampler.java @@ -0,0 +1,171 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit 
persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.SAMRecord; + +import java.util.*; + +/** + * Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage + * using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time. 
+ * + * @author David Roazen + */ +public class SimplePositionalDownsampler extends ReadsDownsampler { + + private final int targetCoverage; + + private final ReservoirDownsampler reservoir; + + private int currentContigIndex; + + private int currentAlignmentStart; + + private boolean positionEstablished; + + private boolean unmappedReadsReached; + + private ArrayList finalizedReads; + + + /** + * Construct a SimplePositionalDownsampler + * + * @param targetCoverage Maximum number of reads that may share any given alignment start position + */ + public SimplePositionalDownsampler( final int targetCoverage ) { + this.targetCoverage = targetCoverage; + reservoir = new ReservoirDownsampler(targetCoverage); + finalizedReads = new ArrayList(); + clearItems(); + resetStats(); + } + + @Override + public void submit( final T newRead ) { + updatePositionalState(newRead); + + if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream + finalizedReads.add(newRead); + } + else { + final int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems(); + // our reservoir downsampler will call doNotDiscardItem() for us to exclude items from elimination as appropriate + reservoir.submit(newRead); + numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems; + } + } + + @Override + public boolean hasFinalizedItems() { + return finalizedReads.size() > 0; + } + + @Override + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed + final List toReturn = finalizedReads; + finalizedReads = new ArrayList(); + return toReturn; + } + + @Override + public boolean hasPendingItems() { + return reservoir.hasFinalizedItems(); + } + + @Override + public T peekFinalized() { + return finalizedReads.isEmpty() ? 
null : finalizedReads.get(0); + } + + @Override + public T peekPending() { + return reservoir.peekFinalized(); + } + + @Override + public int size() { + return finalizedReads.size() + reservoir.size(); + } + + @Override + public void signalEndOfInput() { + finalizeReservoir(); + } + + @Override + public void clearItems() { + reservoir.clearItems(); + reservoir.resetStats(); + finalizedReads.clear(); + positionEstablished = false; + unmappedReadsReached = false; + } + + @Override + public boolean requiresCoordinateSortOrder() { + return true; + } + + @Override + public void signalNoMoreReadsBefore( final T read ) { + updatePositionalState(read); + } + + private void updatePositionalState( final T newRead ) { + if ( readIsPastCurrentPosition(newRead) ) { + if ( reservoir.hasFinalizedItems() ) { + finalizeReservoir(); + } + + setCurrentPosition(newRead); + + if ( newRead.getReadUnmappedFlag() ) { + unmappedReadsReached = true; + } + } + } + + private void setCurrentPosition( final T read ) { + currentContigIndex = read.getReferenceIndex(); + currentAlignmentStart = read.getAlignmentStart(); + positionEstablished = true; + } + + private boolean readIsPastCurrentPosition( final T read ) { + return ! positionEstablished || + read.getReferenceIndex() > currentContigIndex || + read.getAlignmentStart() > currentAlignmentStart || + (read.getReadUnmappedFlag() && ! 
unmappedReadsReached); + } + + private void finalizeReservoir() { + finalizedReads.addAll(reservoir.consumeFinalizedItems()); + reservoir.resetStats(); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/SimplePositionalDownsamplerFactory.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/SimplePositionalDownsamplerFactory.java new file mode 100644 index 000000000..cdbcf0f63 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/downsampling/SimplePositionalDownsamplerFactory.java @@ -0,0 +1,46 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.SAMRecord; + +/** + * Factory for creating SimplePositionalDownsamplers on demand + * + * @author David Roazen + */ +public class SimplePositionalDownsamplerFactory implements ReadsDownsamplerFactory { + + private int targetCoverage; + + public SimplePositionalDownsamplerFactory( int targetCoverage ) { + this.targetCoverage = targetCoverage; + } + + public ReadsDownsampler newInstance() { + return new SimplePositionalDownsampler(targetCoverage); + } +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/duplicates/DupUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/duplicates/DupUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/duplicates/DupUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/duplicates/DupUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/duplicates/DuplicateComp.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/duplicates/DuplicateComp.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/duplicates/DuplicateComp.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/duplicates/DuplicateComp.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/exceptions/DynamicClassResolutionException.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/DynamicClassResolutionException.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/exceptions/DynamicClassResolutionException.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/DynamicClassResolutionException.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/exceptions/UserException.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/fasta/ArtificialFastaUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/ArtificialFastaUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/fasta/ArtificialFastaUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/ArtificialFastaUtils.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java new file mode 100644 index 000000000..b2f65585a --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -0,0 +1,356 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.fasta; + +import org.broadinstitute.gatk.utils.exceptions.UserException; +import htsjdk.samtools.SAMException; +import htsjdk.samtools.reference.FastaSequenceIndex; +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.StringUtil; +import org.apache.log4j.Priority; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.BaseUtils; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Arrays; + +/** + * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. + * + * Thread-safe! Uses a thread-local cache. + * + * Automatically upper-cases the bases coming in, unless the flag preserveCase is explicitly set. + * Automatically converts IUPAC bases to Ns, unless the flag preserveIUPAC is explicitly set. + */ +public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { + protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); + + /** do we want to print debugging information about cache efficiency? */ + private static final boolean PRINT_EFFICIENCY = false; + + /** If we are printing efficiency info, what frequency should we do it at? 
*/ + private static final int PRINT_FREQUENCY = 10000; + + /** The default cache size in bp */ + public static final long DEFAULT_CACHE_SIZE = 1000000; + + /** The cache size of this CachingIndexedFastaSequenceFile */ + private final long cacheSize; + + /** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */ + private final long cacheMissBackup; + + /** + * If true, we will preserve the case of the original base in the genome + */ + private final boolean preserveCase; + + /** + * If true, we will preserve the IUPAC bases in the genome + */ + private final boolean preserveIUPAC; + + // information about checking efficiency + long cacheHits = 0; + long cacheMisses = 0; + + /** Represents a specific cached sequence, with a specific start and stop, as well as the bases */ + private static class Cache { + long start = -1, stop = -1; + ReferenceSequence seq = null; + } + + /** + * Thread local cache to allow multi-threaded use of this class + */ + private ThreadLocal cache; + { + cache = new ThreadLocal () { + @Override protected Cache initialValue() { + return new Cache(); + } + }; + } + + /** + * Same as general constructor but allows one to override the default cacheSize + * + * @param fasta the file we will read our FASTA sequence from. 
+ * @param index the index of the fasta file, used for efficient random access + * @param cacheSize the size in bp of the cache we will use for this reader + * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case + * @param preserveIUPAC If true, we will keep the IUPAC bases in the FASTA, otherwise they are converted to Ns + */ + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) { + super(fasta, index); + if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); + this.cacheSize = cacheSize; + this.cacheMissBackup = Math.max(cacheSize / 1000, 1); + this.preserveCase = preserveCase; + this.preserveIUPAC = preserveIUPAC; + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk + * Uses provided cacheSize instead of the default + * + * @param fasta The file to open. 
+ * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 + * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case + */ + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase, final boolean preserveIUPAC) throws FileNotFoundException { + super(fasta); + if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); + this.cacheSize = cacheSize; + this.cacheMissBackup = Math.max(cacheSize / 1000, 1); + this.preserveCase = preserveCase; + this.preserveIUPAC = preserveIUPAC; + } + + /** + * Same as general constructor but allows one to override the default cacheSize + * + * By default, this CachingIndexedFastaReader converts all incoming bases to upper case + * + * @param fasta the file we will read our FASTA sequence from. + * @param index the index of the fasta file, used for efficient random access + * @param cacheSize the size in bp of the cache we will use for this reader + */ + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { + this(fasta, index, cacheSize, false, false); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk. + * This CachingIndexedFastaReader will convert all FASTA bases to upper cases under the hood + * + * @param fasta The file to open. + */ + public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException { + this(fasta, false); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk + * + * @param fasta The file to open. 
+ * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case + */ + public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException { + this(fasta, DEFAULT_CACHE_SIZE, preserveCase, false); + } + + /** + * Create reference data source from fasta file, after performing several preliminary checks on the file. + * This static utility was refactored from the constructor of ReferenceDataSource. + * Possibly may be better as an overloaded constructor. + * @param fastaFile Fasta file to be used as reference + * @return A new instance of a CachingIndexedFastaSequenceFile. + */ + public static CachingIndexedFastaSequenceFile checkAndCreate(final File fastaFile) { + // does the fasta file exist? check that first... + if (!fastaFile.exists()) + throw new UserException("The fasta file you specified (" + fastaFile.getAbsolutePath() + ") does not exist."); + + final boolean isGzipped = fastaFile.getAbsolutePath().endsWith(".gz"); + if ( isGzipped ) { + throw new UserException.CannotHandleGzippedRef(); + } + + final File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); + + // determine the name for the dict file + final String fastaExt = fastaFile.getAbsolutePath().endsWith("fa") ? "\\.fa$" : "\\.fasta$"; + final File dictFile = new File(fastaFile.getAbsolutePath().replaceAll(fastaExt, ".dict")); + + // It's an error if either the fai or dict file does not exist. The user is now responsible + // for creating these files. + if (!indexFile.exists()) { + throw new UserException.MissingReferenceFaiFile(indexFile, fastaFile); + } + if (!dictFile.exists()) { + throw new UserException.MissingReferenceDictFile(dictFile, fastaFile); + } + + // Read reference data by creating an IndexedFastaSequenceFile. 
+ try { + return new CachingIndexedFastaSequenceFile(fastaFile); + } + catch (IllegalArgumentException e) { + throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read reference sequence. The FASTA must have either a .fasta or .fa extension", e); + } + catch (Exception e) { + throw new UserException.CouldNotReadInputFile(fastaFile, e); + } + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk + * Uses provided cacheSize instead of the default + * + * @param fasta The file to open. + * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 + */ + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { + this(fasta, cacheSize, false, false); + } + + /** + * Print the efficiency (hits / queries) to logger with priority + */ + public void printEfficiency(final Priority priority) { + logger.log(priority, String.format("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%", cacheHits, cacheMisses, calcEfficiency())); + } + + /** + * Returns the efficiency (% of hits of all queries) of this object + * @return + */ + public double calcEfficiency() { + return 100.0 * cacheHits / (cacheMisses + cacheHits * 1.0); + } + + /** + * @return the number of cache hits that have occurred + */ + public long getCacheHits() { + return cacheHits; + } + + /** + * @return the number of cache misses that have occurred + */ + public long getCacheMisses() { + return cacheMisses; + } + + /** + * @return the size of the cache we are using + */ + public long getCacheSize() { + return cacheSize; + } + + /** + * Is this CachingIndexedFastaReader keeping the original case of bases in the fasta, or is + * everything being made upper case? 
+ * + * @return true if the bases coming from this reader are in the original case in the fasta, false if they are all upper cased + */ + public boolean isPreservingCase() { + return preserveCase; + } + + /** + * Is uppercasing bases? + * + * @return true if bases coming from this CachingIndexedFastaSequenceFile are all upper cased, false if this reader are in the original case in the fasta + */ + public boolean isUppercasingBases() { + return ! isPreservingCase(); + } + + /** + * Is this CachingIndexedFastaReader keeping the IUPAC bases in the fasta, or is it turning them into Ns? + * + * @return true if the IUPAC bases coming from this reader are not modified + */ + public boolean isPreservingIUPAC() { + return preserveIUPAC; + } + + /** + * Gets the subsequence of the contig in the range [start,stop] + * + * Uses the sequence cache if possible, or updates the cache to handle the request. If the range + * is larger than the cache itself, just loads the sequence directly, not changing the cache at all + * + * @param contig Contig whose subsequence to retrieve. + * @param start inclusive, 1-based start of region. + * @param stop inclusive, 1-based stop of region. + * @return The partial reference sequence associated with this range. If preserveCase is false, then + * all of the bases in the ReferenceSequence returned by this method will be upper cased. + */ + @Override + public ReferenceSequence getSubsequenceAt( final String contig, long start, final long stop ) { + final ReferenceSequence result; + final Cache myCache = cache.get(); + + if ( (stop - start) >= cacheSize ) { + cacheMisses++; + result = super.getSubsequenceAt(contig, start, stop); + if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); + if ( ! 
preserveIUPAC ) BaseUtils.convertIUPACtoN(result.getBases(), true, start < 1); + } else { + // todo -- potential optimization is to check if contig.name == contig, as this in general will be true + SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); + + if (stop > contigInfo.getSequenceLength()) + throw new SAMException("Query asks for data past end of contig"); + + if ( start < myCache.start || stop > myCache.stop || myCache.seq == null || myCache.seq.getContigIndex() != contigInfo.getSequenceIndex() ) { + cacheMisses++; + myCache.start = Math.max(start - cacheMissBackup, 0); + myCache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength()); + myCache.seq = super.getSubsequenceAt(contig, myCache.start, myCache.stop); + + // convert all of the bases in the sequence to upper case if we aren't preserving cases + if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); + if ( ! preserveIUPAC ) BaseUtils.convertIUPACtoN(myCache.seq.getBases(), true, myCache.start == 0); + } else { + cacheHits++; + } + + // at this point we determine where in the cache we want to extract the requested subsequence + final int cacheOffsetStart = (int)(start - myCache.start); + final int cacheOffsetStop = (int)(stop - start + cacheOffsetStart + 1); + + try { + result = new ReferenceSequence(myCache.seq.getName(), myCache.seq.getContigIndex(), Arrays.copyOfRange(myCache.seq.getBases(), cacheOffsetStart, cacheOffsetStop)); + } catch ( ArrayIndexOutOfBoundsException e ) { + throw new ReviewedGATKException(String.format("BUG: bad array indexing. 
Cache start %d and end %d, request start %d end %d, offset start %d and end %d, base size %d", + myCache.start, myCache.stop, start, stop, cacheOffsetStart, cacheOffsetStop, myCache.seq.getBases().length), e); + } + } + + // for debugging -- print out our efficiency if requested + if ( PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0 ) + printEfficiency(Priority.INFO); + + return result; + } +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/fasta/package-info.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/fasta/package-info.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fasta/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/file/FSLockWithShared.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/file/FSLockWithShared.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/file/FSLockWithShared.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/file/FSLockWithShared.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/fragments/FragmentCollection.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fragments/FragmentCollection.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/fragments/FragmentCollection.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fragments/FragmentCollection.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/fragments/FragmentUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fragments/FragmentUtils.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/fragments/FragmentUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/fragments/FragmentUtils.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/AlleleList.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/AlleleList.java new file mode 100644 index 000000000..bf7b3ddc6 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/AlleleList.java @@ -0,0 +1,41 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.genotyper; + +import htsjdk.variant.variantcontext.Allele; + +/** + * Created by valentin on 5/12/14. 
+ */ +public interface AlleleList { + + public int alleleCount(); + + public int alleleIndex(final A allele); + + public A alleleAt(final int index); + +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/AlleleListPermutation.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/AlleleListPermutation.java new file mode 100644 index 000000000..f5adb8a6e --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/AlleleListPermutation.java @@ -0,0 +1,35 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.genotyper; + +import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.collections.Permutation; + +/** + * Marks allele list permutation implementation classes. 
+ */ +public interface AlleleListPermutation extends Permutation, AlleleList { +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/AlleleListUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/AlleleListUtils.java new file mode 100644 index 000000000..568535ade --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/AlleleListUtils.java @@ -0,0 +1,334 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.genotyper; + +import htsjdk.variant.variantcontext.Allele; + +import java.util.AbstractList; +import java.util.List; + +/** + * Utils operations on {@link AlleleList} instances. 
+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class AlleleListUtils { + + @SuppressWarnings("unchecked") + private static final AlleleList EMPTY_LIST = new AlleleList() { + @Override + public int alleleCount() { + return 0; + } + + @Override + public int alleleIndex(final Allele allele) { + return -1; + } + + @Override + public Allele alleleAt(final int index) { + throw new IllegalArgumentException("allele index is out of range"); + } + }; + + /** + * Checks whether two allele lists are in fact the same. + * @param first one list to compare. + * @param second another list to compare. + * + * @throws IllegalArgumentException if if either list is {@code null}. + * + * @return {@code true} iff both list are equal. + */ + public static boolean equals(final AlleleList first, final AlleleList second) { + if (first == null || second == null) + throw new IllegalArgumentException("no null list allowed"); + final int alleleCount = first.alleleCount(); + if (alleleCount != second.alleleCount()) + return false; + + for (int i = 0; i < alleleCount; i++) { + final A firstSample = first.alleleAt(i); + if (firstSample == null) + throw new IllegalStateException("no null samples allowed in sample-lists: first list at " + i); + final A secondSample = second.alleleAt(i); + if (secondSample == null) + throw new IllegalArgumentException("no null samples allowed in sample-list: second list at " + i); + if (!firstSample.equals(secondSample)) + return false; + } + + return true; + } + + /** + * Resolves the index of the reference allele in an allele-list. + * + *

+ * If there is no reference allele, it returns -1. If there is more than one reference allele, + * it returns the first occurrence (lowest index). + *

+ * + * @param list the search allele-list. + * @param
allele component type. + * + * @throws IllegalArgumentException if {@code list} is {@code null}. + * + * @return -1 if there is no reference allele, or a values in [0,{@code list.alleleCount()}). + */ + public static int indexOfReference(final AlleleList list) { + if (list == null) + throw new IllegalArgumentException("the input list cannot be null"); + final int alleleCount = list.alleleCount(); + for (int i = 0; i < alleleCount; i++) + if (list.alleleAt(i).isReference()) + return i; + return -1; + } + + + /** + * Returns a {@link java.util.List} unmodifiable view of a allele-list + * @param list the sample-list to wrap. + * + * @throws IllegalArgumentException if {@code list} is {@code null}. + * + * @return never {@code null}. + */ + public static List asList(final AlleleList list) { + if (list == null) + throw new IllegalArgumentException("the list cannot be null"); + return new AsList(list); + } + + /** + * Returns an unmodifiable empty allele-list. + * @param the allele class. + * @return never {@code null}. + */ + @SuppressWarnings("unchecked") + public static final AlleleList emptyList() { + return EMPTY_LIST; + } + + /** + * Simple list view of a sample-list. + */ + private static class AsList extends AbstractList { + + private final AlleleList list; + + private AsList(final AlleleList list) { + this.list = list; + + } + + @Override + public A get(int index) { + return list.alleleAt(index); + } + + @Override + public int size() { + return list.alleleCount(); + } + } + + + /** + * Returns a permutation between two allele lists. + * @param original the original allele list. + * @param target the target allele list. + * @param the allele type. 
+ * + * @throws IllegalArgumentException if {@code original} or {@code target} is {@code null}, or + * elements in {@code target} is not contained in {@code original} + * + * @return never {@code null} + */ + public static AlleleListPermutation permutation(final AlleleList original, final AlleleList target) { + if (equals(original,target)) + return new NonPermutation<>(original); + else + return new ActualPermutation<>(original,target); + } + + private static class NonPermutation implements AlleleListPermutation { + + private final AlleleList list; + + public NonPermutation(final AlleleList original) { + list = original; + } + + @Override + public boolean isPartial() { + return false; + } + + @Override + public boolean isNonPermuted() { + return true; + } + + @Override + public int toIndex(int fromIndex) { + return fromIndex; + } + + @Override + public int fromIndex(int toIndex) { + return toIndex; + } + + @Override + public int fromSize() { + return list.alleleCount(); + } + + @Override + public int toSize() { + return list.alleleCount(); + } + + @Override + public List fromList() { + return asList(list); + } + + @Override + public java.util.List toList() { + return asList(list); + } + + + @Override + public int alleleCount() { + return list.alleleCount(); + } + + @Override + public int alleleIndex(final A allele) { + return list.alleleIndex(allele); + } + + @Override + public A alleleAt(final int index) { + return list.alleleAt(index); + } + } + + private static class ActualPermutation implements AlleleListPermutation { + + private final AlleleList from; + + private final AlleleList to; + + private final int[] fromIndex; + + private final boolean nonPermuted; + + private final boolean isPartial; + + private ActualPermutation(final AlleleList original, final AlleleList target) { + this.from = original; + this.to = target; + final int toSize = target.alleleCount(); + final int fromSize = original.alleleCount(); + if (fromSize < toSize) + throw new 
IllegalArgumentException("target allele list is not a permutation of the original allele list"); + + fromIndex = new int[toSize]; + boolean nonPermuted = fromSize == toSize; + this.isPartial = !nonPermuted; + for (int i = 0; i < toSize; i++) { + final int originalIndex = original.alleleIndex(target.alleleAt(i)); + if (originalIndex < 0) + throw new IllegalArgumentException("target allele list is not a permutation of the original allele list"); + fromIndex[i] = originalIndex; + nonPermuted &= originalIndex == i; + } + + this.nonPermuted = nonPermuted; + } + + @Override + public boolean isPartial() { + return isPartial; + } + + @Override + public boolean isNonPermuted() { + return nonPermuted; + } + + @Override + public int toIndex(int fromIndex) { + return to.alleleIndex(from.alleleAt(fromIndex)); + } + + @Override + public int fromIndex(int toIndex) { + return fromIndex[toIndex]; + } + + @Override + public int fromSize() { + return from.alleleCount(); + } + + @Override + public int toSize() { + return to.alleleCount(); + } + + @Override + public List fromList() { + return asList(from); + } + + @Override + public List toList() { + return asList(to); + } + + @Override + public int alleleCount() { + return to.alleleCount(); + } + + @Override + public int alleleIndex(final A allele) { + return to.alleleIndex(allele); + } + + @Override + public A alleleAt(final int index) { + return to.alleleAt(index); + } + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/DiploidGenotype.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/DiploidGenotype.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/DiploidGenotype.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/DiploidGenotype.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/IndexedAlleleList.java 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/IndexedAlleleList.java new file mode 100644 index 000000000..d6530238b --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/IndexedAlleleList.java @@ -0,0 +1,95 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.genotyper; + +import htsjdk.variant.variantcontext.Allele; +import org.broadinstitute.gatk.utils.collections.IndexedSet; + +import java.util.Collection; + +/** + * Allele list implementation using and indexed-set. 
+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class IndexedAlleleList implements AlleleList { + + private final IndexedSet alleles; + + /** + * Constructs a new empty allele-list + */ + public IndexedAlleleList() { + alleles = new IndexedSet<>(); + } + + /** + * Constructs a new allele-list from an array of alleles. + * + *

+ * Repeats in the input array will be ignored (keeping the first one). The order of alleles in the + * resulting list is the same as in the natural traversal of the input collection. + * + *

+ * @param alleles the original allele array + * + * @throws java.lang.IllegalArgumentException if {@code alleles} is {@code null} or contains {@code null}s. + */ + public IndexedAlleleList(final A ... alleles) { + this.alleles = new IndexedSet<>(alleles); + } + + /** + * Constructs a new allele-list from a collection of alleles. + * + *

+ * Repeats in the input collection will be ignored (keeping the first one). The order of alleles in the + * resulting list is the same as in the natural traversal of the input collection. + * + *

+ * @param alleles the original allele collection + * + * @throws java.lang.IllegalArgumentException if {@code alleles} is {@code null} or contains {@code null}s. + */ + public IndexedAlleleList(final Collection
alleles) { + this.alleles = new IndexedSet<>(alleles); + } + + @Override + public int alleleCount() { + return alleles.size(); + } + + @Override + public int alleleIndex(final A allele) { + return alleles.indexOf(allele); + } + + @Override + public A alleleAt(final int index) { + return alleles.get(index); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/IndexedSampleList.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/IndexedSampleList.java new file mode 100644 index 000000000..7a92b4e24 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/IndexedSampleList.java @@ -0,0 +1,96 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.genotyper; + +import org.broadinstitute.gatk.utils.collections.IndexedSet; + +import java.util.Collection; + +/** + * Simple implementation of a sample-list using and indexed-set. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class IndexedSampleList implements SampleList { + + private final IndexedSet samples; + + /** + * Constructs an empty sample-list. + */ + public IndexedSampleList() { + samples = new IndexedSet<>(0); + } + + /** + * Constructs a sample-list from a collection of samples. + * + *

+ * Repeats in the input collection are ignored (just the first occurrence is kept). + * Sample names will be sorted based on the traversal order + * of the original collection. + *

+ * + * @param samples input sample collection. + * + * @throws IllegalArgumentException if {@code samples} is {@code null} or it contains {@code nulls}. + */ + public IndexedSampleList(final Collection samples) { + this.samples = new IndexedSet<>(samples); + } + + /** + * Constructs a sample-list from an array of samples. + * + *

+ * Repeats in the input array are ignored (just the first occurrence is kept). + * Sample names will be sorted based on the traversal order + * of the original array. + *

+ * + * @param samples input sample array. + * + * @throws IllegalArgumentException if {@code samples} is {@code null} or it contains {@code nulls}. + */ + public IndexedSampleList(final String ... samples) { + this.samples = new IndexedSet<>(samples); + } + + @Override + public int sampleCount() { + return samples.size(); + } + + @Override + public int sampleIndex(final String sample) { + return samples.indexOf(sample); + } + + @Override + public String sampleAt(int sampleIndex) { + return samples.get(sampleIndex); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/MostLikelyAllele.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/MostLikelyAllele.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/genotyper/MostLikelyAllele.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/MostLikelyAllele.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMap.java new file mode 100644 index 000000000..56d12d026 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -0,0 +1,413 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or 
substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.genotyper; + + +import com.google.java.contract.Ensures; +import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.pileup.PileupElement; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; +import org.broadinstitute.gatk.utils.sam.AlignmentUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import htsjdk.variant.variantcontext.Allele; + +import java.util.*; + +/** + * Wrapper class that holds a set of maps of the form (Read -> Map(Allele->Double)) + * For each read, this holds underlying alleles represented by an aligned read, and corresponding relative likelihood. + */ +public class PerReadAlleleLikelihoodMap { + /** A set of all of the allele, so we can efficiently determine if an allele is already present */ + private final Map allelesSet = new HashMap<>(); + /** A list of the unique allele, as an ArrayList so we can call get(i) efficiently */ + protected final List alleles = new ArrayList<>(); + + + + protected final Map> likelihoodReadMap = new LinkedHashMap<>(); + + public PerReadAlleleLikelihoodMap() { } + + /** + * Add a new entry into the Read -> ( Allele -> Likelihood ) map of maps. 
+ * @param read - the GATKSAMRecord that was evaluated + * @param a - the Allele against which the GATKSAMRecord was evaluated + * @param likelihood - the likelihood score resulting from the evaluation of "read" against "a" + */ + public void add(final GATKSAMRecord read, final Allele a, final Double likelihood) { + if ( read == null ) throw new IllegalArgumentException("Cannot add a null read to the allele likelihood map"); + if ( a == null ) throw new IllegalArgumentException("Cannot add a null allele to the allele likelihood map"); + if ( likelihood == null ) throw new IllegalArgumentException("Likelihood cannot be null"); + if ( likelihood > 0.0 ) throw new IllegalArgumentException("Likelihood must be negative (L = log(p))"); + + if (!allelesSet.containsKey(a)) { + allelesSet.put(a,alleles.size()); + alleles.add(a); + } + Map likelihoodMap = likelihoodReadMap.get(read); + if (likelihoodMap == null){ + // LinkedHashMap will ensure iterating through alleles will be in consistent order + likelihoodMap = new LinkedHashMap<>(); + likelihoodReadMap.put(read,likelihoodMap); + } + + likelihoodMap.put(a,likelihood); + + + } + + public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction) { + return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction); + } + + /** + * For each allele "a" , identify those reads whose most likely allele is "a", and remove a "downsamplingFraction" proportion + * of those reads from the "likelihoodReadMap". This is used for e.g. sample contamination + * @param downsamplingFraction - the fraction of supporting reads to remove from each allele. If <=0 all reads kept, if >=1 all reads tossed. 
+ */ + public void performPerAlleleDownsampling(final double downsamplingFraction) { + // special case removal of all or no reads + if ( downsamplingFraction <= 0.0 ) + return; + if ( downsamplingFraction >= 1.0 ) { + likelihoodReadMap.clear(); + return; + } + + // start by stratifying the reads by the alleles they represent at this position + final Map> alleleReadMap = getAlleleStratifiedReadMap(); + + // compute the reads to remove and actually remove them + final List readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction); + for ( final GATKSAMRecord read : readsToRemove ) + likelihoodReadMap.remove(read); + } + + /** + * Convert the @likelihoodReadMap to a map of alleles to reads, where each read is mapped uniquely to the allele + * for which it has the greatest associated likelihood + * @return a map from each allele to a list of reads that 'support' the allele + */ + protected Map> getAlleleStratifiedReadMap() { + final Map> alleleReadMap = new HashMap<>(alleles.size()); + for ( final Allele allele : alleles ) + alleleReadMap.put(allele, new ArrayList()); + + for ( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { + final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue()); + if ( bestAllele.isInformative() ) + alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey()); + } + + return alleleReadMap; + } + + @Ensures("result >=0") + public int size() { + return likelihoodReadMap.size(); + } + + /** + * Helper function to add the read underneath a pileup element to the map + * @param p Pileup element + * @param a Corresponding allele + * @param likelihood Allele likelihood + */ + public void add(PileupElement p, Allele a, Double likelihood) { + if (p==null) + throw new IllegalArgumentException("Pileup element cannot be null"); + if ( p.getRead()==null ) + throw new IllegalArgumentException("Read underlying pileup element cannot be null"); + if ( a == null ) + throw new 
IllegalArgumentException("Allele for add() cannot be null"); + + add(p.getRead(), a, likelihood); + } + + /** + * Does the current map contain the key associated with a particular SAM record in pileup? + * @param p Pileup element + * @return true if the map contains pileup element, else false + */ + public boolean containsPileupElement(final PileupElement p) { + return likelihoodReadMap.containsKey(p.getRead()); + } + + public boolean isEmpty() { + return likelihoodReadMap.isEmpty(); + } + + public Map> getLikelihoodReadMap() { + return likelihoodReadMap; + } + + public void clear() { + allelesSet.clear(); + alleles.clear(); + likelihoodReadMap.clear(); + } + + public Set getStoredElements() { + return likelihoodReadMap.keySet(); + } + +// public Collection> getLikelihoodMapValues() { +// return likelihoodReadMap.values(); +// } + + public int getNumberOfStoredElements() { + return likelihoodReadMap.size(); + } + + public Map getLikelihoodsAssociatedWithPileupElement(final PileupElement p) { + if (!likelihoodReadMap.containsKey(p.getRead())) + return null; + + return likelihoodReadMap.get(p.getRead()); + } + + + /** + * Get the log10 likelihood associated with an individual read/allele + * + * @param read the read whose likelihood we want + * @param allele the allele whose likelihood we want + * @return the log10 likelihood that this read matches this allele + */ + public double getLikelihoodAssociatedWithReadAndAllele(final GATKSAMRecord read, final Allele allele){ + if (!allelesSet.containsKey(allele) || !likelihoodReadMap.containsKey(read)) + return 0.0; + + return likelihoodReadMap.get(read).get(allele); + } + + /** + * Get the most likely alleles estimated across all reads in this object + * + * Takes the most likely two alleles according to their diploid genotype likelihoods. That is, for + * each allele i and j we compute p(D | i,j) where D is the read likelihoods. 
We track the maximum + * i,j likelihood and return an object that contains the alleles i and j as well as the max likelihood. + * + * Note that the second most likely diploid genotype is not tracked so the resulting MostLikelyAllele + * doesn't have a meaningful get best likelihood. + * + * @return a MostLikelyAllele object, or null if this map is empty + */ + public MostLikelyAllele getMostLikelyDiploidAlleles() { + if ( isEmpty() ) return null; + + int hap1 = 0; + int hap2 = 0; + double maxElement = Double.NEGATIVE_INFINITY; + for( int iii = 0; iii < alleles.size(); iii++ ) { + final Allele iii_allele = alleles.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Allele jjj_allele = alleles.get(jjj); + + double haplotypeLikelihood = 0.0; + for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + final double likelihood_iii = entry.getValue().get(iii_allele); + final double likelihood_jjj = entry.getValue().get(jjj_allele); + haplotypeLikelihood += MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + MathUtils.LOG_ONE_HALF; + + // fast exit. If this diploid pair is already worse than the max, just stop and look at the next pair + if ( haplotypeLikelihood < maxElement ) break; + } + + // keep track of the max element and associated indices + if ( haplotypeLikelihood > maxElement ) { + hap1 = iii; + hap2 = jjj; + maxElement = haplotypeLikelihood; + } + } + } + + if ( maxElement == Double.NEGATIVE_INFINITY ) + throw new IllegalStateException("max likelihood is " + maxElement + " indicating something has gone wrong"); + + return new MostLikelyAllele(alleles.get(hap1), alleles.get(hap2), maxElement, maxElement); + } + + /** + * Given a map from alleles to likelihoods, find the allele with the largest likelihood. 
+ * + * @param alleleMap - a map from alleles to likelihoods + * @return - a MostLikelyAllele object + */ + @Ensures("result != null") + public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap ) { + return getMostLikelyAllele(alleleMap, null); + } + + /** + * Given a map from alleles to likelihoods, find the allele with the largest likelihood. + * + * @param alleleMap - a map from alleles to likelihoods + * @param onlyConsiderTheseAlleles if not null, we will only consider alleles in this set for being one of the best. + * this is useful for the case where you've selected a subset of the alleles that + * the reads have been computed for further analysis. If null totally ignored + * @return - a MostLikelyAllele object + */ + public static MostLikelyAllele getMostLikelyAllele( final Map alleleMap, final Set onlyConsiderTheseAlleles ) { + if ( alleleMap == null ) throw new IllegalArgumentException("The allele to likelihood map cannot be null"); + double maxLike = Double.NEGATIVE_INFINITY; + double prevMaxLike = Double.NEGATIVE_INFINITY; + Allele mostLikelyAllele = Allele.NO_CALL; + Allele secondMostLikely = null; + + for (final Map.Entry el : alleleMap.entrySet()) { + if ( onlyConsiderTheseAlleles != null && ! 
onlyConsiderTheseAlleles.contains(el.getKey()) ) + continue; + + if (el.getValue() > maxLike) { + prevMaxLike = maxLike; + maxLike = el.getValue(); + secondMostLikely = mostLikelyAllele; + mostLikelyAllele = el.getKey(); + } else if( el.getValue() > prevMaxLike ) { + secondMostLikely = el.getKey(); + prevMaxLike = el.getValue(); + } + } + + return new MostLikelyAllele(mostLikelyAllele, secondMostLikely, maxLike, prevMaxLike); + } + + /** + * Debug method to dump contents of object into string for display + */ + public String toString() { + final StringBuilder sb = new StringBuilder(); + + sb.append("Alelles in map:"); + for (final Allele a:alleles) { + sb.append(a.getDisplayString()+","); + } + sb.append("\n"); + for (final Map.Entry > el : getLikelihoodReadMap().entrySet() ) { + for (final Map.Entry eli : el.getValue().entrySet()) { + sb.append("Read "+el.getKey().getReadName()+". Allele:"+eli.getKey().getDisplayString()+" has likelihood="+Double.toString(eli.getValue())+"\n"); + } + + } + return sb.toString(); + } + + /** + * Remove reads from this map that are poorly modelled w.r.t. their per allele likelihoods + * + * Goes through each read in this map, and if it is poorly modelled removes it from the map. + * + * @see #readIsPoorlyModelled(org.broadinstitute.gatk.utils.sam.GATKSAMRecord, java.util.Collection, double) + * for more information about the poorly modelled test. 
+ * + * @param maxErrorRatePerBase see equivalent parameter in #readIsPoorlyModelled + * @return the list of reads removed from this map because they are poorly modelled + */ + public List filterPoorlyModelledReads(final double maxErrorRatePerBase) { + final List removedReads = new LinkedList<>(); + final Iterator>> it = likelihoodReadMap.entrySet().iterator(); + while ( it.hasNext() ) { + final Map.Entry> record = it.next(); + if ( readIsPoorlyModelled(record.getKey(), record.getValue().values(), maxErrorRatePerBase) ) { + it.remove(); + removedReads.add(record.getKey()); + } + } + + return removedReads; + } + + /** + * Is this read poorly modelled by all of the alleles in this map? + * + * A read is poorly modeled when it's likelihood is below what would be expected for a read + * originating from one of the alleles given the maxErrorRatePerBase of the reads in general. + * + * This function makes a number of key assumptions. First, that the likelihoods reflect the total likelihood + * of the read. In other words, that the read would be fully explained by one of the alleles. This means + * that the allele should be something like the full haplotype from which the read might originate. + * + * It further assumes that each error in the read occurs with likelihood of -3 (Q30 confidence per base). So + * a read with a 10% error rate with Q30 bases that's 100 bp long we'd expect to see 10 real Q30 errors + * even against the true haplotype. So for this read to be well modelled by at least one allele we'd expect + * a likelihood to be >= 10 * -3. + * + * @param read the read we want to evaluate + * @param log10Likelihoods a list of the log10 likelihoods of the read against a set of haplotypes. + * @param maxErrorRatePerBase the maximum error rate we'd expect for this read per base, in real space. 
So + * 0.01 means a 1% error rate + * @return true if none of the log10 likelihoods imply that the read truly originated from one of the haplotypes + */ + protected boolean readIsPoorlyModelled(final GATKSAMRecord read, final Collection log10Likelihoods, final double maxErrorRatePerBase) { + final double maxErrorsForRead = Math.min(2.0, Math.ceil(read.getReadLength() * maxErrorRatePerBase)); + final double log10QualPerBase = -4.0; + final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; + + for ( final double log10Likelihood : log10Likelihoods ) + if ( log10Likelihood >= log10MaxLikelihoodForTrueAllele ) + return false; + + return true; + } + + /** + * Get an unmodifiable set of the unique alleles in this PerReadAlleleLikelihoodMap + * @return a non-null unmodifiable map + */ + public Set getAllelesSet() { + return Collections.unmodifiableSet(allelesSet.keySet()); + } + + /** + * Loop over all of the reads in this likelihood map and realign them to its most likely haplotype + * @param haplotypes the collection of haplotypes + * @param paddedReferenceLoc the active region + */ + public void realignReadsToMostLikelyHaplotype(final Collection haplotypes, final GenomeLoc paddedReferenceLoc) { + + // we need to remap the Alleles back to the Haplotypes; inefficient but unfortunately this is a requirement currently + final Map alleleToHaplotypeMap = new HashMap<>(haplotypes.size()); + for ( final Haplotype haplotype : haplotypes ) + alleleToHaplotypeMap.put(Allele.create(haplotype.getBases()), haplotype); + + final Map> newLikelihoodReadMap = new LinkedHashMap<>(likelihoodReadMap.size()); + for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { + final MostLikelyAllele bestAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(entry.getValue()); + final GATKSAMRecord alignedToRef = AlignmentUtils.createReadAlignedToRef(entry.getKey(), alleleToHaplotypeMap.get(bestAllele.getMostLikelyAllele()), paddedReferenceLoc.getStart(), 
bestAllele.isInformative()); + newLikelihoodReadMap.put(alignedToRef, entry.getValue()); + } + + likelihoodReadMap.clear(); + likelihoodReadMap.putAll(newLikelihoodReadMap); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java new file mode 100644 index 000000000..a792386e2 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/ReadLikelihoods.java @@ -0,0 +1,1586 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.genotyper; + +import htsjdk.variant.variantcontext.Allele; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.objects.Object2IntMap; +import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import org.broadinstitute.gatk.utils.downsampling.AlleleBiasedDownsamplingUtils; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; + +import java.util.*; + +/** + * Read-likelihoods container implementation based on integer indexed arrays. + * + * @param
the type of the allele the likelihood makes reference to. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class ReadLikelihoods implements SampleList, AlleleList, Cloneable { + + /** + * Reads by sample index. Each sub array contains reference to the reads of the ith sample. + */ + private GATKSAMRecord[][] readsBySampleIndex; + + /** + * Indexed per sample, allele and finally read (within sample). + *

+ * valuesBySampleIndex[s][a][r] == lnLk(R_r | A_a) where R_r comes from Sample s. + *

+ */ + private double[][][] valuesBySampleIndex; + + /** + * Sample list + */ + private final SampleList samples; + + /** + * Allele list + */ + private AlleleList
alleles; + + /** + * Cached allele list. + */ + private List alleleList; + + /** + * Cached sample list. + */ + private List sampleList; + + /** + * Maps from each read to its index within the sample. + * + *

In order to save CPU time the indices contained in this array (not the array itself) is + * lazily initialized by invoking {@link #readIndexBySampleIndex(int)}.

+ */ + private final Object2IntMap[] readIndexBySampleIndex; + + /** + * Index of the reference allele if any, otherwise -1 + */ + private int referenceAlleleIndex = -1; + + /** + * Caches the read-list per sample list returned by {@link #sampleReads} + */ + private final List[] readListBySampleIndex; + + /** + * Sample matrices lazily initialized (the elements not the array) by invoking {@link #sampleMatrix(int)}. + */ + private final Matrix
[] sampleMatrices; + + /** + * Constructs a new read-likelihood collection. + * + *

+ * The initial likelihoods for all allele-read combinations are + * 0. + *

+ * + * @param samples all supported samples in the collection. + * @param alleles all supported alleles in the collection. + * @param reads reads stratified per sample. + * + * @throws IllegalArgumentException if any of {@code allele}, {@code samples} + * or {@code reads} is {@code null}, + * or if they contain null values. + */ + @SuppressWarnings("unchecked") + public ReadLikelihoods(final SampleList samples, final AlleleList
alleles, + final Map> reads) { + if (alleles == null) + throw new IllegalArgumentException("allele list cannot be null"); + if (samples == null) + throw new IllegalArgumentException("sample list cannot be null"); + if (reads == null) + throw new IllegalArgumentException("read map cannot be null"); + + this.samples = samples; + this.alleles = alleles; + + final int sampleCount = samples.sampleCount(); + final int alleleCount = alleles.alleleCount(); + + readsBySampleIndex = new GATKSAMRecord[sampleCount][]; + readListBySampleIndex = new List[sampleCount]; + valuesBySampleIndex = new double[sampleCount][][]; + referenceAlleleIndex = findReferenceAllele(alleles); + + readIndexBySampleIndex = new Object2IntMap[sampleCount]; + + setupIndexes(reads, sampleCount, alleleCount); + + sampleMatrices = (Matrix[]) new Matrix[sampleCount]; + } + + // Add all the indices to alleles, sample and reads in the look-up maps. + private void setupIndexes(final Map> reads, final int sampleCount, final int alleleCount) { + for (int i = 0; i < sampleCount; i++) + setupSampleData(i, reads, alleleCount); + } + + // Assumes that {@link #samples} has been initialized with the sample names. + private void setupSampleData(final int sampleIndex, final Map> readsBySample, + final int alleleCount) { + final String sample = samples.sampleAt(sampleIndex); + + final List reads = readsBySample.get(sample); + readsBySampleIndex[sampleIndex] = reads == null + ? 
new GATKSAMRecord[0] + : reads.toArray(new GATKSAMRecord[reads.size()]); + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + + final double[][] sampleValues = new double[alleleCount][sampleReadCount]; + valuesBySampleIndex[sampleIndex] = sampleValues; + } + + /** + * Create an independent copy of this read-likelihoods collection + */ + public ReadLikelihoods clone() { + + final int sampleCount = samples.sampleCount(); + final int alleleCount = alleles.alleleCount(); + + final double[][][] newLikelihoodValues = new double[sampleCount][alleleCount][]; + + @SuppressWarnings("unchecked") + final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; + final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; + + for (int s = 0; s < sampleCount; s++) { + newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); + for (int a = 0; a < alleleCount; a++) + newLikelihoodValues[s][a] = valuesBySampleIndex[s][a].clone(); + } + + // Finally we create the new read-likelihood + return new ReadLikelihoods<>(alleles, samples, + newReadsBySampleIndex, + newReadIndexBySampleIndex, newLikelihoodValues); + } + + // Internally used constructor. + @SuppressWarnings("unchecked") + private ReadLikelihoods(final AlleleList alleles, final SampleList samples, + final GATKSAMRecord[][] readsBySampleIndex, final Object2IntMap[] readIndex, + final double[][][] values) { + this.samples = samples; + this.alleles = alleles; + this.readsBySampleIndex = readsBySampleIndex; + this.valuesBySampleIndex = values; + this.readIndexBySampleIndex = readIndex; + final int sampleCount = samples.sampleCount(); + this.readListBySampleIndex = new List[sampleCount]; + + referenceAlleleIndex = findReferenceAllele(alleles); + sampleMatrices = (Matrix[]) new Matrix[sampleCount]; + } + + // Search for the reference allele, if not found the index is -1. 
+ private int findReferenceAllele(final AlleleList alleles) { + final int alleleCount = alleles.alleleCount(); + for (int i = 0; i < alleleCount; i++) + if (alleles.alleleAt(i).isReference()) + return i; + return -1; + } + + /** + * Returns the index of a sample within the likelihood collection. + * + * @param sample the query sample. + * + * @throws IllegalArgumentException if {@code sample} is {@code null}. + * @return -1 if the allele is not included, 0 or greater otherwise. + */ + public int sampleIndex(final String sample) { + return samples.sampleIndex(sample); + } + + /** + * Number of samples included in the likelihood collection. + * @return 0 or greater. + */ + public int sampleCount() { + return samples.sampleCount(); + } + + /** + * Returns sample name given its index. + * + * @param sampleIndex query index. + * + * @throws IllegalArgumentException if {@code sampleIndex} is negative. + * + * @return never {@code null}. + */ + public String sampleAt(final int sampleIndex) { + return samples.sampleAt(sampleIndex); + } + + /** + * Returns the index of an allele within the likelihood collection. + * + * @param allele the query allele. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * + * @return -1 if the allele is not included, 0 or greater otherwise. + */ + public int alleleIndex(final A allele) { + return alleles.alleleIndex(allele); + } + + /** + * Returns number of alleles in the collection. + * @return 0 or greater. + */ + @SuppressWarnings("unused") + public int alleleCount() { + return alleles.alleleCount(); + } + + /** + * Returns the allele given its index. + * + * @param alleleIndex the allele index. + * + * @throws IllegalArgumentException the allele index is {@code null}. + * + * @return never {@code null}. + */ + public A alleleAt(final int alleleIndex) { + return alleles.alleleAt(alleleIndex); + } + + /** + * Returns the reads that belong to a sample sorted by their index (within that sample). 
+ * + * @param sampleIndex the requested sample. + * @return never {@code null} but perhaps a zero-length array if there is no reads in sample. No element in + * the array will be null. + */ + public List sampleReads(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final List extantList = readListBySampleIndex[sampleIndex]; + if (extantList == null) + return readListBySampleIndex[sampleIndex] = Collections.unmodifiableList(Arrays.asList(readsBySampleIndex[sampleIndex])); + else + return extantList; + } + + /** + * Returns a read vs allele likelihood matrix corresponding to a sample. + * + * @param sampleIndex target sample. + * + * @throws IllegalArgumentException if {@code sampleIndex} is not null. + * + * @return never {@code null} + */ + public Matrix sampleMatrix(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final Matrix extantResult = sampleMatrices[sampleIndex]; + if (extantResult != null) + return extantResult; + else + return sampleMatrices[sampleIndex] = new SampleMatrix(sampleIndex); + } + + /** + * Adjusts likelihoods so that for each read, the best allele likelihood is 0 and caps the minimum likelihood + * of any allele for each read based on the maximum alternative allele likelihood. + * + * @param bestToZero set the best likelihood to 0, others will be subtracted the same amount. + * @param maximumLikelihoodDifferenceCap maximum difference between the best alternative allele likelihood + * and any other likelihood. + * + * @throws IllegalArgumentException if {@code maximumDifferenceWithBestAlternative} is not 0 or less. 
+ */ + public void normalizeLikelihoods(final boolean bestToZero, final double maximumLikelihoodDifferenceCap) { + if (maximumLikelihoodDifferenceCap >= 0.0 || Double.isNaN(maximumLikelihoodDifferenceCap)) + throw new IllegalArgumentException("the minimum reference likelihood fall cannot be positive"); + + if (maximumLikelihoodDifferenceCap == Double.NEGATIVE_INFINITY && !bestToZero) + return; + + final int alleleCount = alleles.alleleCount(); + if (alleleCount == 0) // trivial case there is no alleles. + return; + else if (alleleCount == 1 && !bestToZero) + return; + + for (int s = 0; s < valuesBySampleIndex.length; s++) { + final double[][] sampleValues = valuesBySampleIndex[s]; + final int readCount = readsBySampleIndex[s].length; + for (int r = 0; r < readCount; r++) + normalizeLikelihoodsPerRead(bestToZero, maximumLikelihoodDifferenceCap, sampleValues, s, r); + } + } + + // Does the normalizeLikelihoods job for each read. + private void normalizeLikelihoodsPerRead(final boolean bestToZero, final double maximumBestAltLikelihoodDifference, + final double[][] sampleValues, final int sampleIndex, final int readIndex) { + + final BestAllele bestAlternativeAllele = searchBestAllele(sampleIndex,readIndex,false); + + final double worstLikelihoodCap = bestAlternativeAllele.likelihood + maximumBestAltLikelihoodDifference; + + final double referenceLikelihood = referenceAlleleIndex == -1 ? Double.NEGATIVE_INFINITY : + sampleValues[referenceAlleleIndex][readIndex]; + + + final double bestAbsoluteLikelihood = Math.max(bestAlternativeAllele.likelihood,referenceLikelihood); + + final int alleleCount = alleles.alleleCount(); + if (bestToZero) { + if (bestAbsoluteLikelihood == Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleleCount; a++) + sampleValues[a][readIndex] = 0; + else if (worstLikelihoodCap != Double.NEGATIVE_INFINITY) + for (int a = 0; a < alleleCount; a++) + sampleValues[a][readIndex] = (sampleValues[a][readIndex] < worstLikelihoodCap ? 
worstLikelihoodCap : sampleValues[a][readIndex]) - bestAbsoluteLikelihood; + else + for (int a = 0; a < alleleCount; a++) + sampleValues[a][readIndex] -= bestAbsoluteLikelihood; + } else // else if (maximumReferenceLikelihoodFall != Double.NEGATIVE_INFINITY ) { // + // Guarantee to be the case by enclosing code. + for (int a = 0; a < alleleCount; a++) + if (sampleValues[a][readIndex] < worstLikelihoodCap) + sampleValues[a][readIndex] = worstLikelihoodCap; + } + + /** + * Returns the samples in this read-likelihood collection. + *

+ * Samples are sorted by their index in the collection. + *

+ * + *

+ * The returned list is an unmodifiable view on the read-likelihoods sample list. + *

+ * + * @return never {@code null}. + */ + public List samples() { + return sampleList == null ? sampleList = SampleListUtils.asList(samples) : sampleList; + + } + + /** + * Returns the samples in this read-likelihood collection. + *

+ * Samples are sorted by their index in the collection. + *

+ * + *

+ * The returned list is an unmodifiable. It will not be updated if the collection + * allele list changes. + *

+ * + * @return never {@code null}. + */ + public List
alleles() { + return alleleList == null ? alleleList = AlleleListUtils.asList(alleles) : alleleList; + } + + + /** + * Search the best allele for a read. + * + * @param sampleIndex including sample index. + * @param readIndex target read index. + * + * @return never {@code null}, but with {@link BestAllele#allele allele} == {@code null} + * if non-could be found. + */ + private BestAllele searchBestAllele(final int sampleIndex, final int readIndex, final boolean canBeReference) { + final int alleleCount = alleles.alleleCount(); + if (alleleCount == 0 || (alleleCount == 1 && referenceAlleleIndex == 0 && !canBeReference)) + return new BestAllele(sampleIndex,readIndex,-1,Double.NEGATIVE_INFINITY,Double.NEGATIVE_INFINITY); + + final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; + int bestAlleleIndex = canBeReference || referenceAlleleIndex != 0 ? 0 : 1; + + double bestLikelihood = sampleValues[bestAlleleIndex][readIndex]; + double secondBestLikelihood = Double.NEGATIVE_INFINITY; + for (int a = bestAlleleIndex + 1; a < alleleCount; a++) { + if (!canBeReference && referenceAlleleIndex == a) + continue; + final double candidateLikelihood = sampleValues[a][readIndex]; + if (candidateLikelihood > bestLikelihood) { + bestAlleleIndex = a; + secondBestLikelihood = bestLikelihood; + bestLikelihood = candidateLikelihood; + } else if (candidateLikelihood > secondBestLikelihood) { + secondBestLikelihood = candidateLikelihood; + } + } + return new BestAllele(sampleIndex,readIndex,bestAlleleIndex,bestLikelihood,secondBestLikelihood); + } + + public void changeReads(final Map readRealignments) { + final int sampleCount = samples.sampleCount(); + for (int s = 0; s < sampleCount; s++) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final Object2IntMap readIndex = readIndexBySampleIndex[s]; + final int sampleReadCount = sampleReads.length; + for (int r = 0; r < sampleReadCount; r++) { + final GATKSAMRecord read = sampleReads[r]; + final GATKSAMRecord 
replacement = readRealignments.get(read); + if (replacement == null) + continue; + sampleReads[r] = replacement; + if (readIndex != null) { + readIndex.remove(read); + readIndex.put(replacement, r); + } + } + } + } + + /** + * Add alleles that are missing in the read-likelihoods collection giving all reads a default + * likelihood value. + * @param candidateAlleles the potentially missing alleles. + * @param defaultLikelihood the default read likelihood value for that allele. + * + * @throws IllegalArgumentException if {@code candidateAlleles} is {@code null} or there is more than + * one missing allele that is a reference or there is one but the collection already has + * a reference allele. + */ + public void addMissingAlleles(final Collection candidateAlleles, final double defaultLikelihood) { + if (candidateAlleles == null) + throw new IllegalArgumentException("the candidateAlleles list cannot be null"); + if (candidateAlleles.isEmpty()) + return; + final List allelesToAdd = new ArrayList<>(candidateAlleles.size()); + for (final A allele : candidateAlleles) + if (alleles.alleleIndex(allele) == -1) + allelesToAdd.add(allele); + + if (allelesToAdd.isEmpty()) + return; + + final int oldAlleleCount = alleles.alleleCount(); + final int newAlleleCount = alleles.alleleCount() + allelesToAdd.size(); + + alleleList = null; + int referenceIndex = this.referenceAlleleIndex; + @SuppressWarnings("unchecked") + final A[] newAlleles = (A[]) new Allele[newAlleleCount]; + for (int a = 0; a < oldAlleleCount; a++) + newAlleles[a] = this.alleleAt(a); + int newIndex = oldAlleleCount; + for (final A allele : allelesToAdd) { + if (allele.isReference()) { + if (referenceIndex != -1) + throw new IllegalArgumentException("there cannot be more than one reference allele"); + referenceIndex = newIndex; + } + newAlleles[newIndex++] = allele; + } + + alleles = new IndexedAlleleList<>(newAlleles); + + if (referenceIndex != -1) + referenceAlleleIndex = referenceIndex; + + final int sampleCount 
= samples.sampleCount(); + for (int s = 0; s < sampleCount; s++) { + final int sampleReadCount = readsBySampleIndex[s].length; + final double[][] newValuesBySampleIndex = Arrays.copyOf(valuesBySampleIndex[s],newAlleleCount); + for (int a = oldAlleleCount; a < newAlleleCount; a++) { + newValuesBySampleIndex[a] = new double[sampleReadCount]; + if (defaultLikelihood != 0.0) + Arrays.fill(newValuesBySampleIndex[a],defaultLikelihood); + } + valuesBySampleIndex[s] = newValuesBySampleIndex; + } + } + + /** + * Likelihood matrix between a set of alleles and reads. + * @param the allele-type. + */ + public interface Matrix extends AlleleList { + + /** + * List of reads in the matrix sorted by their index therein. + * @return never {@code null}. + */ + public List reads(); + + /** + * List of alleles in the matrix sorted by their index in the collection. + * @return never {@code null}. + */ + public List alleles(); + + /** + * Set the likelihood of a read given an allele through their indices. + * + * @param alleleIndex the target allele index. + * @param readIndex the target read index. + * @param value new likelihood value for the target read give the target allele. + * + * @throws IllegalArgumentException if {@code alleleIndex} or {@code readIndex} + * are not valid allele and read indices respectively. + */ + public void set(final int alleleIndex, final int readIndex, final double value); + + /** + * Returns the likelihood of a read given a haplotype. + * + * @param alleleIndex the index of the given haplotype. + * @param readIndex the index of the target read. + * + * @throws IllegalArgumentException if {@code alleleIndex} or {@code readIndex} is not a + * valid allele or read index respectively. + * + * @return the requested likelihood, whatever value was provided using {@link #set(int,int,double) set} + * or 0.0 if none was set. + */ + public double get(final int alleleIndex, final int readIndex); + + /** + * Queries the index of an allele in the matrix. 
+ * + * @param allele the target allele. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * @return -1 if such allele does not exist, otherwise its index which 0 or greater. + */ + @SuppressWarnings("unused") + public int alleleIndex(final A allele); + + /** + * Queries the index of a read in the matrix. + * + * @param read the target read. + * + * @throws IllegalArgumentException if {@code read} is {@code null}. + * + * @return -1 if there is not such a read in the matrix, otherwise its index + * which is 0 or greater. + */ + @SuppressWarnings("unused") + public int readIndex(final GATKSAMRecord read); + + /** + * Number of allele in the matrix. + * @return never negative. + */ + public int alleleCount(); + + /** + * Number of reads in the matrix. + * @return never negative. + */ + public int readCount(); + + /** + * Returns the allele given its index. + * + * @param alleleIndex the target allele index. + * + * @throws IllegalArgumentException if {@code alleleIndex} is not a valid allele index. + * @return never {@code null}. + */ + public A alleleAt(final int alleleIndex); + + /** + * Returns the allele given its index. + * + * @param readIndex the target allele index. + * + * @throws IllegalArgumentException if {@code readIndex} is not a valid read index. + * @return never {@code null}. + */ + public GATKSAMRecord readAt(final int readIndex); + + + /** + * Copies the likelihood of all the reads for a given allele into an array from a particular offset. + * @param alleleIndex the targeted allele + * @param dest the destination array. + * @param offset the copy offset within the destination allele + */ + public void copyAlleleLikelihoods(final int alleleIndex, final double[] dest, final int offset); + } + + /** + * Perform marginalization from an allele set to another (smaller one) taking the maximum value + * for each read in the original allele subset. 
+ * + * @param newToOldAlleleMap map where the keys are the new alleles and the value list the original + * alleles that correspond to the new one. + * @return never {@code null}. The result will have the requested set of new alleles (keys in {@code newToOldAlleleMap}, and + * the same set of samples and reads as the original. + * + * @throws IllegalArgumentException is {@code newToOldAlleleMap} is {@code null} or contains {@code null} values, + * or its values contain reference to non-existing alleles in this read-likelihood collection. Also no new allele + * can have zero old alleles mapping nor two new alleles can make reference to the same old allele. + */ + public ReadLikelihoods marginalize(final Map> newToOldAlleleMap) { + + if (newToOldAlleleMap == null) + throw new IllegalArgumentException("the input allele mapping cannot be null"); + + @SuppressWarnings("unchecked") + final B[] newAlleles = newToOldAlleleMap.keySet().toArray((B[]) new Allele[newToOldAlleleMap.size()]); + final int oldAlleleCount = alleles.alleleCount(); + final int newAlleleCount = newAlleles.length; + + // we get the index correspondence between new old -> new allele, -1 entries mean that the old + // allele does not map to any new; supported but typically not the case. + final int[] oldToNewAlleleIndexMap = oldToNewAlleleIndexMap(newToOldAlleleMap, newAlleles, oldAlleleCount, newAlleleCount); + + // We calculate the marginal likelihoods. 
+ + final double[][][] newLikelihoodValues = marginalLikelihoods(oldAlleleCount, newAlleleCount, oldToNewAlleleIndexMap, null); + + final int sampleCount = samples.sampleCount(); + + @SuppressWarnings("unchecked") + final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; + final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; + + for (int s = 0; s < sampleCount; s++) { + newReadsBySampleIndex[s] = readsBySampleIndex[s].clone(); + } + + // Finally we create the new read-likelihood + return new ReadLikelihoods<>(new IndexedAlleleList(newAlleles), samples, + newReadsBySampleIndex, + newReadIndexBySampleIndex, newLikelihoodValues); + } + + + /** + * Perform marginalization from an allele set to another (smaller one) taking the maximum value + * for each read in the original allele subset. + * + * @param newToOldAlleleMap map where the keys are the new alleles and the value list the original + * alleles that correspond to the new one. + * @return never {@code null}. The result will have the requested set of new alleles (keys in {@code newToOldAlleleMap}, and + * the same set of samples and reads as the original. + * + * @param overlap if not {@code null}, only reads that overlap the location (with unclipping) will be present in + * the output read-collection. + * + * @throws IllegalArgumentException is {@code newToOldAlleleMap} is {@code null} or contains {@code null} values, + * or its values contain reference to non-existing alleles in this read-likelihood collection. Also no new allele + * can have zero old alleles mapping nor two new alleles can make reference to the same old allele. 
+ */ + public ReadLikelihoods marginalize(final Map> newToOldAlleleMap, final GenomeLoc overlap) { + + if (overlap == null) + return marginalize(newToOldAlleleMap); + + if (newToOldAlleleMap == null) + throw new IllegalArgumentException("the input allele mapping cannot be null"); + + @SuppressWarnings("unchecked") + final B[] newAlleles = newToOldAlleleMap.keySet().toArray((B[]) new Allele[newToOldAlleleMap.size()]); + final int oldAlleleCount = alleles.alleleCount(); + final int newAlleleCount = newAlleles.length; + + // we get the index correspondence between new old -> new allele, -1 entries mean that the old + // allele does not map to any new; supported but typically not the case. + final int[] oldToNewAlleleIndexMap = oldToNewAlleleIndexMap(newToOldAlleleMap, newAlleles, oldAlleleCount, newAlleleCount); + + final int[][] readsToKeep = overlappingReadIndicesBySampleIndex(overlap); + // We calculate the marginal likelihoods. + + final double[][][] newLikelihoodValues = marginalLikelihoods(oldAlleleCount, newAlleleCount, oldToNewAlleleIndexMap, readsToKeep); + + final int sampleCount = samples.sampleCount(); + + @SuppressWarnings("unchecked") + final Object2IntMap[] newReadIndexBySampleIndex = new Object2IntMap[sampleCount]; + final GATKSAMRecord[][] newReadsBySampleIndex = new GATKSAMRecord[sampleCount][]; + + for (int s = 0; s < sampleCount; s++) { + final int[] sampleReadsToKeep = readsToKeep[s]; + final GATKSAMRecord[] oldSampleReads = readsBySampleIndex[s]; + final int oldSampleReadCount = oldSampleReads.length; + final int newSampleReadCount = sampleReadsToKeep.length; + if (newSampleReadCount == oldSampleReadCount) { + newReadsBySampleIndex[s] = oldSampleReads.clone(); + } else { + newReadsBySampleIndex[s] = new GATKSAMRecord[newSampleReadCount]; + for (int i = 0; i < newSampleReadCount; i++) + newReadsBySampleIndex[s][i] = oldSampleReads[sampleReadsToKeep[i]]; + } + } + + // Finally we create the new read-likelihood + return new ReadLikelihoods<>(new 
IndexedAlleleList(newAlleles), samples, + newReadsBySampleIndex, + newReadIndexBySampleIndex, newLikelihoodValues); + } + + private int[][] overlappingReadIndicesBySampleIndex(final GenomeLoc overlap) { + if (overlap == null) + return null; + final int sampleCount = samples.sampleCount(); + final int[][] result = new int[sampleCount][]; + final IntArrayList buffer = new IntArrayList(200); + final int referenceIndex = overlap.getContigIndex(); + final int overlapStart = overlap.getStart(); + final int overlapEnd = overlap.getStop(); + for (int s = 0; s < sampleCount; s++) { + buffer.clear(); + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final int sampleReadCount = sampleReads.length; + buffer.ensureCapacity(sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) + if (unclippedReadOverlapsRegion(sampleReads[r], referenceIndex, overlapStart, overlapEnd)) + buffer.add(r); + result[s] = buffer.toIntArray(); + } + return result; + } + + public static boolean unclippedReadOverlapsRegion(final GATKSAMRecord read, final GenomeLoc region) { + return unclippedReadOverlapsRegion(read, region.getContigIndex(), region.getStart(), region.getStop()); + } + + private static boolean unclippedReadOverlapsRegion(final GATKSAMRecord sampleRead, final int referenceIndex, final int start, final int end) { + final int readReference = sampleRead.getReferenceIndex(); + if (readReference != referenceIndex) + return false; + + final int readStart = sampleRead.getUnclippedStart(); + if (readStart > end) + return false; + + final int readEnd = sampleRead.getReadUnmappedFlag() ? sampleRead.getUnclippedEnd() + : Math.max(sampleRead.getUnclippedEnd(), sampleRead.getUnclippedStart()); + return readEnd >= start; + } + + // Calculate the marginal likelihoods considering the old -> new allele index mapping. 
+ private double[][][] marginalLikelihoods(final int oldAlleleCount, final int newAlleleCount, final int[] oldToNewAlleleIndexMap, final int[][] readsToKeep) { + + final int sampleCount = samples.sampleCount(); + final double[][][] result = new double[sampleCount][][]; + + for (int s = 0; s < sampleCount; s++) { + final int sampleReadCount = readsBySampleIndex[s].length; + final double[][] oldSampleValues = valuesBySampleIndex[s]; + final int[] sampleReadToKeep = readsToKeep == null || readsToKeep[s].length == sampleReadCount ? null : readsToKeep[s]; + final int newSampleReadCount = sampleReadToKeep == null ? sampleReadCount : sampleReadToKeep.length; + final double[][] newSampleValues = result[s] = new double[newAlleleCount][newSampleReadCount]; + // We initiate all likelihoods to -Inf. + for (int a = 0; a < newAlleleCount; a++) + Arrays.fill(newSampleValues[a], Double.NEGATIVE_INFINITY); + // For each old allele and read we update the new table keeping the maximum likelihood. + for (int r = 0; r < newSampleReadCount; r++) { + for (int a = 0; a < oldAlleleCount; a++) { + final int oldReadIndex = newSampleReadCount == sampleReadCount ? r : sampleReadToKeep[r]; + final int newAlleleIndex = oldToNewAlleleIndexMap[a]; + if (newAlleleIndex == -1) + continue; + final double likelihood = oldSampleValues[a][oldReadIndex]; + if (likelihood > newSampleValues[newAlleleIndex][r]) + newSampleValues[newAlleleIndex][r] = likelihood; + } + } + } + return result; + } + + /** + * Given a collection of likelihood in the old map format, it creates the corresponding read-likelihoods collection. + * + * @param map the likelihoods to transform. + * + * @throws IllegalArgumentException if {@code map} is {@code null}. + * + * @return never {@code null}. + */ + public static ReadLikelihoods fromPerAlleleReadLikelihoodsMap(final Map map) { + + // First we need to create the read-likelihood collection with all required alleles, samples and reads. 
+ final SampleList sampleList = new IndexedSampleList(map.keySet()); + final Set alleles = new LinkedHashSet<>(10); + final Map> sampleToReads = new HashMap<>(sampleList.sampleCount()); + for (final Map.Entry entry : map.entrySet()) { + final String sample = entry.getKey(); + final PerReadAlleleLikelihoodMap sampleLikelihoods = entry.getValue(); + alleles.addAll(sampleLikelihoods.getAllelesSet()); + sampleToReads.put(sample,new ArrayList<>(sampleLikelihoods.getLikelihoodReadMap().keySet())); + } + + final AlleleList alleleList = new IndexedAlleleList<>(alleles); + final ReadLikelihoods result = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); + + // Now set the likelihoods. + for (final Map.Entry sampleEntry : map.entrySet()) { + final ReadLikelihoods.Matrix sampleMatrix = result.sampleMatrix(result.sampleIndex(sampleEntry.getKey())); + for (final Map.Entry> readEntry : sampleEntry.getValue().getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = readEntry.getKey(); + final int readIndex = sampleMatrix.readIndex(read); + for (final Map.Entry alleleEntry : readEntry.getValue().entrySet()) { + final int alleleIndex = result.alleleIndex(alleleEntry.getKey()); + sampleMatrix.set(alleleIndex,readIndex,alleleEntry.getValue()); + } + } + } + return result; + } + + // calculates an old to new allele index map array. + private int[] oldToNewAlleleIndexMap(final Map> newToOldAlleleMap, final B[] newAlleles, + final int oldAlleleCount, final int newAlleleCount) { + + final int[] oldToNewAlleleIndexMap = new int[oldAlleleCount]; + Arrays.fill(oldToNewAlleleIndexMap, -1); // -1 indicate that there is no new allele that make reference to that old one. 
+ + for (int i = 0; i < newAlleleCount; i++) { + final B newAllele = newAlleles[i]; + if (newAllele == null) + throw new IllegalArgumentException("input alleles cannot be null"); + final List oldAlleles = newToOldAlleleMap.get(newAllele); + if (oldAlleles == null) + throw new IllegalArgumentException("no new allele list can be null"); + for (final A oldAllele : oldAlleles) { + if (oldAllele == null) + throw new IllegalArgumentException("old alleles cannot be null"); + final int oldAlleleIndex = alleleIndex(oldAllele); + if (oldAlleleIndex == -1) + throw new IllegalArgumentException("missing old allele " + oldAllele + " in likelihood collection "); + if (oldToNewAlleleIndexMap[oldAlleleIndex] != -1) + throw new IllegalArgumentException("collision: two new alleles make reference to the same old allele"); + oldToNewAlleleIndexMap[oldAlleleIndex] = i; + } + } + return oldToNewAlleleIndexMap; + } + + /** + * Remove those reads that do not overlap certain genomic location. + * + *

+ * This method modifies the current read-likelihoods collection. + *

+ * + * @param location the target location. + * + * @throws IllegalArgumentException the location cannot be {@code null} nor unmapped. + */ + @SuppressWarnings("unused") + public void filterToOnlyOverlappingUnclippedReads(final GenomeLoc location) { + if (location == null) + throw new IllegalArgumentException("the location cannot be null"); + if (location.isUnmapped()) + throw new IllegalArgumentException("the location cannot be unmapped"); + + final int sampleCount = samples.sampleCount(); + + final int locContig = location.getContigIndex(); + final int locStart = location.getStart(); + final int locEnd = location.getStop(); + + final int alleleCount = alleles.alleleCount(); + final IntArrayList removeIndices = new IntArrayList(10); + for (int s = 0; s < sampleCount; s++) { + int readRemoveCount = 0; + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final int sampleReadCount = sampleReads.length; + for (int r = 0; r < sampleReadCount; r++) + if (!unclippedReadOverlapsRegion(sampleReads[r], locContig, locStart, locEnd)) + removeIndices.add(r); + removeSampleReads(s,removeIndices,alleleCount); + removeIndices.clear(); + } + } + + // Compare the read coordinates to the location of interest. + private boolean readOverlapsLocation(final String contig, final int locStart, + final int locEnd, final GATKSAMRecord read) { + final boolean overlaps; + + if (read.getReadUnmappedFlag()) + overlaps = false; + else if (!read.getReferenceName().equals(contig)) + overlaps = false; + else { + int alnStart = read.getAlignmentStart(); + int alnStop = read.getAlignmentEnd(); + if (alnStart > alnStop) { // Paranoia? based on GLP.createGenomeLoc(Read) this can happen?. + final int end = alnStart; + alnStart = alnStop; + alnStop = end; + } + overlaps = !(alnStop < locStart || alnStart > locEnd); + } + return overlaps; + } + + /** + * Removes those read that the best possible likelihood given any allele is just too low. + * + *

+ * This is determined by a maximum error per read-base against the best likelihood possible. + *

+ * + * @param maximumErrorPerBase the minimum acceptable error rate per read base, must be + * a positive number. + * + * @throws IllegalStateException is not supported for read-likelihood that do not contain alleles. + * + * @throws IllegalArgumentException if {@code maximumErrorPerBase} is negative. + */ + public void filterPoorlyModeledReads(final double maximumErrorPerBase) { + if (alleles.alleleCount() == 0) + throw new IllegalStateException("unsupported for read-likelihood collections with no alleles"); + if (Double.isNaN(maximumErrorPerBase) || maximumErrorPerBase <= 0.0) + throw new IllegalArgumentException("the maximum error per base must be a positive number"); + final int sampleCount = samples.sampleCount(); + + final int alleleCount = alleles.alleleCount(); + final IntArrayList removeIndices = new IntArrayList(10); + for (int s = 0; s < sampleCount; s++) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final int sampleReadCount = sampleReads.length; + for (int r = 0; r < sampleReadCount; r++) { + final GATKSAMRecord read = sampleReads[r]; + if (readIsPoorlyModelled(s,r,read, maximumErrorPerBase)) + removeIndices.add(r); + } + removeSampleReads(s, removeIndices, alleleCount); + removeIndices.clear(); + } + } + + // Check whether the read is poorly modelled. + protected boolean readIsPoorlyModelled(final int sampleIndex, final int readIndex, final GATKSAMRecord read, final double maxErrorRatePerBase) { + final double maxErrorsForRead = Math.min(2.0, Math.ceil(read.getReadLength() * maxErrorRatePerBase)); + final double log10QualPerBase = -4.0; + final double log10MaxLikelihoodForTrueAllele = maxErrorsForRead * log10QualPerBase; + + final int alleleCount = alleles.alleleCount(); + final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; + for (int a = 0; a < alleleCount; a++) + if (sampleValues[a][readIndex] >= log10MaxLikelihoodForTrueAllele) + return false; + return true; + } + + + /** + * Add more reads to the collection. 
+ * + * @param readsBySample reads to add. + * @param initialLikelihood the likelihood for the new entries. + * + * @throws IllegalArgumentException if {@code readsBySample} is {@code null} or {@code readsBySample} contains + * {@code null} reads, or {@code readsBySample} contains read that are already present in the read-likelihood + * collection. + */ + public void addReads(final Map> readsBySample, final double initialLikelihood) { + + for (final Map.Entry> entry : readsBySample.entrySet()) { + + final String sample = entry.getKey(); + final List newSampleReads = entry.getValue(); + final int sampleIndex = samples.sampleIndex(sample); + + if (sampleIndex == -1) + throw new IllegalArgumentException("input sample " + sample + + " is not part of the read-likelihoods collection"); + + if (newSampleReads == null || newSampleReads.size() == 0) + continue; + + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + final int newSampleReadCount = sampleReadCount + newSampleReads.size(); + + appendReads(newSampleReads, sampleIndex, sampleReadCount, newSampleReadCount); + extendsLikelihoodArrays(initialLikelihood, sampleIndex, sampleReadCount, newSampleReadCount); + } + } + + // Extends the likelihood arrays-matrices. + private void extendsLikelihoodArrays(double initialLikelihood, int sampleIndex, int sampleReadCount, int newSampleReadCount) { + final double[][] sampleValues = valuesBySampleIndex[sampleIndex]; + final int alleleCount = alleles.alleleCount(); + for (int a = 0; a < alleleCount; a++) + sampleValues[a] = Arrays.copyOf(sampleValues[a], newSampleReadCount); + if (initialLikelihood != 0.0) // the default array new value. + for (int a = 0; a < alleleCount; a++) + Arrays.fill(sampleValues[a],sampleReadCount,newSampleReadCount,initialLikelihood); + } + + // Append the new read reference into the structure per-sample. 
+ private void appendReads(final List newSampleReads, final int sampleIndex, + final int sampleReadCount, final int newSampleReadCount) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex] = + Arrays.copyOf(readsBySampleIndex[sampleIndex], newSampleReadCount); + + int nextReadIndex = sampleReadCount; + final Object2IntMap sampleReadIndex = readIndexBySampleIndex[sampleIndex]; + for (final GATKSAMRecord newRead : newSampleReads) { + // if (sampleReadIndex.containsKey(newRead)) // might be worth handle this without exception (ignore the read?) but in practice should never be the case. + // throw new IllegalArgumentException("you cannot add reads that are already in read-likelihood collection"); + if (sampleReadIndex != null ) sampleReadIndex.put(newRead,nextReadIndex); + sampleReads[nextReadIndex++] = newRead; + } + } + + /** + * Adds the non-reference allele to the read-likelihood collection setting each read likelihood to the second + * best found (or best one if only one allele has likelihood). + * + *

Nothing will happen if the read-likelihoods collection already includes the non-ref allele

+ * + *

+ * Implementation note: even when strictly speaking we do not need to demand the calling code to pass + * the reference the non-ref allele, we still demand it in order to lead the + * the calling code to use the right generic type for this likelihoods + * collection {@link Allele}. + *

+ * + * @param nonRefAllele the non-ref allele. + * + * @throws IllegalArgumentException if {@code nonRefAllele} is anything but the designated <NON_REF> + * symbolic allele {@link GATKVariantContextUtils#NON_REF_SYMBOLIC_ALLELE}. + */ + public void addNonReferenceAllele(final A nonRefAllele) { + + if (nonRefAllele == null) + throw new IllegalArgumentException("non-ref allele cannot be null"); + if (!nonRefAllele.equals(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE)) + throw new IllegalArgumentException("the non-ref allele is not valid"); + // Already present? + if (alleles.alleleIndex(nonRefAllele) != -1) + return; + + final int oldAlleleCount = alleles.alleleCount(); + final int newAlleleCount = oldAlleleCount + 1; + @SuppressWarnings("unchecked") + final A[] newAlleles = (A[]) new Allele[newAlleleCount]; + for (int a = 0; a < oldAlleleCount; a++) + newAlleles[a] = alleles.alleleAt(a); + newAlleles[oldAlleleCount] = nonRefAllele; + alleles = new IndexedAlleleList<>(newAlleles); + alleleList = null; // remove the cached alleleList. + + final int sampleCount = samples.sampleCount(); + for (int s = 0; s < sampleCount; s++) + addNonReferenceAlleleLikelihoodsPerSample(oldAlleleCount, newAlleleCount, s); + } + + // Updates per-sample structures according to the addition of the NON_REF allele. 
+ private void addNonReferenceAlleleLikelihoodsPerSample(final int alleleCount, final int newAlleleCount, final int sampleIndex) { + final double[][] sampleValues = valuesBySampleIndex[sampleIndex] = Arrays.copyOf(valuesBySampleIndex[sampleIndex], newAlleleCount); + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + + final double[] nonRefAlleleLikelihoods = sampleValues[alleleCount] = new double [sampleReadCount]; + Arrays.fill(nonRefAlleleLikelihoods,Double.NEGATIVE_INFINITY); + for (int r = 0; r < sampleReadCount; r++) { + final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true); + final double secondBestLikelihood = Double.isInfinite(bestAllele.confidence) ? bestAllele.likelihood + : bestAllele.likelihood - bestAllele.confidence; + nonRefAlleleLikelihoods[r] = secondBestLikelihood; + } + } + + /** + * Downsamples reads based on contamination fractions making sure that all alleles are affected proportionally. + * + * @param perSampleDownsamplingFraction contamination sample map where the sample name are the keys and the + * fractions are the values. + * + * @throws IllegalArgumentException if {@code perSampleDownsamplingFraction} is {@code null}. + */ + public void contaminationDownsampling(final Map perSampleDownsamplingFraction) { + + final int sampleCount = samples.sampleCount(); + final IntArrayList readsToRemove = new IntArrayList(10); // blind estimate, can be improved? 
+ final int alleleCount = alleles.alleleCount(); + for (int s = 0; s < sampleCount; s++) { + final String sample = samples.sampleAt(s); + final Double fractionDouble = perSampleDownsamplingFraction.get(sample); + if (fractionDouble == null) + continue; + final double fraction = fractionDouble; + if (Double.isNaN(fraction) || fraction <= 0.0) + continue; + if (fraction >= 1.0) { + final int sampleReadCount = readsBySampleIndex[s].length; + readsToRemove.ensureCapacity(sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) + readsToRemove.add(r); + removeSampleReads(s,readsToRemove,alleleCount); + readsToRemove.clear(); + } + else { + final Map> readsByBestAllelesMap = readsByBestAlleleMap(s); + removeSampleReads(s,AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(readsByBestAllelesMap, fraction),alleleCount); + } + } + } + + /** + * Given a collection of likelihood in the old map format, it creates the corresponding read-likelihoods collection. + * + * @param alleleList the target list of alleles. + * @param map the likelihoods to transform. + * + * + * @throws IllegalArgumentException if {@code map} is {@code null}, or {@code map} does not contain likelihoods for all read vs allele combinations. + * + * @return never {@code null}. + */ + public static ReadLikelihoods fromPerAlleleReadLikelihoodsMap(final AlleleList alleleList, final Map map) { + + //TODO add test code for this method. + // First we need to create the read-likelihood collection with all required alleles, samples and reads. 
+ final SampleList sampleList = new IndexedSampleList(map.keySet()); + final int alleleCount = alleleList.alleleCount(); + final Map> sampleToReads = new HashMap<>(sampleList.sampleCount()); + for (final Map.Entry entry : map.entrySet()) { + final String sample = entry.getKey(); + final PerReadAlleleLikelihoodMap sampleLikelihoods = entry.getValue(); + sampleToReads.put(sample,new ArrayList<>(sampleLikelihoods.getLikelihoodReadMap().keySet())); + } + + final ReadLikelihoods result = new ReadLikelihoods<>(sampleList,alleleList,sampleToReads); + + // Now set the likelihoods. + for (final Map.Entry sampleEntry : map.entrySet()) { + final ReadLikelihoods.Matrix sampleMatrix = result.sampleMatrix(result.sampleIndex(sampleEntry.getKey())); + for (final Map.Entry> readEntry : sampleEntry.getValue().getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = readEntry.getKey(); + final int readIndex = sampleMatrix.readIndex(read); + final Map alleleToLikelihoodMap = readEntry.getValue(); + for (int a = 0; a < alleleCount; a++) { + final Allele allele = alleleList.alleleAt(a); + final Double likelihood = alleleToLikelihoodMap.get(allele); + if (likelihood == null) + throw new IllegalArgumentException("there is no likelihood for allele " + allele + " and read " + read); + sampleMatrix.set(a,readIndex,likelihood); + } + } + } + return result; + } + + /** + * Returns the collection of best allele estimates for the reads based on the read-likelihoods. + * + * @throws IllegalStateException if there is no alleles. + * + * @return never {@code null}, one element per read in the read-likelihoods collection. + */ + public Collection bestAlleles() { + final List result = new ArrayList<>(100); // blind estimate. 
+ final int sampleCount = samples.sampleCount(); + for (int s = 0; s < sampleCount; s++) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[s]; + final int readCount = sampleReads.length; + for (int r = 0; r < readCount; r++) + result.add(searchBestAllele(s,r,true)); + } + return result; + } + + /** + * Returns reads stratified by their best allele. + * @param sampleIndex the target sample. + * @return never {@code null}, perhaps empty. + */ + public Map> readsByBestAlleleMap(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final int alleleCount = alleles.alleleCount(); + final int sampleReadCount = readsBySampleIndex[sampleIndex].length; + final Map> result = new HashMap<>(alleleCount); + for (int a = 0; a < alleleCount; a++) + result.put(alleles.alleleAt(a),new ArrayList(sampleReadCount)); + readsByBestAlleleMap(sampleIndex,result); + return result; + } + + /** + * Returns reads stratified by their best allele. + * @return never {@code null}, perhaps empty. + */ + @SuppressWarnings("unused") + public Map> readsByBestAlleleMap() { + final int alleleCount = alleles.alleleCount(); + final Map> result = new HashMap<>(alleleCount); + final int totalReadCount = readCount(); + for (int a = 0; a < alleleCount; a++) + result.put(alleles.alleleAt(a),new ArrayList(totalReadCount)); + final int sampleCount = samples.sampleCount(); + for (int s = 0; s < sampleCount; s++) + readsByBestAlleleMap(s,result); + return result; + } + + private void readsByBestAlleleMap(final int sampleIndex, final Map> result) { + final GATKSAMRecord[] reads = readsBySampleIndex[sampleIndex]; + final int readCount = reads.length; + + for (int r = 0; r < readCount; r++) { + final BestAllele bestAllele = searchBestAllele(sampleIndex,r,true); + if (!bestAllele.isInformative()) + continue; + result.get(bestAllele.allele).add(bestAllele.read); + } + } + + /** + * Returns the index of a read within a sample read-likelihood sub collection. + * @param sampleIndex the sample index. 
+ * @param read the query read. + * @return -1 if there is no such read in that sample, 0 or greater otherwise. + */ + @SuppressWarnings("unused") + public int readIndex(final int sampleIndex, final GATKSAMRecord read) { + final Object2IntMap readIndex = readIndexBySampleIndex(sampleIndex); + if (readIndex.containsKey(read)) + return readIndexBySampleIndex(sampleIndex).getInt(read); + else + return -1; + } + + /** + * Returns the total number of reads in the read-likelihood collection. + * + * @return never {@code null} + */ + public int readCount() { + int sum = 0; + final int sampleCount = samples.sampleCount(); + for (int i = 0; i < sampleCount; i++) + sum += readsBySampleIndex[i].length; + return sum; + } + + /** + * Returns the number of reads that belong to a sample in the read-likelihood collection. + * @param sampleIndex the query sample index. + * + * @throws IllegalArgumentException if {@code sampleIndex} is not a valid sample index. + * @return 0 or greater. + */ + public int sampleReadCount(int sampleIndex) { + checkSampleIndex(sampleIndex); + return readsBySampleIndex[sampleIndex].length; + } + + /** + * Contains information about the best allele for a read search result. + */ + public class BestAllele { + public static final double INFORMATIVE_THRESHOLD = 0.2; + + /** + * Null if there is no possible match (no allele?). + */ + public final A allele; + + /** + * The containing sample. + */ + public final String sample; + + /** + * The query read. + */ + public final GATKSAMRecord read; + + /** + * If allele != null, the indicates the likelihood of the read. + */ + public final double likelihood; + + /** + * Confidence that the read actually was generated under that likelihood. + * This is equal to the difference between this and the second best allele match. 
+ */ + public final double confidence; + + private BestAllele(final int sampleIndex, final int readIndex, final int bestAlleleIndex, + final double likelihood, final double secondBestLikelihood) { + allele = bestAlleleIndex == -1 ? null : alleles.alleleAt(bestAlleleIndex); + this.likelihood = likelihood; + sample = samples.sampleAt(sampleIndex); + read = readsBySampleIndex[sampleIndex][readIndex]; + confidence = likelihood == secondBestLikelihood ? 0 : likelihood - secondBestLikelihood; + } + + public boolean isInformative() { + return confidence > INFORMATIVE_THRESHOLD; + } + } + + private void removeSampleReads(final int sampleIndex, final IntArrayList indexToRemove, final int alleleCount) { + final int removeCount = indexToRemove.size(); + if (removeCount == 0) + return; + + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; + final int sampleReadCount = sampleReads.length; + + final Object2IntMap indexByRead = readIndexBySampleIndex[sampleIndex]; + if (indexByRead != null) + for (int i = 0; i < removeCount; i++) + indexByRead.remove(sampleReads[indexToRemove.getInt(i)]); + final boolean[] removeIndex = new boolean[sampleReadCount]; + int firstDeleted = indexToRemove.get(0); + for (int i = 0; i < removeCount; i++) + removeIndex[indexToRemove.get(i)] = true; + + final int newSampleReadCount = sampleReadCount - removeCount; + + // Now we skim out the removed reads from the read array. + final GATKSAMRecord[] oldSampleReads = readsBySampleIndex[sampleIndex]; + final GATKSAMRecord[] newSampleReads = new GATKSAMRecord[newSampleReadCount]; + + System.arraycopy(oldSampleReads,0,newSampleReads,0,firstDeleted); + Utils.skimArray(oldSampleReads,firstDeleted, newSampleReads, firstDeleted, removeIndex, firstDeleted); + + // Then we skim out the likelihoods of the removed reads. 
+ final double[][] oldSampleValues = valuesBySampleIndex[sampleIndex]; + final double[][] newSampleValues = new double[alleleCount][newSampleReadCount]; + for (int a = 0; a < alleleCount; a++) { + System.arraycopy(oldSampleValues[a],0,newSampleValues[a],0,firstDeleted); + Utils.skimArray(oldSampleValues[a], firstDeleted, newSampleValues[a], firstDeleted, removeIndex, firstDeleted); + } + valuesBySampleIndex[sampleIndex] = newSampleValues; + readsBySampleIndex[sampleIndex] = newSampleReads; + readListBySampleIndex[sampleIndex] = null; // reset the unmodifiable list. + } + + + // Requires that the collection passed iterator can remove elements, and it can be modified. + private void removeSampleReads(final int sampleIndex, final Collection readsToRemove, final int alleleCount) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; + final int sampleReadCount = sampleReads.length; + + final Object2IntMap indexByRead = readIndexBySampleIndex(sampleIndex); + // Count how many we are going to remove, which ones (indexes) and remove entry from the read-index map. + final boolean[] removeIndex = new boolean[sampleReadCount]; + int removeCount = 0; // captures the number of deletions. + int firstDeleted = sampleReadCount; // captures the first position that was deleted. + + final Iterator readsToRemoveIterator = readsToRemove.iterator(); + while (readsToRemoveIterator.hasNext()) { + final GATKSAMRecord read = readsToRemoveIterator.next(); + if (indexByRead.containsKey(read)) { + final int index = indexByRead.getInt(read); + if (firstDeleted > index) + firstDeleted = index; + removeCount++; + removeIndex[index] = true; + readsToRemoveIterator.remove(); + indexByRead.remove(read); + } + } + + // Nothing to remove we just finish here. + if (removeCount == 0) + return; + + final int newSampleReadCount = sampleReadCount - removeCount; + + // Now we skim out the removed reads from the read array. 
+ final GATKSAMRecord[] oldSampleReads = readsBySampleIndex[sampleIndex]; + final GATKSAMRecord[] newSampleReads = new GATKSAMRecord[newSampleReadCount]; + + System.arraycopy(oldSampleReads,0,newSampleReads,0,firstDeleted); + Utils.skimArray(oldSampleReads,firstDeleted, newSampleReads, firstDeleted, removeIndex, firstDeleted); + + // Update the indices for the extant reads from the first deletion onwards. + for (int r = firstDeleted; r < newSampleReadCount; r++) { + indexByRead.put(newSampleReads[r], r); + } + + // Then we skim out the likelihoods of the removed reads. + final double[][] oldSampleValues = valuesBySampleIndex[sampleIndex]; + final double[][] newSampleValues = new double[alleleCount][newSampleReadCount]; + for (int a = 0; a < alleleCount; a++) { + System.arraycopy(oldSampleValues[a],0,newSampleValues[a],0,firstDeleted); + Utils.skimArray(oldSampleValues[a], firstDeleted, newSampleValues[a], firstDeleted, removeIndex, firstDeleted); + } + valuesBySampleIndex[sampleIndex] = newSampleValues; + readsBySampleIndex[sampleIndex] = newSampleReads; + readListBySampleIndex[sampleIndex] = null; // reset the unmodifiable list. + } + + private Object2IntMap readIndexBySampleIndex(final int sampleIndex) { + if (readIndexBySampleIndex[sampleIndex] == null) { + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; + final int sampleReadCount = sampleReads.length; + readIndexBySampleIndex[sampleIndex] = new Object2IntOpenHashMap<>(sampleReadCount); + for (int r = 0; r < sampleReadCount; r++) + readIndexBySampleIndex[sampleIndex].put(sampleReads[r],r); + } + return readIndexBySampleIndex[sampleIndex]; + } + + /** + * Transform into a multi-sample HashMap backed {@link PerReadAlleleLikelihoodMap} type. + * @return never {@code null}. + * + * @deprecated + * + * This method should eventually disappear once we have removed PerReadAlleleLikelihoodMap class completelly. 
+ */ + @Deprecated + @SuppressWarnings("all") + public Map toPerReadAlleleLikelihoodMap() { + final int sampleCount = samples.sampleCount(); + final Map result = new HashMap<>(sampleCount); + for (int s = 0; s < sampleCount; s++) + result.put(samples.sampleAt(s),toPerReadAlleleLikelihoodMap(s)); + return result; + } + + /** + * Transform into a single-sample HashMap backed {@link PerReadAlleleLikelihoodMap} type. + * + * @return never {@code null}. + */ + @Deprecated + public PerReadAlleleLikelihoodMap toPerReadAlleleLikelihoodMap(final int sampleIndex) { + checkSampleIndex(sampleIndex); + final PerReadAlleleLikelihoodMap result = new PerReadAlleleLikelihoodMap(); + final int alleleCount = alleles.alleleCount(); + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; + final int sampleReadCount = sampleReads.length; + for (int a = 0; a < alleleCount; a++) { + final A allele = alleles.alleleAt(a); + final double[] readLikelihoods = valuesBySampleIndex[sampleIndex][a]; + for (int r = 0; r < sampleReadCount; r++) + result.add(sampleReads[r], allele, readLikelihoods[r]); + } + return result; + } + + /** + * Implements a likelihood matrix per sample given its index. + */ + private class SampleMatrix implements Matrix
{ + + private final int sampleIndex; + + private SampleMatrix(final int sampleIndex) { + this.sampleIndex = sampleIndex; + } + + @Override + public List reads() { + return sampleReads(sampleIndex); + } + + @Override + public List alleles() { + return ReadLikelihoods.this.alleles(); + } + + @Override + public void set(final int alleleIndex, final int readIndex, final double value) { + valuesBySampleIndex[sampleIndex][alleleIndex][readIndex] = value; + } + + @Override + public double get(final int alleleIndex, final int readIndex) { + return valuesBySampleIndex[sampleIndex][alleleIndex][readIndex]; + } + + @Override + public int alleleIndex(final A allele) { + return ReadLikelihoods.this.alleleIndex(allele); + } + + @Override + public int readIndex(final GATKSAMRecord read) { + return ReadLikelihoods.this.readIndex(sampleIndex, read); + } + + @Override + public int alleleCount() { + return alleles.alleleCount(); + } + + @Override + public int readCount() { + return readsBySampleIndex[sampleIndex].length; + } + + @Override + public A alleleAt(int alleleIndex) { + return ReadLikelihoods.this.alleleAt(alleleIndex); + } + + @Override + public GATKSAMRecord readAt(final int readIndex) { + if (readIndex < 0) + throw new IllegalArgumentException("the read-index cannot be negative"); + final GATKSAMRecord[] sampleReads = readsBySampleIndex[sampleIndex]; + if (readIndex >= sampleReads.length) + throw new IllegalArgumentException("the read-index is beyond the read count of the sample"); + return sampleReads[readIndex]; + } + + @Override + public void copyAlleleLikelihoods(final int alleleIndex, final double[] dest, final int offset) { + System.arraycopy(valuesBySampleIndex[sampleIndex][alleleIndex],0,dest,offset,readCount()); + } + } + + /** + * Checks whether the provide sample index is valid. + *

+ * If not, it throws an exception. + *

+ * @param sampleIndex the target sample index. + * + * @throws IllegalArgumentException if {@code sampleIndex} is invalid, i.e. outside the range [0,{@link #sampleCount}). + */ + private void checkSampleIndex(final int sampleIndex) { + if (sampleIndex < 0 || sampleIndex >= samples.sampleCount()) + throw new IllegalArgumentException("invalid sample index: " + sampleIndex); + } +} \ No newline at end of file diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/SampleList.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/SampleList.java new file mode 100644 index 000000000..00d970824 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/SampleList.java @@ -0,0 +1,42 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.genotyper; + +/** + * A indexed set of samples. 
+ * + *

+ * Implementing classes must guarantee that the sample list will remain constant through the life of the object. + *

+ */ +public interface SampleList { + + public int sampleCount(); + + public int sampleIndex(final String sample); + + public String sampleAt(final int sampleIndex); +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/SampleListUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/SampleListUtils.java new file mode 100644 index 000000000..8abfafb11 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/genotyper/SampleListUtils.java @@ -0,0 +1,224 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.genotyper; + +import java.util.*; + +/** + * Some utility operations on sample lists. 
+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class SampleListUtils { + + private static final SampleList EMPTY_LIST = new SampleList() { + + @Override + public int sampleCount() { + return 0; + } + + @Override + public int sampleIndex(String sample) { + return -1; + } + + @Override + public String sampleAt(final int sampleIndex) { + throw new IllegalArgumentException("index is out of valid range"); + } + }; + + /** + * Empty list. + * + * @return never {@code null} + */ + public static SampleList emptyList() { + return EMPTY_LIST; + } + + /** + * Checks whether two sample lists are in fact the same. + * @param first one list to compare. + * @param second another list to compare. + * + * @throws IllegalArgumentException if if either list is {@code null}. + * + * @return {@code true} iff both list are equal. + */ + public static boolean equals(final SampleList first, final SampleList second) { + if (first == null || second == null) + throw new IllegalArgumentException("no null list allowed"); + final int sampleCount = first.sampleCount(); + if (sampleCount != second.sampleCount()) + return false; + + for (int i = 0; i < sampleCount; i++) { + final String firstSample = first.sampleAt(i); + if (firstSample == null) + throw new IllegalStateException("no null samples allowed in sample-lists: first list at " + i); + final String secondSample = second.sampleAt(i); + if (secondSample == null) + throw new IllegalArgumentException("no null samples allowed in sample-list: second list at " + i); + if (!firstSample.equals(secondSample)) + return false; + } + return true; + } + + /** + * Returns a {@link List} unmodifiable view of a sample-list + * @param list the sample-list to wrap. + * + * @throws IllegalArgumentException if {@code list} is {@code null}. + * + * @return never {@code null}. 
+ */ + public static List asList(final SampleList list) { + if (list == null) + throw new IllegalArgumentException("the list cannot be null"); + return new AsList(list); + } + + /** + * Returns a {@link Set} unmodifiable view of the sample-list + * + * @param list the sample-list to wrap. + * + * @throws IllegalArgumentException if {@code list} is {@code null} + */ + public static Set asSet(final SampleList list) { + if (list == null) + throw new IllegalArgumentException("the list cannot be null"); + return new AsSet(list); + } + + /** + * Creates a list with a single sample. + * + * @param sampleName the sample name. + * @return never {@code sampleName} + */ + public static SampleList singletonList(final String sampleName) { + if (sampleName == null) + throw new IllegalArgumentException("the sample name cannot be null"); + return new SampleList() { + + @Override + public int sampleCount() { + return 1; + } + + @Override + public int sampleIndex(final String sample) { + return sampleName.equals(sample) ? 0 : -1; + } + + @Override + public String sampleAt(int sampleIndex) { + if (sampleIndex == 0) + return sampleName; + throw new IllegalArgumentException("index is out of bounds"); + } + }; + } + + /** + * Simple list view of a sample-list. 
+ */ + private static class AsList extends AbstractList { + + private final SampleList list; + + private AsList(final SampleList list) { + this.list = list; + + } + + @Override + public String get(int index) { + return list.sampleAt(index); + } + + @Override + public int size() { + return list.sampleCount(); + } + } + + /** + * Simple set view of a sample-list + */ + private static class AsSet extends AbstractSet { + + private final SampleList list; + + private AsSet(final SampleList list) { + this.list = list; + + } + + @Override + public Iterator iterator() { + return new Iterator() { + private int index = 0; + + @Override + public boolean hasNext() { + return index < list.sampleCount(); + } + + @Override + public String next() { + if (index >= list.sampleCount()) + throw new NoSuchElementException("iterating beyond sample list end"); + return list.sampleAt(index++); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("unsupported operation exception"); + } + }; + } + + @Override + public int size() { + return list.sampleCount(); + } + + @Override + public boolean contains(final Object obj) { + if (obj == null) + return false; + else if (obj instanceof String) + return list.sampleIndex(((String)obj)) >= 0; + else + return false; + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java new file mode 100644 index 000000000..271102d64 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/EventMap.java @@ -0,0 +1,423 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, 
sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.haplotype; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.sam.AlignmentUtils; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; + +import java.util.*; + +/** + * Extract simple VariantContext events from a single haplotype + * + * User: depristo + * Date: 3/27/13 + * Time: 8:35 AM + */ +public class EventMap extends TreeMap { + private final static Logger logger = Logger.getLogger(EventMap.class); + protected final static int MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION = 3; + private static final int MAX_EVENTS_PER_HAPLOTYPE = 3; + private static final int MAX_INDELS_PER_HAPLOTYPE = 2; + public final static Allele 
SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); + + private final Haplotype haplotype; + private final byte[] ref; + private final GenomeLoc refLoc; + private final String sourceNameToAdd; + + public EventMap(final Haplotype haplotype, final byte[] ref, final GenomeLoc refLoc, final String sourceNameToAdd) { + super(); + this.haplotype = haplotype; + this.ref = ref; + this.refLoc = refLoc; + this.sourceNameToAdd = sourceNameToAdd; + + processCigarForInitialEvents(); + } + + /** + * For testing. Let's you set up a explicit configuration without having to process a haplotype and reference + * @param stateForTesting + */ + public EventMap(final Collection stateForTesting) { + haplotype = null; + ref = null; + refLoc = null; + sourceNameToAdd = null; + for ( final VariantContext vc : stateForTesting ) + addVC(vc); + } + + protected void processCigarForInitialEvents() { + final Cigar cigar = haplotype.getCigar(); + final byte[] alignment = haplotype.getBases(); + + int refPos = haplotype.getAlignmentStartHapwrtRef(); + if( refPos < 0 ) { + return; + } // Protection against SW failures + + final List proposedEvents = new ArrayList<>(); + + int alignmentPos = 0; + + for( int cigarIndex = 0; cigarIndex < cigar.numCigarElements(); cigarIndex++ ) { + final CigarElement ce = cigar.getCigarElement(cigarIndex); + final int elementLength = ce.getLength(); + switch( ce.getOperator() ) { + case I: + { + if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig + final List insertionAlleles = new ArrayList(); + final int insertionStart = refLoc.getStart() + refPos - 1; + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) ) { + insertionAlleles.add( Allele.create(refByte, true) ); + } + if( cigarIndex == 0 || cigarIndex == cigar.getCigarElements().size() - 1 ) { + // if the insertion isn't completely resolved in the haplotype, skip it + // note this used to emit SYMBOLIC_UNASSEMBLED_EVENT_ALLELE but 
that seems dangerous + } else { + byte[] insertionBases = new byte[]{}; + insertionBases = ArrayUtils.add(insertionBases, ref[refPos - 1]); // add the padding base + insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange(alignment, alignmentPos, alignmentPos + elementLength)); + if( BaseUtils.isAllRegularBases(insertionBases) ) { + insertionAlleles.add( Allele.create(insertionBases, false) ); + } + } + if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele + proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make()); + } + } + alignmentPos += elementLength; + break; + } + case S: + { + alignmentPos += elementLength; + break; + } + case D: + { + if( refPos > 0 ) { // protect against trying to create insertions/deletions at the beginning of a contig + final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base + final List deletionAlleles = new ArrayList(); + final int deletionStart = refLoc.getStart() + refPos - 1; + final byte refByte = ref[refPos-1]; + if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) { + deletionAlleles.add( Allele.create(deletionBases, true) ); + deletionAlleles.add( Allele.create(refByte, false) ); + proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make()); + } + } + refPos += elementLength; + break; + } + case M: + case EQ: + case X: + { + for( int iii = 0; iii < elementLength; iii++ ) { + final byte refByte = ref[refPos]; + final byte altByte = alignment[alignmentPos]; + if( refByte != altByte ) { // SNP! 
+ if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) { + final List snpAlleles = new ArrayList(); + snpAlleles.add( Allele.create( refByte, true ) ); + snpAlleles.add( Allele.create( altByte, false ) ); + proposedEvents.add(new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); + } + } + refPos++; + alignmentPos++; + } + break; + } + case N: + case H: + case P: + default: + throw new ReviewedGATKException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() ); + } + } + + for ( final VariantContext proposedEvent : proposedEvents ) + addVC(proposedEvent, true); + } + + /** + * Add VariantContext vc to this map, merging events with the same start sites if necessary + * @param vc the variant context to add + */ + public void addVC(final VariantContext vc) { + addVC(vc, true); + } + + /** + * Add VariantContext vc to this map + * @param vc the variant context to add + * @param merge should we attempt to merge it with an already existing element, or should we throw an error in that case? + */ + public void addVC(final VariantContext vc, final boolean merge) { + if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); + + if ( containsKey(vc.getStart()) ) { + if ( merge ) { + final VariantContext prev = get(vc.getStart()); + put(vc.getStart(), makeBlock(prev, vc)); + } else { + throw new IllegalStateException("Will not merge previously bound variant contexts as merge is false at " + vc); + } + } else + put(vc.getStart(), vc); + } + + /** + * Create a block substitution out of two variant contexts that start at the same position + * + * vc1 can be SNP, and vc2 can then be either a insertion or deletion. 
+ * If vc1 is an indel, then vc2 must be the opposite type (vc1 deletion => vc2 must be an insertion) + * + * @param vc1 the first variant context we want to merge + * @param vc2 the second + * @return a block substitution that represents the composite substitution implied by vc1 and vc2 + */ + protected VariantContext makeBlock(final VariantContext vc1, final VariantContext vc2) { + if ( vc1.getStart() != vc2.getStart() ) throw new IllegalArgumentException("vc1 and 2 must have the same start but got " + vc1 + " and " + vc2); + if ( ! vc1.isBiallelic() ) throw new IllegalArgumentException("vc1 must be biallelic"); + if ( ! vc1.isSNP() ) { + if ( ! ((vc1.isSimpleDeletion() && vc2.isSimpleInsertion()) || (vc1.isSimpleInsertion() && vc2.isSimpleDeletion()))) + throw new IllegalArgumentException("Can only merge single insertion with deletion (or vice versa) but got " + vc1 + " merging with " + vc2); + } else if ( vc2.isSNP() ) { + throw new IllegalArgumentException("vc1 is " + vc1 + " but vc2 is a SNP, which implies there's been some terrible bug in the cigar " + vc2); + } + + final Allele ref, alt; + final VariantContextBuilder b = new VariantContextBuilder(vc1); + if ( vc1.isSNP() ) { + // we have to repair the first base, so SNP case is special cased + if ( vc1.getReference().equals(vc2.getReference()) ) { + // we've got an insertion, so we just update the alt to have the prev alt + ref = vc1.getReference(); + alt = Allele.create(vc1.getAlternateAllele(0).getDisplayString() + vc2.getAlternateAllele(0).getDisplayString().substring(1), false); + } else { + // we're dealing with a deletion, so we patch the ref + ref = vc2.getReference(); + alt = vc1.getAlternateAllele(0); + b.stop(vc2.getEnd()); + } + } else { + final VariantContext insertion = vc1.isSimpleInsertion() ? vc1 : vc2; + final VariantContext deletion = vc1.isSimpleInsertion() ? 
vc2 : vc1; + ref = deletion.getReference(); + alt = insertion.getAlternateAllele(0); + b.stop(deletion.getEnd()); + } + + return b.alleles(Arrays.asList(ref, alt)).make(); + } + + // TODO -- warning this is an O(N^3) algorithm because I'm just lazy. If it's valuable we need to reengineer it + @Requires("getNumberOfEvents() > 0") + protected void replaceClumpedEventsWithBlockSubstitutions() { + if ( getNumberOfEvents() >= MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) { + int lastStart = -1; + for ( boolean foundOne = true; foundOne; ) { + foundOne = false; + for ( final VariantContext vc : getVariantContexts() ) { + if ( vc.getStart() > lastStart ) { + lastStart = vc.getStart(); + final List neighborhood = getNeighborhood(vc, 10); + if ( updateToBlockSubstitutionIfBetter(neighborhood) ) { + foundOne = true; + break; + } + } + } + } + } + } + + protected boolean updateToBlockSubstitutionIfBetter(final List neighbors) { + if (neighbors.size() < MIN_NUMBER_OF_EVENTS_TO_COMBINE_INTO_BLOCK_SUBSTITUTION) + return false; + // TODO -- need more tests to decide if this is really so good + + final VariantContext first = neighbors.get(0); + final int refStartOffset = first.getStart() - refLoc.getStart(); + final int refEndOffset = neighbors.get(neighbors.size() - 1).getEnd() - refLoc.getStart(); + + final byte[] refBases = Arrays.copyOfRange(ref, refStartOffset, refEndOffset + 1); + final byte[] hapBases = AlignmentUtils.getBasesCoveringRefInterval(refStartOffset, refEndOffset, haplotype.getBases(), haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar()); + + final VariantContextBuilder builder = new VariantContextBuilder(first); + builder.stop(first.getStart() + refBases.length - 1); + builder.alleles(Arrays.asList(Allele.create(refBases, true), Allele.create(hapBases))); + final VariantContext block = builder.make(); + + // remove all merged events + for ( final VariantContext merged : neighbors ) { + if ( remove(merged.getStart()) == null ) + throw new 
IllegalArgumentException("Expected to remove variant context from the event map but remove said there wasn't any element there: " + merged); + } + + // note must be after we remove the previous events as the treeset only allows one key per start + logger.info("Transforming into block substitution at " + block); + addVC(block, false); + + return true; + } + + /** + * Get all of the variant contexts starting at leftMost that are within maxBP of each other + * + * @param leftMost the left most (smallest position) variant context that will start the neighborhood + * @param maxBPBetweenEvents the maximum distance in BP between the end of one event the start of the next + * to be included the the resulting list + * @return a list that contains at least one element (leftMost) + */ + @Requires({"leftMost != null", "maxBPBetweenEvents >= 0"}) + @Ensures({"result != null", "! result.isEmpty()"}) + protected List getNeighborhood(final VariantContext leftMost, final int maxBPBetweenEvents) { + final List neighbors = new LinkedList(); + + VariantContext left = leftMost; + for ( final VariantContext vc : getVariantContexts() ) { + if ( vc.getStart() < leftMost.getStart() ) + continue; + + if ( vc.getStart() - left.getEnd() < maxBPBetweenEvents ) { + // this vc is within max distance to the end of the left event, so accumulate it + neighbors.add(vc); + left = vc; + } + } + + return neighbors; + } + + /** + * Get the starting positions of events in this event map + * @return + */ + public Set getStartPositions() { + return keySet(); + } + + /** + * Get the variant contexts in order of start position in this event map + * @return + */ + public Collection getVariantContexts() { + return values(); + } + + /** + * How many events do we have? 
+ * @return + */ + public int getNumberOfEvents() { + return size(); + } + + @Override + public String toString() { + final StringBuilder b = new StringBuilder("EventMap{"); + for ( final VariantContext vc : getVariantContexts() ) + b.append(String.format("%s:%d-%d %s,", vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles())); + b.append("}"); + return b.toString(); + } + + /** + * Build event maps for each haplotype, returning the sorted set of all of the starting positions of all + * events across all haplotypes + * + * @param haplotypes a list of haplotypes + * @param ref the reference bases + * @param refLoc the span of the reference bases + * @param debug if true, we'll emit debugging information during this operation + * @return a sorted set of start positions of all events among all haplotypes + */ + public static TreeSet buildEventMapsForHaplotypes( final List haplotypes, + final byte[] ref, + final GenomeLoc refLoc, + final boolean debug) { + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet startPosKeySet = new TreeSet(); + int hapNumber = 0; + + if( debug ) logger.info("=== Best Haplotypes ==="); + for( final Haplotype h : haplotypes ) { + // Walk along the alignment and turn any difference from the reference into an event + h.setEventMap( new EventMap( h, ref, refLoc, "HC" + hapNumber++ ) ); + startPosKeySet.addAll(h.getEventMap().getStartPositions()); + + if( debug ) { + logger.info(h.toString()); + logger.info("> Cigar = " + h.getCigar()); + logger.info(">> Events = " + h.getEventMap()); + } + } + + return startPosKeySet; + } + + private static class VariantContextComparator implements Comparator { + @Override + public int compare(VariantContext vc1, VariantContext vc2) { + return vc1.getStart() - vc2.getStart(); + } + } + + /** + * Get all of the VariantContexts in the event maps for all haplotypes, sorted by their start position + * @param haplotypes the set of haplotypes to 
grab the VCs from + * @return a sorted set of variant contexts + */ + public static TreeSet getAllVariantContexts( final List haplotypes ) { + // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file + final TreeSet vcs = new TreeSet(new VariantContextComparator()); + + for( final Haplotype h : haplotypes ) { + vcs.addAll(h.getEventMap().getVariantContexts()); + } + + return vcs; + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java new file mode 100644 index 000000000..153a9a4b4 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/Haplotype.java @@ -0,0 +1,343 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.haplotype; + +import com.google.java.contract.Requires; +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import org.apache.commons.lang.ArrayUtils; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.sam.AlignmentUtils; +import org.broadinstitute.gatk.utils.sam.ReadUtils; +import htsjdk.variant.variantcontext.Allele; + +import java.util.Arrays; +import java.util.Comparator; +import java.util.LinkedHashMap; +import java.util.List; + +public class Haplotype extends Allele { + + + private GenomeLoc genomeLocation = null; + private EventMap eventMap = null; + private Cigar cigar; + private int alignmentStartHapwrtRef; + private double score = Double.NaN; + + /** + * Main constructor + * + * @param bases a non-null array of bases + * @param isRef is this the reference haplotype? + */ + public Haplotype( final byte[] bases, final boolean isRef ) { + super(bases.clone(), isRef); + } + + /** + * Create a new non-ref haplotype + * + * @param bases a non-null array of bases + */ + public Haplotype( final byte[] bases ) { + this(bases, false); + } + + /** + * Create a new haplotype with bases + * + * Requires bases.length == cigar.getReadLength() + * + * @param bases a non-null array of bases + * @param isRef is this the reference haplotype? + * @param alignmentStartHapwrtRef offset of this haplotype w.r.t. the reference + * @param cigar the cigar that maps this haplotype to the reference sequence + */ + public Haplotype( final byte[] bases, final boolean isRef, final int alignmentStartHapwrtRef, final Cigar cigar) { + this(bases, isRef); + this.alignmentStartHapwrtRef = alignmentStartHapwrtRef; + setCigar(cigar); + } + + /** + * Copy constructor. Note the ref state of the provided allele is ignored! 
+ * + * @param allele allele to copy + */ + public Haplotype( final Allele allele ) { + super(allele, true); + } + + public Haplotype( final byte[] bases, final GenomeLoc loc ) { + this(bases, false); + this.genomeLocation = loc; + } + + /** + * Create a new Haplotype derived from this one that exactly spans the provided location + * + * Note that this haplotype must have a contain a genome loc for this operation to be successful. If no + * GenomeLoc is contained than @throws an IllegalStateException + * + * Also loc must be fully contained within this Haplotype's genomeLoc. If not an IllegalArgumentException is + * thrown. + * + * @param loc a location completely contained within this Haplotype's location + * @return a new Haplotype within only the bases spanning the provided location, or null for some reason the haplotype would be malformed if + */ + public Haplotype trim(final GenomeLoc loc) { + if ( loc == null ) throw new IllegalArgumentException("Loc cannot be null"); + if ( genomeLocation == null ) throw new IllegalStateException("Cannot trim a Haplotype without containing GenomeLoc"); + if ( ! genomeLocation.containsP(loc) ) throw new IllegalArgumentException("Can only trim a Haplotype to a containing span. 
My loc is " + genomeLocation + " but wanted trim to " + loc); + if ( getCigar() == null ) throw new IllegalArgumentException("Cannot trim haplotype without a cigar " + this); + + final int newStart = loc.getStart() - this.genomeLocation.getStart(); + final int newStop = newStart + loc.size() - 1; + final byte[] newBases = AlignmentUtils.getBasesCoveringRefInterval(newStart, newStop, getBases(), 0, getCigar()); + final Cigar newCigar = AlignmentUtils.trimCigarByReference(getCigar(), newStart, newStop); + + if ( newBases == null || AlignmentUtils.startsOrEndsWithInsertionOrDeletion(newCigar) ) + // we cannot meaningfully chop down the haplotype, so return null + return null; + + final Haplotype ret = new Haplotype(newBases, isReference()); + ret.setCigar(newCigar); + ret.setGenomeLocation(loc); + ret.setAlignmentStartHapwrtRef(newStart + getAlignmentStartHapwrtRef()); + return ret; + } + + @Override + public boolean equals( Object h ) { + return h instanceof Haplotype && Arrays.equals(getBases(), ((Haplotype) h).getBases()); + } + + @Override + public int hashCode() { + return Arrays.hashCode(getBases()); + } + + public EventMap getEventMap() { + return eventMap; + } + + public void setEventMap( final EventMap eventMap ) { + this.eventMap = eventMap; + } + + @Override + public String toString() { + return getDisplayString(); + } + + /** + * Get the span of this haplotype (may be null) + * @return a potentially null genome loc + */ + public GenomeLoc getGenomeLocation() { + return genomeLocation; + } + + public void setGenomeLocation(GenomeLoc genomeLocation) { + this.genomeLocation = genomeLocation; + } + + public long getStartPosition() { + return genomeLocation.getStart(); + } + + public long getStopPosition() { + return genomeLocation.getStop(); + } + + public int getAlignmentStartHapwrtRef() { + return alignmentStartHapwrtRef; + } + + public void setAlignmentStartHapwrtRef( final int alignmentStartHapwrtRef ) { + this.alignmentStartHapwrtRef = 
alignmentStartHapwrtRef; + } + + /** + * Get the cigar for this haplotype. Note that the cigar is guaranteed to be consolidated + * in that multiple adjacent equal operates will have been merged + * @return the cigar of this haplotype + */ + public Cigar getCigar() { + return cigar; + } + + /** + * Get the haplotype cigar extended by padSize M at the tail, consolidated into a clean cigar + * + * @param padSize how many additional Ms should be appended to the end of this cigar. Must be >= 0 + * @return a newly allocated Cigar that consolidate(getCigar + padSize + M) + */ + public Cigar getConsolidatedPaddedCigar(final int padSize) { + if ( padSize < 0 ) throw new IllegalArgumentException("padSize must be >= 0 but got " + padSize); + final Cigar extendedHaplotypeCigar = new Cigar(getCigar().getCigarElements()); + if ( padSize > 0 ) extendedHaplotypeCigar.add(new CigarElement(padSize, CigarOperator.M)); + return AlignmentUtils.consolidateCigar(extendedHaplotypeCigar); + } + + /** + * Set the cigar of this haplotype to cigar. 
+ * + * Note that this function consolidates the cigar, so that 1M1M1I1M1M => 2M1I2M + * + * @param cigar a cigar whose readLength == length() + */ + public void setCigar( final Cigar cigar ) { + this.cigar = AlignmentUtils.consolidateCigar(cigar); + if ( this.cigar.getReadLength() != length() ) + throw new IllegalArgumentException("Read length " + length() + " not equal to the read length of the cigar " + cigar.getReadLength() + " " + this.cigar); + } + + @Requires({"refInsertLocation >= 0"}) + public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { + // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates + final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); + final byte[] myBases = this.getBases(); + if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= myBases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype + return null; + } + + byte[] newHaplotypeBases = new byte[]{}; + newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, 0, haplotypeInsertLocation)); // bases before the variant + newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant + newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(myBases, haplotypeInsertLocation + refAllele.length(), myBases.length)); // bases after the variant + return new Haplotype(newHaplotypeBases); + } + + public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, + final int startPos, + final ReferenceContext ref, + final int haplotypeSize, + final int numPrefBases) { + + LinkedHashMap haplotypeMap = new LinkedHashMap(); + + Allele refAllele = null; + + for (Allele a:alleleList) { + 
if (a.isReference()) { + refAllele = a; + break; + } + } + + if (refAllele == null) + throw new ReviewedGATKException("BUG: no ref alleles in input to makeHaplotypeListfrom Alleles at loc: "+ startPos); + + final byte[] refBases = ref.getBases(); + + final int startIdxInReference = 1 + startPos - numPrefBases - ref.getWindow().getStart(); + final String basesBeforeVariant = new String(Arrays.copyOfRange(refBases, startIdxInReference, startIdxInReference + numPrefBases)); + + // protect against long events that overrun available reference context + final int startAfter = Math.min(startIdxInReference + numPrefBases + refAllele.getBases().length - 1, refBases.length); + final String basesAfterVariant = new String(Arrays.copyOfRange(refBases, startAfter, refBases.length)); + + // Create location for all haplotypes + final int startLoc = ref.getWindow().getStart() + startIdxInReference; + final int stopLoc = startLoc + haplotypeSize-1; + + final GenomeLoc locus = ref.getGenomeLocParser().createGenomeLoc(ref.getLocus().getContig(),startLoc,stopLoc); + + for (final Allele a : alleleList) { + + final byte[] alleleBases = a.getBases(); + // use string concatenation + String haplotypeString = basesBeforeVariant + new String(Arrays.copyOfRange(alleleBases, 1, alleleBases.length)) + basesAfterVariant; + haplotypeString = haplotypeString.substring(0,haplotypeSize); + + haplotypeMap.put(a,new Haplotype(haplotypeString.getBytes(), locus)); + } + + return haplotypeMap; + } + + private static class Event { + public Allele ref; + public Allele alt; + public int pos; + + public Event( final Allele ref, final Allele alt, final int pos ) { + this.ref = ref; + this.alt = alt; + this.pos = pos; + } + } + + /** + * Get the score (an estimate of the support) of this haplotype + * @return a double, where higher values are better + */ + public double getScore() { + return score; + } + + /** + * Set the score (an estimate of the support) of this haplotype. 
+ * + * Note that if this is the reference haplotype it is always given Double.MAX_VALUE score + * + * @param score a double, where higher values are better + */ + public void setScore(double score) { + this.score = score; + } + + /** + * Comparator used to sort haplotypes, alphanumerically. + * + *

+ * If one haplotype is the prefix of the other, the shorter one comes first. + *

+ */ + public static final Comparator ALPHANUMERICAL_COMPARATOR = new Comparator() { + + @Override + public int compare(final Haplotype o1, final Haplotype o2) { + if (o1 == o2) + return 0; + final byte[] bases1 = o1.getBases(); + final byte[] bases2 = o2.getBases(); + final int iLimit = Math.min(bases1.length, bases2.length); + for (int i = 0; i < iLimit; i++) { + final int cmp = Byte.compare(bases1[i], bases2[i]); + if (cmp != 0) return cmp; + } + if (bases1.length == bases2.length) return 0; + return (bases1.length > bases2.length) ? -1 : 1; // is a bit better to get the longest haplotypes first. + } + }; + +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeBaseComparator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeBaseComparator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeBaseComparator.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeBaseComparator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeScoreComparator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeScoreComparator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeScoreComparator.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeScoreComparator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeSizeAndBaseComparator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeSizeAndBaseComparator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeSizeAndBaseComparator.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeSizeAndBaseComparator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/ApplicationDetails.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/ApplicationDetails.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/ApplicationDetails.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/ApplicationDetails.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/DocletUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/DocletUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/DocletUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/DocletUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeature.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeature.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeature.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeature.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeatureHandler.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeatureHandler.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeatureHandler.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeatureHandler.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeatureObject.java 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeatureObject.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeatureObject.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/DocumentedGATKFeatureObject.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/ForumAPIUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/ForumAPIUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/ForumAPIUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/ForumAPIUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/ForumDiscussion.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/ForumDiscussion.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/ForumDiscussion.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/ForumDiscussion.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GATKDocUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GATKDocUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GATKDocUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GATKDocUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GATKDocWorkUnit.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GATKDocWorkUnit.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GATKDocWorkUnit.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GATKDocWorkUnit.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GATKDoclet.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GATKDoclet.java new file mode 100644 index 000000000..af56eb9b2 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GATKDoclet.java @@ -0,0 +1,574 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.help; + +import com.google.gson.ExclusionStrategy; +import com.google.gson.FieldAttributes; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.annotations.Expose; +import com.google.gson.stream.JsonWriter; +import com.sun.javadoc.ClassDoc; +import com.sun.javadoc.RootDoc; +import freemarker.template.Configuration; +import freemarker.template.DefaultObjectWrapper; +import freemarker.template.Template; +import freemarker.template.TemplateException; +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import htsjdk.tribble.FeatureCodec; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.text.XReadLines; + +import java.io.*; +import java.util.*; + +/** + * Javadoc Doclet that combines javadoc, GATK ParsingEngine annotations, and FreeMarker + * templates to produce PHP formatted GATKDocs for classes. + *

+ * This document has the following workflow: + *

+ * 1 -- walk the javadoc hierarchy, looking for class that have the + * DocumentedGATKFeature annotation or are in the type hierarchy in the + * static list of things to document, and are to be documented + * 2 -- construct for each a GATKDocWorkUnit, resulting in the complete + * set of things to document + * 3 -- for each unit, actually generate a PHP page documenting it + * as well as links to related features via their units. Writing + * of a specific class PHP is accomplished by a generate DocumentationHandler + * 4 -- write out an index of all units, organized by group + * 5 -- emit JSON version of GATKDocs using Google GSON (currently incomplete but workable) + *

+ * The documented classes are restricted to only those with @DocumentedGATKFeature + * annotation or are in the STATIC_DOCS class. + */ +public abstract class GATKDoclet { + final protected static Logger logger = Logger.getLogger(GATKDoclet.class); + + /** + * Where we find the help FreeMarker templates + */ + final protected static File SETTINGS_DIR = new File("settings/helpTemplates"); + + /** + * Where we write the GATKDoc PHP directory + */ + final protected static File DESTINATION_DIR = new File("gatkdocs"); + + final private static String FORUM_KEY_PATH = "/local/gsa-engineering/gatkdocs_publisher/forum.key"; + // ---------------------------------------------------------------------- + // + // Global variables that are set on the command line by javadoc + // + // ---------------------------------------------------------------------- + protected static File settingsDir = SETTINGS_DIR; + protected static File destinationDir = DESTINATION_DIR; + protected static String forumKeyPath = FORUM_KEY_PATH; + protected static String buildTimestamp = null, absoluteVersion = null; + protected static boolean showHiddenFeatures = false; + + protected static boolean testOnly = false; + + /** + * The javadoc root doc + */ + RootDoc rootDoc; + + /** + * The set of all things we are going to document + */ + Set myWorkUnits; + + /** + * A static list of DocumentedGATKFeatureObjects. Any class that is as or extends + * one of the DocumentedGATKFeatureObjects.clazz of this collection will also + * be documented, even if it doesn't have the @DocumentedGATKFeature annotation. Useful + * when you want to document things that implement an interface (annotations on java + * interfaces aren't inherited) or whose base class isn't under your control (tribble + * codecs). 
+ */ + final static Collection STATIC_DOCS = new ArrayList(); + + static { + STATIC_DOCS.add(new DocumentedGATKFeatureObject(FeatureCodec.class, + HelpConstants.DOCS_CAT_RODCODECS, + "Tribble codecs for reading reference ordered data (ROD) files such as VCF or BED", + "NA")); + } + + /** + * Extracts the contents of certain types of javadoc and adds them to an XML file. + * + * @param rootDoc The documentation root. + * @return Whether the JavaDoc run succeeded. + * @throws java.io.IOException if output can't be written. + */ + protected boolean startProcessDocs(RootDoc rootDoc) throws IOException { + logger.setLevel(Level.INFO); + + // load arguments + for (String[] options : rootDoc.options()) { + if (options[0].equals("-settings-dir")) + settingsDir = new File(options[1]); + if (options[0].equals("-destination-dir")) + destinationDir = new File(options[1]); + if (options[0].equals("-forum-key-path")) + forumKeyPath = options[1]; + if (options[0].equals("-build-timestamp")) + buildTimestamp = options[1]; + if (options[0].equals("-absolute-version")) + absoluteVersion = options[1]; + if (options[0].equals("-include-hidden")) + showHiddenFeatures = true; + if (options[0].equals("-test")) + testOnly = true; + } + + if (!settingsDir.exists()) + throw new RuntimeException("-settings-dir " + settingsDir.getPath() + " does not exist"); + else if (!settingsDir.isDirectory()) + throw new RuntimeException("-settings-dir " + settingsDir.getPath() + " is not a directory"); + + // process the docs + processDocs(rootDoc); + + return true; + } + + /** + * Validate the given options against options supported by this doclet. + * + * @param option Option to validate. + * @return Number of potential parameters; 0 if not supported. 
+ */ + public static int optionLength(String option) { + if (option.equals("-settings-dir") || + option.equals("-destination-dir") || + option.equals("-forum-key-path") || + option.equals("-build-timestamp") || + option.equals("-absolute-version") || + option.equals("-include-hidden")) { + return 2; + } else if (option.equals("-test")) + return 1; + else + return 0; + } + + /** + * Are we supposed to include @Hidden annotations in our documented output? + * + * @return + */ + public boolean showHiddenFeatures() { + return showHiddenFeatures; + } + + /** + * Any class that's in this list will be included in the documentation + * when the -test argument is provided. Useful for debugging. + * Subclasses, such as WalkerDoclet, may add additional classes for debugging. + */ + protected List> getTestOnlyKeepers() { + return Collections.>singletonList(UserException.class); + } + + /** + * @param rootDoc + */ + private void processDocs(RootDoc rootDoc) { + // setup the global access to the root + this.rootDoc = rootDoc; + + try { + // print the Version number + FileUtils.writeByteArrayToFile(new File(destinationDir + "/current.version.txt"), getSimpleVersion(absoluteVersion).getBytes()); + + /* ------------------------------------------------------------------- */ + /* You should do this ONLY ONCE in the whole application life-cycle: */ + + Configuration cfg = new Configuration(); + // Specify the data source where the template files come from. + cfg.setDirectoryForTemplateLoading(settingsDir); + // Specify how templates will see the data-model. This is an advanced topic... 
+ cfg.setObjectWrapper(new DefaultObjectWrapper()); + + myWorkUnits = computeWorkUnits(); + + List> groups = new ArrayList>(); + Set seenDocumentationFeatures = new HashSet(); + List> data = new ArrayList>(); + for (GATKDocWorkUnit workUnit : myWorkUnits) { + data.add(workUnit.indexDataMap()); + if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { + groups.add(toMap(workUnit.annotation)); + seenDocumentationFeatures.add(workUnit.annotation.groupName()); + } + } + + for (GATKDocWorkUnit workUnit : myWorkUnits) { + processDocWorkUnit(cfg, workUnit, groups, data); + } + + processIndex(cfg, new ArrayList(myWorkUnits)); + + File forumKeyFile = new File(forumKeyPath); + if (forumKeyFile.exists()) { + String forumKey = null; + // Read in a one-line file so we can do a for loop + for (String line : new XReadLines(forumKeyFile)) + forumKey = line; + updateForum(myWorkUnits, forumKey); + } + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private void updateForum(Set docWorkUnits, String forumKey) { + //first get list of posts that need to be added + List old = ForumAPIUtils.getPostedTools(forumKey); + + for (String s : old) + System.out.println(s); + + System.out.printf("Forum has %d items%n", old.size()); + System.out.printf("Docs have %d items%n", docWorkUnits.size()); + + List toAdd = new ArrayList(); + for (GATKDocWorkUnit tool : docWorkUnits) { + if (!old.contains(tool.name)) { + System.out.println("WILL POST: " + tool.name + " TO FORUM"); + toAdd.add(tool); + } + } + + //update using list + for (GATKDocWorkUnit tool : toAdd) { + //if ( tool.name.equals("ApplyRecalibration") ) + ForumAPIUtils.postToForum(tool, forumKey); + } + } + + /** + * Returns the set of all GATKDocWorkUnits that we are going to generate docs for. 
+ * + * @return + */ + private Set computeWorkUnits() { + TreeSet m = new TreeSet(); + + for (ClassDoc doc : rootDoc.classes()) { + //logger.debug("Considering " + doc); + Class clazz = getClassForClassDoc(doc); + + // don't add anything that's not DocumentationTest if we are in test mode + if (clazz != null && testOnly && !getTestOnlyKeepers().contains(clazz)) + continue; + + DocumentedGATKFeatureObject feature = getFeatureForClassDoc(doc); + DocumentedGATKFeatureHandler handler = createHandler(doc, feature); + if (handler != null && handler.includeInDocs(doc)) { + //logger.info("Generating documentation for class " + doc); + String filename = handler.getDestinationFilename(doc, clazz); + GATKDocWorkUnit unit = new GATKDocWorkUnit(doc.name(), + filename, feature.groupName(), feature, handler, doc, clazz, + buildTimestamp, absoluteVersion); + m.add(unit); + } + } + + return m; + } + + /** + * Create a handler capable of documenting the class doc according to feature. Returns + * null if no appropriate handler is found or doc shouldn't be documented at all. + * + * @param doc + * @param feature + * @return + */ + private DocumentedGATKFeatureHandler createHandler(ClassDoc doc, DocumentedGATKFeatureObject feature) { + if (feature != null) { + if (feature.enable()) { + DocumentedGATKFeatureHandler handler = createDocumentedGATKFeatureHandler(); + handler.setDoclet(this); + return handler; + } else { + logger.info("Skipping disabled Documentation for " + doc); + } + } + + return null; + } + + protected abstract DocumentedGATKFeatureHandler createDocumentedGATKFeatureHandler(); + + /** + * Returns the instantiated DocumentedGATKFeatureObject that describes the GATKDoc + * structure we will apply to Doc. 
+ * + * @param doc + * @return null if this proves inappropriate or doc shouldn't be documented + */ + private DocumentedGATKFeatureObject getFeatureForClassDoc(ClassDoc doc) { + Class docClass = getClassForClassDoc(doc); + + if (docClass == null) + return null; // not annotated so it shouldn't be documented + + if (docClass.isAnnotationPresent(DocumentedGATKFeature.class)) { + DocumentedGATKFeature f = docClass.getAnnotation(DocumentedGATKFeature.class); + return new DocumentedGATKFeatureObject(docClass, f.enable(), f.groupName(), f.summary(), f.extraDocs(), f.gotoDev()); + } else { + for (DocumentedGATKFeatureObject staticDocs : STATIC_DOCS) { + if (staticDocs.getClassToDoc().isAssignableFrom(docClass)) { + return new DocumentedGATKFeatureObject(docClass, staticDocs.enable(), staticDocs.groupName(), staticDocs.summary(), staticDocs.extraDocs(), staticDocs.gotoDev()); + } + } + return null; + } + } + + /** + * Return the Java class described by the ClassDoc doc + * + * @param doc + * @return + */ + private Class getClassForClassDoc(ClassDoc doc) { + try { + // todo -- what do I need the ? extends Object to pass the compiler? + return (Class) DocletUtils.getClassForDoc(doc); + } catch (ClassNotFoundException e) { + //logger.warn("Couldn't find class for ClassDoc " + doc); + // we got a classdoc for a class we can't find. 
Maybe in a library or something + return null; + } catch (NoClassDefFoundError e) { + return null; + } catch (UnsatisfiedLinkError e) { + return null; // naughty BWA bindings + } + } + + /** + * Create the php index listing all of the GATKDocs features + * + * @param cfg + * @param indexData + * @throws IOException + */ + private void processIndex(Configuration cfg, List indexData) throws IOException { + /* Get or create a template */ + Template temp = cfg.getTemplate("generic.index.template.html"); + + /* Merge data-model with template */ + Writer out = new OutputStreamWriter(new FileOutputStream(new File(destinationDir + "/index.php"))); + try { + temp.process(groupIndexData(indexData), out); + out.flush(); + } catch (TemplateException e) { + throw new ReviewedGATKException("Failed to create GATK documentation", e); + } + } + + /** + * Helpful function to create the php index. Given all of the already run GATKDocWorkUnits, + * create the high-level grouping data listing individual features by group. + * + * @param indexData + * @return + */ + private Map groupIndexData(List indexData) { + // + // root -> data -> { summary -> y, filename -> z }, etc + // -> groups -> group1, group2, etc. 
+ Map root = new HashMap(); + + + Collections.sort(indexData); + + List> groups = new ArrayList>(); + Set seenDocumentationFeatures = new HashSet(); + List> data = new ArrayList>(); + for (GATKDocWorkUnit workUnit : indexData) { + data.add(workUnit.indexDataMap()); + if (!seenDocumentationFeatures.contains(workUnit.annotation.groupName())) { + groups.add(toMap(workUnit.annotation)); + seenDocumentationFeatures.add(workUnit.annotation.groupName()); + } + } + + //System.out.printf(groups.toString()); + + root.put("data", data); + root.put("groups", groups); + root.put("timestamp", buildTimestamp); + root.put("version", absoluteVersion); + + return root; + } + + /** + * Trivial helper routine that returns the map of name and summary given the annotation + * AND adds a super-category so that we can custom-order the categories in the index + * + * @param annotation + * @return + */ + private static final Map toMap(DocumentedGATKFeatureObject annotation) { + Map root = new HashMap(); + root.put("id", annotation.groupName().replaceAll("\\W", "")); + root.put("name", annotation.groupName()); + root.put("summary", annotation.summary()); + + /** + * Add-on super-category definitions. The assignments depend on parsing the names + * defined in HelpConstants.java so be careful of changing anything. + * Also, the super-category value strings need to be the same as used in the + * Freemarker template. This is all fairly clunky but the best I could do without + * making major changes to the DocumentedGATKFeatureObject. Doesn't help that + * Freemarker makes any scripting horribly awkward. 
+ */ + final String supercatValue; + if (annotation.groupName().endsWith(" Tools")) supercatValue = "tools"; + else if (annotation.groupName().endsWith(" Utilities")) supercatValue = "utilities"; + else if (annotation.groupName().startsWith("Engine ")) supercatValue = "engine"; + else if (annotation.groupName().endsWith(" (DevZone)")) supercatValue = "dev"; + else supercatValue = "other"; + + root.put("supercat", supercatValue); + + return root; + } + + /** + * Helper function that finding the GATKDocWorkUnit associated with class from among all of the work units + * + * @param c the class we are looking for + * @return the GATKDocWorkUnit whose .clazz.equals(c), or null if none could be found + */ + public final GATKDocWorkUnit findWorkUnitForClass(Class c) { + for (final GATKDocWorkUnit unit : this.myWorkUnits) + if (unit.clazz.equals(c)) + return unit; + return null; + } + + /** + * Return the ClassDoc associated with clazz + * + * @param clazz + * @return + */ + public ClassDoc getClassDocForClass(Class clazz) { + return rootDoc.classNamed(clazz.getName()); + } + + /** + * High-level function that processes a single DocWorkUnit unit using its handler + * + * @param cfg + * @param unit + * @param data + * @throws IOException + */ + private void processDocWorkUnit(Configuration cfg, GATKDocWorkUnit unit, List> groups, List> data) + throws IOException { + //System.out.printf("Processing documentation for class %s%n", unit.classDoc); + unit.handler.processOne(unit); + unit.forTemplate.put("groups", groups); + unit.forTemplate.put("data", data); + // Get or create a template + Template temp = cfg.getTemplate(unit.handler.getTemplateName(unit.classDoc)); + + // Merge data-model with template + File outputPath = new File(destinationDir + "/" + unit.filename); + try { + Writer out = new OutputStreamWriter(new FileOutputStream(outputPath)); + temp.process(unit.forTemplate, out); + out.flush(); + } catch (TemplateException e) { + throw new ReviewedGATKException("Failed 
to create GATK documentation", e); + } + + // Create GSON-friendly object from unit.forTemplate + GSONWorkUnit gsonworkunit = new GSONWorkUnit(); + gsonworkunit.populate( unit.forTemplate.get("summary").toString(), + unit.forTemplate.get("parallel"), + unit.forTemplate.get("activeregion"), + unit.forTemplate.get("partitiontype").toString(), + unit.forTemplate.get("walkertype").toString(), + unit.forTemplate.get("gson-arguments"), + unit.forTemplate.get("refwindow"), + unit.forTemplate.get("description").toString(), + unit.forTemplate.get("name").toString(), + unit.forTemplate.get("annotinfo").toString(), + unit.forTemplate.get("readfilters"), + unit.forTemplate.get("downsampling"), + unit.forTemplate.get("group").toString(), + unit.forTemplate.get("annotfield").toString(), + unit.forTemplate.get("annotdescript") + ); + + // Prepare to write JSON entry to file + File outputPathForJSON = new File(destinationDir + "/" + unit.filename + ".json"); + + try { + BufferedWriter outJSON = new BufferedWriter(new FileWriter(outputPathForJSON)); + // Convert object to JSON + Gson gson = new GsonBuilder() + .serializeSpecialFloatingPointValues() + .setPrettyPrinting() + .create(); + String json = gson.toJson(gsonworkunit); // was run on unit.forTemplate + outJSON.write(json); + outJSON.close(); + + } catch (Exception e) { + throw new ReviewedGATKException("Failed to create JSON entry", e); + } + } + + private static String getSimpleVersion(String absoluteVersion) { + String[] parts = absoluteVersion.split("-"); + + // by skipping i=0, there is no trailing separator + for (int i = 1; i < 2; i++) { + parts[0] = parts[0].concat("-"); + parts[0] = parts[0].concat(parts[i]); + } + + return parts[0]; + } + +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GSONArgument.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GSONArgument.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GSONArgument.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GSONArgument.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GSONWorkUnit.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GSONWorkUnit.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/help/GSONWorkUnit.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GSONWorkUnit.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GenericDocumentationHandler.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GenericDocumentationHandler.java new file mode 100644 index 000000000..09c862aa0 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/help/GenericDocumentationHandler.java @@ -0,0 +1,722 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
package org.broadinstitute.gatk.utils.help;

import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import com.sun.javadoc.ClassDoc;
import com.sun.javadoc.FieldDoc;
import com.sun.javadoc.Tag;
import org.apache.log4j.Logger;
import htsjdk.tribble.Feature;
import org.broadinstitute.gatk.utils.commandline.*;
import org.broadinstitute.gatk.utils.refdata.tracks.FeatureManager;
import org.broadinstitute.gatk.utils.Utils;
import org.broadinstitute.gatk.utils.classloader.JVMUtils;
import org.broadinstitute.gatk.utils.collections.Pair;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.exceptions.GATKException;

import java.io.IOException;
import java.lang.reflect.*;
import java.util.*;

/**
 * Documentation handler for generic documented GATK features.
 *
 * Walks the argument definitions of the class under documentation (via the GATK
 * command-line parsing engine) and binds the resulting metadata -- names, types,
 * defaults, numeric ranges, enum options, related docs -- into a map consumed by
 * the FreeMarker templates of the GATKDoclet.
 */
public abstract class GenericDocumentationHandler extends DocumentedGATKFeatureHandler {
    private static Logger logger = Logger.getLogger(GenericDocumentationHandler.class);

    /**
     * The max length of the longest of --fullName / -shortName argument name
     * before we prefer the shorter option. Currently unused because the
     * length-based name selection is disabled (see displayNames), kept for history.
     */
    private static final int MAX_DISPLAY_NAME = 30;

    /**
     * The work unit (class + javadoc) we are currently documenting.
     */
    private GATKDocWorkUnit toProcess;

    @Override
    public boolean includeInDocs(ClassDoc doc) {
        try {
            Class type = DocletUtils.getClassForDoc(doc);
            // Hidden features are only documented when the doclet explicitly asks for them.
            boolean hidden = !getDoclet().showHiddenFeatures() && type.isAnnotationPresent(Hidden.class);
            return !hidden && JVMUtils.isConcrete(type);
        } catch (ClassNotFoundException e) {
            // Class not loadable at doc time -> nothing we can document.
            return false;
        }
    }

    @Override
    public String getTemplateName(ClassDoc doc) throws IOException {
        return "generic.template.html";
    }

    @Override
    public void processOne(GATKDocWorkUnit toProcessArg) {
        this.toProcess = toProcessArg;

        Map<String, Object> root = new HashMap<String, Object>();

        addHighLevelBindings(root);
        addArgumentBindings(root);
        addRelatedBindings(root);
        root.put("group", toProcess.group);

        // Adding in retrieval of peripheral info (rf annotations etc)
        getClazzAnnotations(toProcess.clazz, root);

        toProcess.setHandlerContent((String) root.get("summary"), root);
    }

    /**
     * Add high-level summary information about toProcess to root, such as its
     * name, summary, description, version, etc.
     *
     * @param root the FreeMarker binding map being populated
     */
    protected void addHighLevelBindings(Map<String, Object> root) {
        root.put("name", toProcess.classDoc.name());

        // The first sentence of the javadoc is the summary; the remainder is the description.
        StringBuilder summaryBuilder = new StringBuilder();
        for (Tag tag : toProcess.classDoc.firstSentenceTags())
            summaryBuilder.append(tag.text());
        root.put("summary", summaryBuilder.toString());
        root.put("description", toProcess.classDoc.commentText().substring(summaryBuilder.toString().length()));
        root.put("timestamp", toProcess.buildTimestamp);
        root.put("version", toProcess.absoluteVersion);

        // Expose every explicit javadoc tag (e.g. @author) directly to the template.
        for (Tag tag : toProcess.classDoc.tags()) {
            root.put(tag.name(), tag.text());
        }

        root.put("gotoDev", toProcess.annotation.gotoDev());
    }

    /**
     * Add bindings describing related GATK capabilities to toProcess.
     *
     * @param root the FreeMarker binding map being populated
     */
    protected void addRelatedBindings(Map<String, Object> root) {
        List<Map<String, Object>> extraDocsData = new ArrayList<Map<String, Object>>();

        // add in all of the explicitly related items
        for (final Class extraDocClass : toProcess.annotation.extraDocs()) {
            final GATKDocWorkUnit otherUnit = getDoclet().findWorkUnitForClass(extraDocClass);
            if (otherUnit == null)
                throw new ReviewedGATKException("Requested extraDocs for class without any documentation: " + extraDocClass);
            extraDocsData.add(
                    new HashMap<String, Object>() {{
                        put("filename", otherUnit.filename);
                        put("name", otherUnit.name);
                    }});
        }
        root.put("extradocs", extraDocsData);
    }

    /**
     * Add information about all of the arguments available to toProcess to root.
     *
     * @param root the FreeMarker binding map being populated
     */
    protected void addArgumentBindings(Map<String, Object> root) {
        ParsingEngine parsingEngine = createParsingEngine();

        Map<String, List<Map<String, Object>>> args = createArgumentMap();
        root.put("arguments", args);
        try {
            // loop over all of the arguments according to the parsing engine
            for (final ArgumentSource argumentSource : parsingEngine.extractArgumentSources(DocletUtils.getClassForDoc(toProcess.classDoc))) {
                ArgumentDefinition argDef = argumentSource.createArgumentDefinitions().get(0);
                FieldDoc fieldDoc = getFieldDoc(toProcess.classDoc, argumentSource.field.getName());
                Map<String, Object> argBindings = docForArgument(fieldDoc, argumentSource, argDef);
                if (!argumentSource.isHidden() || getDoclet().showHiddenFeatures()) {
                    final String kind = docKindOfArg(argumentSource);
                    argBindings.put("kind", kind);

                    // Retrieve the default value, if we can instantiate the class to read it.
                    final Object value = argumentValue(toProcess.clazz, argumentSource);
                    argBindings.put("defaultValue", value != null ? prettyPrintValueString(value) : "NA");

                    // Retrieve min and max / hard and soft value thresholds for numeric args.
                    // Non-numeric args (and numeric args without an @Argument annotation) get "NA"
                    // for all four keys so templates/GSON can rely on the keys being present.
                    if (value instanceof Number && argumentSource.field.isAnnotationPresent(Argument.class)) {
                        final Argument annotation = argumentSource.field.getAnnotation(Argument.class);
                        argBindings.put("minValue", annotation.minValue());
                        argBindings.put("maxValue", annotation.maxValue());
                        argBindings.put("minRecValue",
                                annotation.minRecommendedValue() != Double.NEGATIVE_INFINITY
                                        ? annotation.minRecommendedValue() : "NA");
                        argBindings.put("maxRecValue",
                                annotation.maxRecommendedValue() != Double.POSITIVE_INFINITY
                                        ? annotation.maxRecommendedValue() : "NA");
                    } else {
                        argBindings.put("minValue", "NA");
                        argBindings.put("maxValue", "NA");
                        argBindings.put("minRecValue", "NA");
                        argBindings.put("maxRecValue", "NA");
                    }

                    // Finalize argument bindings
                    args.get(kind).add(argBindings);
                    args.get("all").add(argBindings);
                }
            }

            // sort the arguments
            for (Map.Entry<String, List<Map<String, Object>>> entry : args.entrySet()) {
                entry.setValue(sortArguments(entry.getValue()));
            }

            // make a GSON-friendly list of the arguments
            List<GSONArgument> allGSONArgs = new ArrayList<GSONArgument>();
            for (Map<String, Object> item : args.get("all")) {
                GSONArgument itemGSONArg = new GSONArgument();
                itemGSONArg.populate(item.get("summary").toString(),
                        item.get("name").toString(),
                        item.get("synonyms").toString(),
                        item.get("type").toString(),
                        item.get("required").toString(),
                        item.get("fulltext").toString(),
                        item.get("defaultValue").toString(),
                        item.get("minValue").toString(),
                        item.get("maxValue").toString(),
                        item.get("minRecValue").toString(),
                        item.get("maxRecValue").toString(),
                        item.get("rodTypes").toString(),
                        item.get("kind").toString(),
                        (List<Map<String, Object>>) item.get("options")
                );
                allGSONArgs.add(itemGSONArg);
            }
            root.put("gson-arguments", allGSONArgs);

        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Return the argument kind (required, advanced, hidden, etc) of this argumentSource.
     *
     * @param argumentSource the argument to categorize
     * @return a non-null kind string used as a key into the argument map
     */
    @Requires("argumentSource != null")
    @Ensures("result != null")
    private String docKindOfArg(ArgumentSource argumentSource) {
        if (argumentSource.isRequired()) {
            if (argumentSource.isInput()) return "required_in";
            else if (argumentSource.isOutput()) return "required_out";
            else if (argumentSource.isFlag()) return "required_flag";
            else return "required_param";
        } else if (argumentSource.isAdvanced()) {
            if (argumentSource.isInput()) return "advanced_in";
            else if (argumentSource.isOutput()) return "advanced_out";
            else if (argumentSource.isFlag()) return "advanced_flag";
            else return "advanced_param";
        } else if (argumentSource.isHidden()) return "hidden";
        else if (argumentSource.isDeprecated()) return "deprecated";
        else {
            if (argumentSource.isInput()) return "optional_in";
            else if (argumentSource.isOutput()) return "optional_out";
            else if (argumentSource.isFlag()) return "optional_flag";
            else return "optional_param";
        }
    }

    /**
     * Attempts to determine the value of argumentSource in an instantiated version of c.
     *
     * NOTE(review): the value is actually read from an instance of toProcess.clazz, not
     * of the c parameter -- callers currently pass toProcess.clazz for c anyway.
     *
     * @param c              class carrying the argument (kept for signature compatibility)
     * @param argumentSource the argument whose default value we want
     * @return value of argumentSource, or null if this isn't possible
     */
    @Requires({"c != null", "argumentSource != null"})
    private Object argumentValue(Class c, ArgumentSource argumentSource) {
        // get the value of the field; attempt to instantiate the class
        final Object instance = makeInstanceIfPossible(toProcess.clazz);
        if (instance != null) {
            final Object value = getFieldValue(instance, argumentSource.field.getName());
            if (value != null)
                return value;

            if (argumentSource.createsTypeDefault()) {
                try { // handle the case where there's an implicit default
                    return argumentSource.typeDefaultDocString();
                } catch (ReviewedGATKException e) {
                    ; // failed to create type default, don't worry about it
                }
            }
        }

        return null;
    }

    /**
     * Create the argument map for holding class arguments, with one (initially empty)
     * list per argument kind plus an "all" list.
     *
     * @return a non-null map keyed by argument kind
     */
    private Map<String, List<Map<String, Object>>> createArgumentMap() {
        Map<String, List<Map<String, Object>>> args = new HashMap<String, List<Map<String, Object>>>();
        args.put("all", new ArrayList<Map<String, Object>>());
        args.put("required_in", new ArrayList<Map<String, Object>>());
        args.put("required_out", new ArrayList<Map<String, Object>>());
        args.put("required_param", new ArrayList<Map<String, Object>>());
        args.put("required_flag", new ArrayList<Map<String, Object>>());
        args.put("optional_in", new ArrayList<Map<String, Object>>());
        args.put("optional_out", new ArrayList<Map<String, Object>>());
        args.put("optional_param", new ArrayList<Map<String, Object>>());
        args.put("optional_flag", new ArrayList<Map<String, Object>>());
        args.put("advanced_in", new ArrayList<Map<String, Object>>());
        args.put("advanced_out", new ArrayList<Map<String, Object>>());
        args.put("advanced_param", new ArrayList<Map<String, Object>>());
        args.put("advanced_flag", new ArrayList<Map<String, Object>>());
        args.put("hidden", new ArrayList<Map<String, Object>>());
        args.put("deprecated", new ArrayList<Map<String, Object>>());
        return args;
    }

    /**
     * Sorts the individual argument list in unsorted according to CompareArgumentsByName.
     *
     * @param unsorted the list to sort (sorted in place)
     * @return the same list, sorted
     */
    private List<Map<String, Object>> sortArguments(List<Map<String, Object>> unsorted) {
        Collections.sort(unsorted, new CompareArgumentsByName());
        return unsorted;
    }

    /**
     * Sort arguments by case-insensitive comparison ignoring the -- and - prefixes.
     */
    private class CompareArgumentsByName implements Comparator<Map<String, Object>> {
        public int compare(Map<String, Object> x, Map<String, Object> y) {
            return elt(x).compareTo(elt(y));
        }

        private String elt(Map<String, Object> m) {
            String v = m.get("name").toString().toLowerCase();
            if (v.startsWith("--"))
                return v.substring(2);
            else if (v.startsWith("-"))
                return v.substring(1);
            else
                throw new RuntimeException("Expect to see arguments beginning with at least one -, but found " + v);
        }
    }

    /**
     * Umbrella function that groups the collection of values for specific annotations applied to an
     * instance of class c. Lists of collected values are added directly to the "toProcess" object.
     * Requires being able to instantiate the class.
     *
     * @param classToProcess the object to instantiate and query for the annotation
     * @param root           the root of the document handler, to which we'll store collected annotations
     */
    protected abstract void getClazzAnnotations(Class classToProcess, Map<String, Object> root);

    /**
     * Utility function that finds the value of fieldName in any fields of ArgumentCollection fields in
     * instance of class c.
     *
     * @param instance  the object to query for the field value
     * @param fieldName the name of the field we are looking for in instance
     * @return The value assigned to field in the ArgumentCollection, otherwise null
     */
    private Object getFieldValue(Object instance, String fieldName) {
        //
        // subtle note. If you have a field named X that is an ArgumentCollection that
        // contains a field X as well, you need only consider fields in the argumentCollection, not
        // matching the argument itself.
        //
        // @ArgumentCollection
        // protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
        //
        for (Field field : JVMUtils.getAllFields(instance.getClass())) {
            if (field.isAnnotationPresent(ArgumentCollection.class)) {
                // Recurse into the argument collection before considering the field itself.
                Object fieldValue = JVMUtils.getFieldValue(field, instance);
                Object value = getFieldValue(fieldValue, fieldName);
                if (value != null)
                    return value;
            } else if (field.getName().equals(fieldName)) {
                return JVMUtils.getFieldValue(field, instance);
            }
        }

        return null;
    }

    /**
     * Pretty prints value for display in the docs.
     *
     * Assumes value != null.
     *
     * @param value the default value to render
     * @return a human-readable rendering of value
     */
    private Object prettyPrintValueString(Object value) {
        if (value.getClass().isArray()) {
            Class type = value.getClass().getComponentType();
            if (boolean.class.isAssignableFrom(type))
                return Arrays.toString((boolean[]) value);
            if (byte.class.isAssignableFrom(type))
                return Arrays.toString((byte[]) value);
            if (char.class.isAssignableFrom(type))
                return Arrays.toString((char[]) value);
            if (double.class.isAssignableFrom(type))
                return Arrays.toString((double[]) value);
            if (float.class.isAssignableFrom(type))
                return Arrays.toString((float[]) value);
            if (int.class.isAssignableFrom(type))
                return Arrays.toString((int[]) value);
            if (long.class.isAssignableFrom(type))
                return Arrays.toString((long[]) value);
            if (short.class.isAssignableFrom(type))
                return Arrays.toString((short[]) value);
            if (Object.class.isAssignableFrom(type))
                return Arrays.toString((Object[]) value);
            else
                throw new RuntimeException("Unexpected array type in prettyPrintValue. Value was " + value + " type is " + type);
        } else if (RodBinding.class.isAssignableFrom(value.getClass())) {
            // annoying special case to handle the UnBound() constructor
            return "none";
        } else if (value instanceof String) {
            return value.equals("") ? "\"\"" : value;
        } else {
            return value.toString();
        }
    }

    /**
     * Attempt to instantiate class c, if possible. Returns null if this proves impossible.
     *
     * @param c the class to instantiate
     * @return a fresh instance of c, or null
     */
    protected Object makeInstanceIfPossible(Class c) {
        Object instance = null;
        try {
            // don't try to make something where we will obviously fail
            // (note: short-circuit && so isConcrete is only queried when needed)
            if (!c.isEnum() && !c.isAnnotation() && !c.isAnonymousClass() &&
                    !c.isArray() && !c.isPrimitive() && JVMUtils.isConcrete(c)) {
                instance = c.newInstance();
                return instance;
            } else
                return null;
        } catch (IllegalAccessException e) {
        } catch (InstantiationException e) {
        } catch (ExceptionInInitializerError e) {
        } catch (SecurityException e) {
        }
        // this last one is super dangerous, but some of these methods catch ClassNotFoundExceptions
        // and rethrow then as RuntimeExceptions
        catch (RuntimeException e) {
        }

        return instance;
    }

    /**
     * Create an instance of the GATK parsing engine, for argument processing with GATKDoclet.
     *
     * @return a non-null parsing engine primed by a no-argument start of the CLP
     */
    private ParsingEngine createParsingEngine() {
        CommandLineProgram clp = createCommandLineProgram();
        try {
            CommandLineProgram.start(clp, new String[]{}, true);
            return clp.parser;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    protected abstract CommandLineProgram createCommandLineProgram();

    /**
     * Gets the javadocs associated with field name in classDoc. Throws a
     * runtime exception if this proves impossible.
     *
     * @param classDoc javadoc for the class containing the field
     * @param name     name of the field
     * @return the javadoc for the named field
     */
    private FieldDoc getFieldDoc(ClassDoc classDoc, String name) {
        return getFieldDoc(classDoc, name, true);
    }

    /**
     * Recursive helper routine to getFieldDoc(): searches this class's fields, then any
     * @ArgumentCollection fields, then the superclass chain.
     *
     * @param classDoc javadoc for the class to search
     * @param name     name of the field
     * @param primary  true only for the outermost call; controls whether failure throws or returns null
     * @return the javadoc for the named field, or null (non-primary searches only)
     */
    private FieldDoc getFieldDoc(ClassDoc classDoc, String name, boolean primary) {
        for (FieldDoc fieldDoc : classDoc.fields(false)) {
            if (fieldDoc.name().equals(name))
                return fieldDoc;

            Field field = DocletUtils.getFieldForFieldDoc(fieldDoc);
            if (field == null)
                throw new RuntimeException("Could not find the field corresponding to " + fieldDoc + ", presumably because the field is inaccessible");
            if (field.isAnnotationPresent(ArgumentCollection.class)) {
                ClassDoc typeDoc = getRootDoc().classNamed(fieldDoc.type().qualifiedTypeName());
                if (typeDoc == null)
                    throw new ReviewedGATKException("Tried to get javadocs for ArgumentCollection field " + fieldDoc + " but couldn't find the class in the RootDoc");
                else {
                    FieldDoc result = getFieldDoc(typeDoc, name, false);
                    if (result != null)
                        return result;
                    // else keep searching
                }
            }
        }

        // if we didn't find it here, wander up to the superclass to find the field
        if (classDoc.superclass() != null) {
            return getFieldDoc(classDoc.superclass(), name, false);
        }

        if (primary)
            throw new RuntimeException("No field found for expected field " + name);
        else
            return null;
    }

    /**
     * Returns a Pair of (main, synonym) names for argument with shortName s1 and
     * fullName s2.
     *
     * Previously we had it so the main name was selected to be the longest of the two, provided
     * it didn't exceed MAX_DISPLAY_NAME, in which case the shorter was taken. But we now disable
     * the length-based name rearrangement in order to maintain consistency in the GATKDocs table.
     *
     * This may cause messed up spacing in the CLI-help display but we don't care as much about that
     * since more users use the online GATKDocs for looking up arguments.
     *
     * @param s1 the short argument name without -, or null if not provided
     * @param s2 the long argument name without --, or null if not provided
     * @return A pair of fully qualified names (with - or --) for the argument. The first
     *         element is the primary display name while the second (potentially null) is a
     *         synonymous name.
     */
    Pair<String, String> displayNames(String s1, String s2) {
        s1 = s1 == null ? null : "-" + s1;
        s2 = s2 == null ? null : "--" + s2;

        if (s1 == null) return new Pair<String, String>(s2, null);
        if (s2 == null) return new Pair<String, String>(s1, null);

        return new Pair<String, String>(s2, s1);
    }

    /**
     * Returns a human readable string that describes the Type type of a GATK argument.
     *
     * This will include parameterized types, so that a parameterized Set shows up with its
     * element type in the docs and not just as "Set".
     *
     * @param type the reflected type of the argument field
     * @return a human-readable type string
     */
    protected String argumentTypeString(Type type) {
        if (type instanceof ParameterizedType) {
            ParameterizedType parameterizedType = (ParameterizedType) type;
            List<String> subs = new ArrayList<String>();
            for (Type actualType : parameterizedType.getActualTypeArguments())
                subs.add(argumentTypeString(actualType));
            return argumentTypeString(((ParameterizedType) type).getRawType()) + "[" + Utils.join(",", subs) + "]";
        } else if (type instanceof GenericArrayType) {
            return argumentTypeString(((GenericArrayType) type).getGenericComponentType()) + "[]";
        } else if (type instanceof WildcardType) {
            throw new RuntimeException("We don't support wildcards in arguments: " + type);
        } else if (type instanceof Class) {
            return ((Class) type).getSimpleName();
        } else {
            throw new GATKException("Unknown type: " + type);
        }
    }

    /**
     * Helper routine that returns the Feature.class required by a RodBinding,
     * either T for a RodBinding of T or a List of RodBindings of T. Returns null if
     * the Type doesn't fit either model.
     *
     * @param type the reflected type of the argument field
     * @return the bound Feature class, or null
     */
    protected Class<? extends Feature> getFeatureTypeIfPossible(Type type) {
        if (type instanceof ParameterizedType) {
            ParameterizedType paramType = (ParameterizedType) type;
            if (RodBinding.class.isAssignableFrom((Class) paramType.getRawType())) {
                return (Class<? extends Feature>) JVMUtils.getParameterizedTypeClass(type);
            } else {
                for (Type paramtype : paramType.getActualTypeArguments()) {
                    Class<? extends Feature> x = getFeatureTypeIfPossible(paramtype);
                    if (x != null)
                        return x;
                }
            }
        }

        return null;
    }

    /**
     * High-level entry point for creating a FreeMarker map describing the GATK argument
     * source with definition def, with associated javadoc fieldDoc.
     *
     * @param fieldDoc javadoc for the argument's field
     * @param source   the argument source
     * @param def      the argument definition
     * @return a non-null Map binding argument keys with their values
     */
    protected Map<String, Object> docForArgument(FieldDoc fieldDoc, ArgumentSource source, ArgumentDefinition def) {
        Map<String, Object> root = new HashMap<String, Object>();
        Pair<String, String> names = displayNames(def.shortName, def.fullName);

        root.put("name", names.getFirst());

        if (names.getSecond() != null) {
            root.put("synonyms", names.getSecond());
        } else {
            root.put("synonyms", "NA");
        }

        root.put("required", def.required ? "yes" : "no");

        // type of the field
        root.put("type", argumentTypeString(source.field.getGenericType()));

        Class<? extends Feature> featureClass = getFeatureTypeIfPossible(source.field.getGenericType());
        if (featureClass != null) {
            // deal with the allowable types
            FeatureManager manager = new FeatureManager();
            List<String> rodTypes = new ArrayList<String>();
            for (FeatureManager.FeatureDescriptor descriptor : manager.getByFeature(featureClass)) {
                rodTypes.add(String.format("<a href=%s>%s</a>",
                        GATKDocUtils.phpFilenameForClass(descriptor.getCodecClass()),
                        descriptor.getName()));
            }

            root.put("rodTypes", Utils.join(", ", rodTypes));
        } else {
            root.put("rodTypes", "NA");
        }

        // summary and fulltext
        root.put("summary", def.doc != null ? def.doc : "");
        root.put("fulltext", fieldDoc.commentText());

        // What are our enum options?
        if (def.validOptions != null) {
            root.put("options", docForEnumArgument(source.field.getType()));
        } else {
            root.put("options", new ArrayList<Map<String, Object>>());
        }

        // general attributes
        List<String> attributes = new ArrayList<String>();
        if (def.required) attributes.add("required");
        if (source.isDeprecated()) attributes.add("deprecated");
        if (attributes.size() > 0) {
            root.put("attributes", Utils.join(", ", attributes));
        } else {
            root.put("attributes", "NA");
        }
        return root;
    }

    /**
     * Helper routine that provides a FreeMarker map for an enumClass, grabbing the
     * values of the enum and their associated javadoc documentation.
     *
     * @param enumClass the enum class to document
     * @return one map ({name, summary}) per enum constant
     */
    @Requires("enumClass.isEnum()")
    private List<Map<String, Object>> docForEnumArgument(final Class enumClass) {
        final ClassDoc doc = this.getDoclet().getClassDocForClass(enumClass);
        if (doc == null)
            throw new RuntimeException("Tried to get docs for enum " + enumClass + " but got null instead");

        final Set<String> enumConstantFieldNames = enumConstantsNames(enumClass);

        final List<Map<String, Object>> bindings = new ArrayList<Map<String, Object>>();
        for (final FieldDoc fieldDoc : doc.fields(false)) {
            if (enumConstantFieldNames.contains(fieldDoc.name()))
                bindings.add(
                        new HashMap<String, Object>() {{
                            put("name", fieldDoc.name());
                            put("summary", fieldDoc.commentText());
                        }});
        }

        return bindings;
    }

    /**
     * Returns the name of the fields that are enum constants according to reflection.
     *
     * @param enumClass the enum class to inspect
     * @return a non-null set of fields that are enum constants
     */
    private Set<String> enumConstantsNames(final Class enumClass) {
        final Set<String> enumConstantFieldNames = new HashSet<String>();

        for (final Field field : enumClass.getFields()) {
            if (field.isEnumConstant())
                enumConstantFieldNames.add(field.getName());
        }

        return enumConstantFieldNames;
    }
}
package org.broadinstitute.gatk.utils.help;

/**
 * Shared constants for the GATK help and documentation system: canonical
 * website/forum URLs, parallelism argument names, documentation category
 * titles, and developer name codes used for issue routing.
 */
public class HelpConstants {

    // Canonical GATK website and forum endpoints used throughout the docs.
    public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk";
    public final static String GATK_DOCS_URL = BASE_GATK_URL + "/tooldocs/";
    public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/";
    public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/";

    /**
     * Arguments for parallelism options.
     */
    public final static String ARG_TREEREDUCIBLE = "-nt";    // data-thread parallelism flag
    public final static String ARG_NANOSCHEDULABLE = "-nct"; // CPU-thread parallelism flag

    /**
     * Definition of the group names / categories of tools.
     * The names get parsed to make supercategories in the doc index,
     * so be careful when making big changes -- see GATKDoclet.java toMap()
     */
    public final static String DOCS_CAT_DATA = "Sequence Data Processing Tools";
    public final static String DOCS_CAT_QC = "Diagnostics and Quality Control Tools";
    public final static String DOCS_CAT_ENGINE = "Engine Parameters (available to all tools)";
    public final static String DOCS_CAT_RF = "Read Filters";
    public final static String DOCS_CAT_REFUTILS = "Reference Utilities";
    public final static String DOCS_CAT_RODCODECS = "ROD Codecs";
    public final static String DOCS_CAT_USRERR = "User Exceptions (DevZone)";
    public final static String DOCS_CAT_VALIDATION = "Validation Utilities";
    public final static String DOCS_CAT_ANNOT = "Variant Annotations";
    public final static String DOCS_CAT_VARDISC = "Variant Discovery Tools";
    public final static String DOCS_CAT_VARMANIP = "Variant Evaluation and Manipulation Tools";
    public final static String DOCS_CAT_TOY = "Toy Walkers (DevZone)";
    public final static String DOCS_CAT_HELPUTILS = "Help Utilities";

    /**
     * Builds the absolute URL for a forum post from its relative path.
     */
    public static String forumPost(String post) {
        return GATK_FORUM_URL + post;
    }

    /**
     * Go-to developer name codes for tracking and display purposes. Only current team members should be in this list.
     * When someone leaves, their charges should be redistributed. The actual string should be closest to the dev's
     * abbreviated name or two/three-letter nickname as possible. The code can be something else if necessary to
     * disambiguate from other variable.
     */
    public final static String MC = "MC";    // Mauricio Carneiro
    public final static String EB = "EB";    // Eric Banks
    public final static String RP = "RP";    // Ryan Poplin
    public final static String GVDA = "GG";  // Geraldine Van der Auwera
    public final static String VRR = "VRR";  // Valentin Ruano-Rubio
    public final static String ALM = "ALM";  // Ami Levy-Moonshine
    public final static String BH = "BH";    // Bertrand Haas
    public final static String JoT = "JT";   // Joel Thibault
    public final static String DR = "DR";    // David Roazen
    public final static String KS = "KS";    // Khalid Shakir

}
package org.broadinstitute.gatk.utils.help;

import com.sun.javadoc.*;
import org.broadinstitute.gatk.utils.Utils;

import java.io.*;
import java.util.*;

/**
 * Extracts certain types of javadoc (specifically package and class descriptions) and makes them available
 * to applications at runtime.
 *
 * @author mhanna
 * @version 0.1
 */
public class ResourceBundleExtractorDoclet {
    /**
     * Taglet for the particular version number.
     */
    public static final String VERSION_TAGLET_NAME = "version";
    public static final String SUMMARY_TAGLET_NAME = "help.summary";
    public static final String DESCRIPTION_TAGLET_NAME = "help.description";

    /**
     * Maintains a collection of resources in memory as they're accumulated.
     */
    protected final Properties resourceText = new Properties();

    /**
     * Maintains a collection of classes that should really be documented.
     */
    protected final Set<String> undocumentedClasses = new HashSet<String>();

    // Values of the -build-timestamp / -absolute-version doclet options; may remain
    // null if the option was not supplied on the javadoc command line.
    protected String buildTimestamp = null, absoluteVersion = null;

    /**
     * Extracts the contents of certain types of javadoc and adds them to an XML file.
     *
     * @param rootDoc The documentation root.
     * @return Whether the JavaDoc run succeeded.
     * @throws IOException if output can't be written.
     */
    public static boolean start(RootDoc rootDoc) throws IOException {
        ResourceBundleExtractorDoclet doclet = new ResourceBundleExtractorDoclet();
        PrintStream out = doclet.loadData(rootDoc, true);
        doclet.processDocs(rootDoc, out);
        return true;
    }

    /**
     * Reads the doclet options (-out, -build-timestamp, -absolute-version), preloads any
     * existing resource file, and returns the stream the resource bundle should be written to.
     *
     * @param rootDoc                the documentation root carrying the command-line options
     * @param overwriteResourcesFile whether to reopen the -out file for writing
     * @return the output stream for the resource bundle (System.out when no -out was given)
     */
    protected PrintStream loadData(RootDoc rootDoc, boolean overwriteResourcesFile) {
        PrintStream out = System.out;

        for (String[] options : rootDoc.options()) {
            if (options[0].equals("-out")) {
                try {
                    loadExistingResourceFile(options[1], rootDoc);
                    if (overwriteResourcesFile)
                        out = new PrintStream(options[1]);
                } catch (FileNotFoundException e) {
                    throw new RuntimeException(e);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            if (options[0].equals("-build-timestamp"))
                buildTimestamp = options[1];
            if (options[0].equals("-absolute-version"))
                absoluteVersion = options[1];
        }

        // Properties.setProperty rejects null values (Hashtable contract), so guard
        // against a missing -build-timestamp option instead of crashing with an NPE.
        resourceText.setProperty("build.timestamp", buildTimestamp != null ? buildTimestamp : "[unknown]");
        return out;
    }

    /**
     * Walks every class known to the javadoc run, records its summary/description text
     * (and that of its package), stores the bundle to out, and warns about classes that
     * should be documented but aren't.
     *
     * @param rootDoc the documentation root
     * @param out     destination stream for the properties bundle
     */
    protected void processDocs(RootDoc rootDoc, PrintStream out) {
        // Cache packages as we see them, since there's no direct way to iterate over packages.
        Set<PackageDoc> packages = new HashSet<PackageDoc>();

        for (ClassDoc currentClass : rootDoc.classes()) {
            PackageDoc containingPackage = currentClass.containingPackage();
            packages.add(containingPackage);

            if (isRequiredJavadocMissing(currentClass) && shouldDocument(currentClass))
                undocumentedClasses.add(currentClass.name());

            renderHelpText(DocletUtils.getClassName(currentClass), currentClass);
        }

        for (PackageDoc currentPackage : packages)
            renderHelpText(currentPackage.name(), currentPackage);

        try {
            resourceText.store(out, "Strings displayed by the GATK help system");
        } catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        // ASCII codes for making text blink
        final String blink = "\u001B\u005B\u0035\u006D";
        final String reset = "\u001B\u005B\u006D";

        if (undocumentedClasses.size() > 0)
            Utils.warnUser(String.format("The following are currently undocumented: %s%s%s", blink, Utils.join(" ", undocumentedClasses), reset));
    }

    /**
     * Validate the given options against options supported by this doclet.
     *
     * @param option Option to validate.
     * @return Number of potential parameters; 0 if not supported.
     */
    public static int optionLength(String option) {
        if (option.equals("-build-timestamp") || option.equals("-out") || option.equals("-absolute-version")) {
            return 2;
        }
        return 0;
    }

    /**
     * Attempts to load the contents of the resource file named by resourceFileName into
     * our in-memory resource collection resourceText. If the resource file doesn't exist,
     * prints a notice to the user but does not throw an exception back to the calling method,
     * since we'll just create a new resource file from scratch in that case.
     *
     * @param resourceFileName name of the resource file to attempt to load.
     * @param rootDoc          the documentation root.
     * @throws IOException if there is an I/O-related error other than FileNotFoundException
     *                     while attempting to read the resource file.
     */
    private void loadExistingResourceFile(String resourceFileName, RootDoc rootDoc) throws IOException {
        try {
            BufferedReader resourceFile = new BufferedReader(new FileReader(resourceFileName));
            try {
                resourceText.load(resourceFile);
            } finally {
                resourceFile.close();
            }
        } catch (FileNotFoundException e) {
            rootDoc.printNotice("Resource file not found -- generating a new one from scratch.");
        }
    }

    /**
     * Determine whether a given class should be documented.
     *
     * @param classDoc the type of the given class.
     * @return True if the class should be documented. False otherwise.
     */
    protected static boolean shouldDocument(ClassDoc classDoc) {
        // TODO: Code duplication with GATKDoclet, including DocletUtils.getClassForDoc().
        // TODO: Refactor common methods into DocletUtils, and possibly just use DocumentGATKFeatureObjects.
        final Class<?> docClass;
        try {
            docClass = DocletUtils.getClassForDoc(classDoc);
        } catch (ClassNotFoundException e) {
            return false;
        } catch (NoClassDefFoundError e) {
            return false;
        } catch (UnsatisfiedLinkError e) {
            return false; // naughty BWA bindings
        }
        final DocumentedGATKFeature f = docClass.getAnnotation(DocumentedGATKFeature.class);
        return f != null && f.enable();
    }

    /**
     * Is the javadoc for the given class missing?
     *
     * @param classDoc Class for which to inspect the JavaDoc.
     * @return True if the JavaDoc is missing. False otherwise.
     */
    private static boolean isRequiredJavadocMissing(ClassDoc classDoc) {
        return classDoc.commentText().length() == 0 || classDoc.commentText().contains("Created by IntelliJ");
    }

    /**
     * Renders all the help text required for a given name.
     *
     * @param elementName element name to use as the key
     * @param element     Doc element to process.
     */
    private void renderHelpText(String elementName, Doc element) {
        StringBuilder summaryBuilder = new StringBuilder();
        for (Tag tag : element.firstSentenceTags())
            summaryBuilder.append(tag.text());
        String summary = summaryBuilder.toString();
        String description = element.commentText();

        // this might seem unnecessary, but the GATK command line program uses this tag to determine the version when running
        if (absoluteVersion != null)
            resourceText.setProperty(String.format("%s.%s", elementName, VERSION_TAGLET_NAME), absoluteVersion);

        // Write out an alternate element summary, if exists.
        resourceText.setProperty(String.format("%s.%s", elementName, SUMMARY_TAGLET_NAME), formatText(summary));

        // Write out an alternate description, if present.
        resourceText.setProperty(String.format("%s.%s", elementName, DESCRIPTION_TAGLET_NAME), formatText(description));
    }

    /**
     * Format text for consumption by the properties file.
     *
     * @param text Text to format.
     * @return Formatted text; string trimmed, newlines removed.
     */
    private static String formatText(String text) {
        Scanner scanner = new Scanner(text);
        StringBuilder output = new StringBuilder();

        while (scanner.hasNextLine()) {
            if (output.length() > 0)
                output.append(' ');
            output.append(scanner.nextLine().trim());
        }

        return output.toString();
    }
}
+1,895 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.interval; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.util.Interval; +import htsjdk.samtools.util.IntervalList; +import htsjdk.samtools.SAMFileHeader; +import org.apache.log4j.Logger; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.commandline.IntervalArgumentCollection; +import org.broadinstitute.gatk.utils.commandline.IntervalBinding; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.text.XReadLines; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +/** + * Parse text representations of interval strings that + * can appear in GATK-based applications. + * + * @author mhanna + * @version 0.1 + */ +public class IntervalUtils { + private static Logger logger = Logger.getLogger(IntervalUtils.class); + + /** + * Turns a set of strings describing intervals into a parsed set of intervals. Valid string elements can be files, + * intervals in samtools notation (chrA:B-C), or some combination of the above separated by semicolons. Additionally, + * 'all' can be supplied to indicate all possible intervals, but 'all' must be exclusive of all other interval + * specifications. + * + * @param parser Genome loc parser. + * @param argList A list of strings containing interval data. + * @return an unsorted, unmerged representation of the given intervals. Null is used to indicate that all intervals should be used. 
+ */ + public static List parseIntervalArguments(GenomeLocParser parser, List argList) { + List rawIntervals = new ArrayList(); // running list of raw GenomeLocs + + if (argList != null) { // now that we can be in this function if only the ROD-to-Intervals was provided, we need to + // ensure that the arg list isn't null before looping. + for (String argument : argList) { + rawIntervals.addAll(parseIntervalArguments(parser, argument)); + } + } + + return rawIntervals; + } + + public static List parseIntervalArguments(GenomeLocParser parser, String arg) { + List rawIntervals = new ArrayList(); // running list of raw GenomeLocs + + if ( arg.indexOf(';') != -1 ) { + throw new UserException.BadArgumentValue("-L " + arg, "The legacy -L \"interval1;interval2\" syntax " + + "is no longer supported. Please use one -L argument for each " + + "interval or an interval file instead."); + } + + // if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'. + if (isUnmapped(arg)) + rawIntervals.add(GenomeLoc.UNMAPPED); + // if it's a file, add items to raw interval list + else if (isIntervalFile(arg)) { + try { + rawIntervals.addAll(intervalFileToList(parser, arg)); + } + catch ( UserException.MalformedGenomeLoc e ) { + throw e; + } + catch ( Exception e ) { + throw new UserException.MalformedFile(arg, "Interval file could not be parsed in any supported format.", e); + } + } + // otherwise treat as an interval -> parse and add to raw interval list + else { + rawIntervals.add(parser.parseGenomeLoc(arg)); + } + + return rawIntervals; + } + + /** + * Read a file of genome locations to process. The file may be in BED, Picard, + * or GATK interval format. 
+ * + * @param glParser GenomeLocParser + * @param file_name interval file + * @return List List of Genome Locs that have been parsed from file + */ + public static List intervalFileToList(final GenomeLocParser glParser, final String file_name) { + // try to open file + File inputFile = new File(file_name); + List ret = new ArrayList(); + + // case: BED file + if ( file_name.toUpperCase().endsWith(".BED") ) { + // this is now supported in Tribble + throw new ReviewedGATKException("BED files must be parsed through Tribble; parsing them as intervals through the GATK engine is no longer supported"); + } + else { + /** + * IF not a BED file: + * first try to read it as a Picard interval file since that's well structured + * we'll fail quickly if it's not a valid file. + */ + boolean isPicardInterval = false; + try { + // Note: Picard will skip over intervals with contigs not in the sequence dictionary + IntervalList il = IntervalList.fromFile(inputFile); + isPicardInterval = true; + + int nInvalidIntervals = 0; + for (Interval interval : il.getIntervals()) { + if ( glParser.isValidGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true)) + ret.add(glParser.createGenomeLoc(interval.getSequence(), interval.getStart(), interval.getEnd(), true)); + else { + nInvalidIntervals++; + } + } + if ( nInvalidIntervals > 0 ) + logger.warn("Ignoring " + nInvalidIntervals + " invalid intervals from " + inputFile); + } + + // if that didn't work, try parsing file as a GATK interval file + catch (Exception e) { + if ( isPicardInterval ) // definitely a picard file, but we failed to parse + throw new UserException.CouldNotReadInputFile(inputFile, e); + else { + try { + XReadLines reader = new XReadLines(new File(file_name)); + for(String line: reader) { + if ( line.trim().length() > 0 ) { + ret.add(glParser.parseGenomeLoc(line)); + } + } + reader.close(); + } + catch (IOException e2) { + throw new UserException.CouldNotReadInputFile(inputFile, e2); + } + } + } + } 
+ + return ret; + } + + /** + * Returns true if the interval string is the "unmapped" interval + * @param interval Interval to check + * @return true if the interval string is the "unmapped" interval + */ + public static boolean isUnmapped(String interval) { + return (interval != null && interval.trim().toLowerCase().equals("unmapped")); + } + + /** + * merge two interval lists, using an interval set rule + * @param setOne a list of genomeLocs, in order (cannot be NULL) + * @param setTwo a list of genomeLocs, also in order (cannot be NULL) + * @param rule the rule to use for merging, i.e. union, intersection, etc + * @return a list, correctly merged using the specified rule + */ + public static List mergeListsBySetOperator(List setOne, List setTwo, IntervalSetRule rule) { + // shortcut, if either set is zero, return the other set + if (setOne == null || setOne.size() == 0 || setTwo == null || setTwo.size() == 0) + return Collections.unmodifiableList((setOne == null || setOne.size() == 0) ? setTwo : setOne); + + // our master list, since we can't guarantee removal time in a generic list + LinkedList retList = new LinkedList(); + + // if we're set to UNION, just add them all + if (rule == null || rule == IntervalSetRule.UNION) { + retList.addAll(setOne); + retList.addAll(setTwo); + return Collections.unmodifiableList(retList); + } + + // else we're INTERSECTION, create two indexes into the lists + int iOne = 0; + int iTwo = 0; + + // merge the second into the first using the rule + while (iTwo < setTwo.size() && iOne < setOne.size()) + // if the first list is ahead, drop items off the second until we overlap + if (setTwo.get(iTwo).isBefore(setOne.get(iOne))) + iTwo++; + // if the second is ahead, drop intervals off the first until we overlap + else if (setOne.get(iOne).isBefore(setTwo.get(iTwo))) + iOne++; + // we overlap, intersect the two intervals and add the result. Then remove the interval that ends first. 
+ else { + retList.add(setOne.get(iOne).intersect(setTwo.get(iTwo))); + if (setOne.get(iOne).getStop() < setTwo.get(iTwo).getStop()) iOne++; + else iTwo++; + } + + //if we have an empty list, throw an exception. If they specified intersection and there are no items, this is bad. + if (retList.size() == 0) + throw new UserException.BadInput("The INTERSECTION of your -L options produced no intervals."); + + // we don't need to add the rest of remaining locations, since we know they don't overlap. return what we have + return Collections.unmodifiableList(retList); + } + + /** + * Sorts and merges an interval list. Multiple techniques are available for merging: ALL, which combines + * all overlapping and abutting intervals into an interval that spans the union of all covered bases, and + * OVERLAPPING_ONLY, which unions overlapping intervals but keeps abutting intervals separate. + * + * @param parser Genome loc parser for the intervals. + * @param intervals A collection of intervals to merge. + * @param mergingRule A descriptor for the type of merging to perform. + * @return A sorted, merged version of the intervals passed in. + */ + public static GenomeLocSortedSet sortAndMergeIntervals(GenomeLocParser parser, List intervals, IntervalMergingRule mergingRule) { + // Make a copy of the (potentially unmodifiable) list to be sorted + intervals = new ArrayList(intervals); + // sort raw interval list + Collections.sort(intervals); + // now merge raw interval list + intervals = mergeIntervalLocations(intervals, mergingRule); + + return GenomeLocSortedSet.createSetFromList(parser,intervals); + } + + /** + * computes whether the test interval list is equivalent to master. To be equivalent, test must + * contain GenomeLocs covering every base in master, exactly once. Note that this algorithm + * assumes that master genomelocs are all discontiguous (i.e., we don't have locs like 1-3 and 4-6 but + * rather just 1-6). 
In order to use this algorithm with contiguous genomelocs first merge them. The algorithm + * doesn't assume that test has discontinuous genomelocs. + * + * Returns a null string if there are no differences, otherwise returns a string describing the difference + * (useful for UnitTests). Assumes both lists are sorted + * + * @param masterArg sorted master genome locs + * @param testArg sorted test genome locs + * @return null string if there are no difference, otherwise a string describing the difference + */ + public static String equateIntervals(List masterArg, List testArg) { + LinkedList master = new LinkedList(masterArg); + LinkedList test = new LinkedList(testArg); + + while ( ! master.isEmpty() ) { // there's still unchecked bases in master + final GenomeLoc masterHead = master.pop(); + final GenomeLoc testHead = test.pop(); + + if ( testHead.overlapsP(masterHead) ) { + // remove the parts of test that overlap master, and push the remaining + // parts onto master for further comparison. + for ( final GenomeLoc masterPart : Utils.reverse(masterHead.subtract(testHead)) ) { + master.push(masterPart); + } + } else { + // testHead is incompatible with masterHead, so we must have extra bases in testHead + // that aren't in master + return "Incompatible locs detected masterHead=" + masterHead + ", testHead=" + testHead; + } + } + + if ( test.isEmpty() ) // everything is equal + return null; // no differences + else + return "Remaining elements found in test: first=" + test.peek(); + } + + + /** + * Check if string argument was intented as a file + * Accepted file extensions: .bed .list, .picard, .interval_list, .intervals. + * @param str token to identify as a filename. + * @return true if the token looks like a filename, or false otherwise. 
+ */ + public static boolean isIntervalFile(String str) { + return isIntervalFile(str, true); + } + + /** + * Check if string argument was intented as a file + * Accepted file extensions: .bed .list, .picard, .interval_list, .intervals. + * @param str token to identify as a filename. + * @param checkExists if true throws an exception if the file doesn't exist. + * @return true if the token looks like a filename, or false otherwise. + */ + public static boolean isIntervalFile(String str, boolean checkExists) { + // should we define list of file extensions as a public array somewhere? + // is regex or endsiwth better? + File file = new File(str); + if (str.toUpperCase().endsWith(".BED") || str.toUpperCase().endsWith(".LIST") || + str.toUpperCase().endsWith(".PICARD") || str.toUpperCase().endsWith(".INTERVAL_LIST") + || str.toUpperCase().endsWith(".INTERVALS")) { + if (!checkExists) + return true; + else if (file.exists()) + return true; + else + throw new UserException.CouldNotReadInputFile(file, "The interval file does not exist."); + } + + if(file.exists()) + throw new UserException.CouldNotReadInputFile(file, String.format("The interval file %s does not have one of " + + "the supported extensions (.bed, .list, .picard, .interval_list, or .intervals). " + + "Please rename your file with the appropriate extension. If %s is NOT supposed to be a file, " + + "please move or rename the file at location %s", str, str, file.getAbsolutePath())); + + else return false; + } + + /** + * Returns a map of contig names with their sizes. + * @param reference The reference for the intervals. + * @return A map of contig names with their sizes. 
+ */ + public static Map getContigSizes(File reference) { + final ReferenceSequenceFile referenceSequenceFile = createReference(reference); + List locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSequenceFile.getSequenceDictionary()).toList(); + Map lengths = new LinkedHashMap(); + for (GenomeLoc loc: locs) + lengths.put(loc.getContig(), loc.size()); + return lengths; + } + + /** + * Splits an interval list into multiple files. + * @param fileHeader The sam file header. + * @param locs The genome locs to split. + * @param scatterParts The output interval lists to write to. + */ + public static void scatterContigIntervals(SAMFileHeader fileHeader, List locs, List scatterParts) { + + // Contract: must divide locs up so that each of scatterParts gets a sublist such that: + // (a) all locs concerning a particular contig go to the same part + // (b) locs are not split or combined, and remain in the same order (so scatterParts[0] + ... + scatterParts[n] == locs) + + // Locs are already sorted. + + long totalBases = 0; + for(GenomeLoc loc : locs) + totalBases += loc.size(); + + long idealBasesPerPart = totalBases / scatterParts.size(); + if(idealBasesPerPart == 0) + throw new UserException.BadInput(String.format("Genome region is too short (%d bases) to split into %d parts", totalBases, scatterParts.size())); + + // Find the indices in locs where we switch from one contig to the next. 
+ ArrayList contigStartLocs = new ArrayList(); + String prevContig = null; + + for(int i = 0; i < locs.size(); ++i) { + + GenomeLoc loc = locs.get(i); + if(prevContig == null || !loc.getContig().equals(prevContig)) + contigStartLocs.add(i); + prevContig = loc.getContig(); + + } + + if(contigStartLocs.size() < scatterParts.size()) + throw new UserException.BadInput(String.format("Input genome region has too few contigs (%d) to split into %d parts", contigStartLocs.size(), scatterParts.size())); + + long thisPartBases = 0; + int partIdx = 0; + IntervalList outList = new IntervalList(fileHeader); + + for(int i = 0; i < locs.size(); ++i) { + + GenomeLoc loc = locs.get(i); + thisPartBases += loc.getStop() - loc.getStart(); + + outList.add(toInterval(loc, i)); + + boolean partMustStop = false; + + if(partIdx < (scatterParts.size() - 1)) { + + // If there are n contigs and n parts remaining then we must split here, + // otherwise we will run out of contigs. + + int nextPart = partIdx + 1; + int nextPartMustStartBy = contigStartLocs.get(nextPart + (contigStartLocs.size() - scatterParts.size())); + if(i + 1 == nextPartMustStartBy) + partMustStop = true; + + } + else if(i == locs.size() - 1) { + + // We're done! Write the last scatter file. + partMustStop = true; + + } + + if(partMustStop || thisPartBases > idealBasesPerPart) { + + // Ideally we would split here. However, we must make sure to do so + // on a contig boundary. Test always passes with partMustStop == true + // since that indicates we're at a contig boundary. + + GenomeLoc nextLoc = null; + if((i + 1) < locs.size()) + nextLoc = locs.get(i+1); + + if(nextLoc == null || !nextLoc.getContig().equals(loc.getContig())) { + + // Write out this part: + outList.write(scatterParts.get(partIdx)); + + // Reset. If this part ran long, leave the excess in thisPartBases + // and the next will be a little shorter to compensate. 
+ outList = new IntervalList(fileHeader); + thisPartBases -= idealBasesPerPart; + ++partIdx; + + } + + } + + } + + } + + /** + * Splits an interval list into multiple sublists. + * @param locs The genome locs to split. + * @param splits The stop points for the genome locs returned by splitFixedIntervals. + * @return A list of lists of genome locs, split according to splits + */ + public static List> splitIntervalsToSubLists(List locs, List splits) { + int start = 0; + List> sublists = new ArrayList>(splits.size()); + for (Integer stop: splits) { + List curList = new ArrayList(); + for (int i = start; i < stop; i++) + curList.add(locs.get(i)); + start = stop; + sublists.add(curList); + } + + return sublists; + } + + + /** + * Splits an interval list into multiple files. + * @param fileHeader The sam file header. + * @param splits Pre-divided genome locs returned by splitFixedIntervals. + * @param scatterParts The output interval lists to write to. + */ + public static void scatterFixedIntervals(SAMFileHeader fileHeader, List> splits, List scatterParts) { + if (splits.size() != scatterParts.size()) + throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); + + int fileIndex = 0; + int locIndex = 1; + for (final List split : splits) { + IntervalList intervalList = new IntervalList(fileHeader); + for (final GenomeLoc loc : split) + intervalList.add(toInterval(loc, locIndex++)); + intervalList.write(scatterParts.get(fileIndex++)); + } + } + + /** + * Splits the genome locs up by size. + * @param locs Genome locs to split. + * @param numParts Number of parts to split the locs into. + * @return The stop points to split the genome locs. 
+ */ + public static List> splitFixedIntervals(List locs, int numParts) { + if (locs.size() < numParts) + throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); + final long locsSize = intervalSize(locs); + final List splitPoints = new ArrayList(); + addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); + Collections.sort(splitPoints); + splitPoints.add(locs.size()); + return splitIntervalsToSubLists(locs, splitPoints); + } + + @Requires({"locs != null", "numParts > 0"}) + @Ensures("result != null") + public static List> splitLocusIntervals(List locs, int numParts) { + // the ideal size of each split + final long bp = IntervalUtils.intervalSize(locs); + final long idealSplitSize = Math.max((long)Math.floor(bp / (1.0*numParts)), 1); + + // algorithm: + // split = () + // set size = 0 + // pop the head H off locs. + // If size + size(H) < splitSize: + // add H to split, continue + // If size + size(H) == splitSize: + // done with split, put in splits, restart + // if size + size(H) > splitSize: + // cut H into two pieces, first of which has splitSize - size bp + // push both pieces onto locs, continue + // The last split is special -- when you have only one split left, it gets all of the remaining locs + // to deal with rounding issues + final List> splits = new ArrayList>(numParts); + + LinkedList locsLinkedList = new LinkedList(locs); + while ( ! 
locsLinkedList.isEmpty() ) { + if ( splits.size() + 1 == numParts ) { + // the last one gets all of the remaining parts + splits.add(new ArrayList(locsLinkedList)); + locsLinkedList.clear(); + } else { + final SplitLocusRecursive one = splitLocusIntervals1(locsLinkedList, idealSplitSize); + splits.add(one.split); + locsLinkedList = one.remaining; + } + } + + return splits; + } + + @Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"}) + @Ensures({"result != null"}) + static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { + final List split = new ArrayList(); + long size = 0; + + while ( ! remaining.isEmpty() ) { + GenomeLoc head = remaining.pop(); + final long newSize = size + head.size(); + + if ( newSize == idealSplitSize ) { + split.add(head); + break; // we are done + } else if ( newSize > idealSplitSize ) { + final long remainingBp = idealSplitSize - size; + final long cutPoint = head.getStart() + remainingBp; + GenomeLoc[] parts = head.split((int)cutPoint); + remaining.push(parts[1]); + remaining.push(parts[0]); + // when we go around, head.size' = idealSplitSize - size + // so newSize' = splitSize + head.size' = size + (idealSplitSize - size) = idealSplitSize + } else { + split.add(head); + size = newSize; + } + } + + return new SplitLocusRecursive(split, remaining); + } + + /** + * Setup the intervals to be processed + */ + public static GenomeLocSortedSet parseIntervalBindings( + final ReferenceSequenceFile referenceSequenceFile, + final List> intervals, + final IntervalSetRule intervalSetRule, final IntervalMergingRule intervalMergingRule, final int intervalPadding, + final List> excludeIntervals) { + + Pair includeExcludePair = parseIntervalBindingsPair( + referenceSequenceFile, intervals, intervalSetRule, intervalMergingRule, intervalPadding, excludeIntervals); + + GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); + GenomeLocSortedSet excludeSortedSet = 
includeExcludePair.getSecond(); + + if (excludeSortedSet != null) { + return includeSortedSet.subtractRegions(excludeSortedSet); + } else { + return includeSortedSet; + } + } + + public static GenomeLocSortedSet parseIntervalArguments(final ReferenceSequenceFile referenceSequenceFile, IntervalArgumentCollection argCollection) { + GenomeLocSortedSet intervals = null; + + // return if no interval arguments at all + if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) + return intervals; + + // Note that the use of '-L all' is no longer supported. + + // if include argument isn't given, create new set of all possible intervals + + final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair( + referenceSequenceFile, + argCollection.intervals, + argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, + argCollection.excludeIntervals); + + final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); + final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); + + // if no exclude arguments, can return parseIntervalArguments directly + if ( excludeSortedSet == null ) + intervals = includeSortedSet; + + // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets + else { + intervals = includeSortedSet.subtractRegions(excludeSortedSet); + + // logging messages only printed when exclude (-XL) arguments are given + final long toPruneSize = includeSortedSet.coveredSize(); + final long toExcludeSize = excludeSortedSet.coveredSize(); + final long intervalSize = intervals.coveredSize(); + logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); + logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", + toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); + } + + logger.info(String.format("Processing %d 
bp from intervals", intervals.coveredSize())); + return intervals; + } + + public static Pair parseIntervalBindingsPair( + final ReferenceSequenceFile referenceSequenceFile, + final List> intervals, + final IntervalSetRule intervalSetRule, final IntervalMergingRule intervalMergingRule, final int intervalPadding, + final List> excludeIntervals) { + GenomeLocParser genomeLocParser = new GenomeLocParser(referenceSequenceFile); + + // if include argument isn't given, create new set of all possible intervals + GenomeLocSortedSet includeSortedSet = ((intervals == null || intervals.size() == 0) ? + GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSequenceFile.getSequenceDictionary()) : + loadIntervals(intervals, intervalSetRule, intervalMergingRule, intervalPadding, genomeLocParser)); + + GenomeLocSortedSet excludeSortedSet = null; + if (excludeIntervals != null && excludeIntervals.size() > 0) { + excludeSortedSet = loadIntervals(excludeIntervals, IntervalSetRule.UNION, intervalMergingRule, 0, genomeLocParser); + } + return new Pair(includeSortedSet, excludeSortedSet); + } + + public static GenomeLocSortedSet loadIntervals( + final List> intervalBindings, + final IntervalSetRule rule, final IntervalMergingRule intervalMergingRule, final int padding, + final GenomeLocParser genomeLocParser) { + List allIntervals = new ArrayList(); + for ( IntervalBinding intervalBinding : intervalBindings) { + @SuppressWarnings("unchecked") + List intervals = intervalBinding.getIntervals(genomeLocParser); + + if ( intervals.isEmpty() ) { + logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); + } + + if ( padding > 0 ) { + intervals = getIntervalsWithFlanks(genomeLocParser, intervals, padding); + } + + allIntervals = mergeListsBySetOperator(intervals, allIntervals, rule); + } + + return sortAndMergeIntervals(genomeLocParser, allIntervals, intervalMergingRule); + } + + private final static class SplitLocusRecursive { + 
final List split; + final LinkedList remaining; + + @Requires({"split != null", "remaining != null"}) + private SplitLocusRecursive(final List split, final LinkedList remaining) { + this.split = split; + this.remaining = remaining; + } + } + + public static List flattenSplitIntervals(List> splits) { + final List locs = new ArrayList(); + for ( final List split : splits ) + locs.addAll(split); + return locs; + } + + private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { + if (numParts < 2) + return; + int halfParts = (numParts + 1) / 2; + Pair splitPoint = getFixedSplit(locs, locsSize, startIndex, stopIndex, halfParts, numParts - halfParts); + int splitIndex = splitPoint.first; + long splitSize = splitPoint.second; + splitPoints.add(splitIndex); + addFixedSplit(splitPoints, locs, splitSize, startIndex, splitIndex, halfParts); + addFixedSplit(splitPoints, locs, locsSize - splitSize, splitIndex, stopIndex, numParts - halfParts); + } + + private static Pair getFixedSplit(List locs, long locsSize, int startIndex, int stopIndex, int minLocs, int maxLocs) { + int splitIndex = startIndex; + long splitSize = 0; + for (int i = 0; i < minLocs; i++) { + splitSize += locs.get(splitIndex).size(); + splitIndex++; + } + long halfSize = locsSize / 2; + while (splitIndex < (stopIndex - maxLocs) && splitSize < halfSize) { + splitSize += locs.get(splitIndex).size(); + splitIndex++; + } + return new Pair(splitIndex, splitSize); + } + + /** + * Converts a GenomeLoc to a picard interval. + * @param loc The GenomeLoc. + * @param locIndex The loc index for use in the file. + * @return The picard interval. 
+ */ + private static htsjdk.samtools.util.Interval toInterval(GenomeLoc loc, int locIndex) { + return new htsjdk.samtools.util.Interval(loc.getContig(), loc.getStart(), loc.getStop(), false, "interval_" + locIndex); + } + + /** + * merge a list of genome locs that may be overlapping, returning the list of unique genomic locations + * + * @param raw the unchecked genome loc list + * @param rule the merging rule we're using + * + * @return the list of merged locations + */ + public static List mergeIntervalLocations(final List raw, IntervalMergingRule rule) { + if (raw.size() <= 1) + return Collections.unmodifiableList(raw); + else { + ArrayList merged = new ArrayList(); + Iterator it = raw.iterator(); + GenomeLoc prev = it.next(); + while (it.hasNext()) { + GenomeLoc curr = it.next(); + if (prev.overlapsP(curr)) { + prev = prev.merge(curr); + } else if (prev.contiguousP(curr) && (rule == null || rule == IntervalMergingRule.ALL)) { + prev = prev.merge(curr); + } else { + merged.add(prev); + prev = curr; + } + } + merged.add(prev); + return Collections.unmodifiableList(merged); + } + } + + public static long intervalSize(final List locs) { + long size = 0; + for ( final GenomeLoc loc : locs ) + size += loc.size(); + return size; + } + + public static void writeFlankingIntervals(File reference, File inputIntervals, File flankingIntervals, int basePairs) { + final ReferenceSequenceFile referenceSequenceFile = createReference(reference); + GenomeLocParser parser = new GenomeLocParser(referenceSequenceFile); + List originalList = intervalFileToList(parser, inputIntervals.getAbsolutePath()); + + if (originalList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "File contains no intervals"); + + List flankingList = getFlankingIntervals(parser, originalList, basePairs); + + if (flankingList.isEmpty()) + throw new UserException.MalformedFile(inputIntervals, "Unable to produce any flanks for the intervals"); + + SAMFileHeader samFileHeader = new 
SAMFileHeader(); + samFileHeader.setSequenceDictionary(referenceSequenceFile.getSequenceDictionary()); + IntervalList intervalList = new IntervalList(samFileHeader); + int i = 0; + for (GenomeLoc loc: flankingList) + intervalList.add(toInterval(loc, ++i)); + intervalList.write(flankingIntervals); + } + + /** + * Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs. + * @param parser A genome loc parser for creating the new intervals + * @param locs Original genome locs + * @param basePairs Number of base pairs on each side of loc + * @return The list of intervals between the locs + */ + public static List getFlankingIntervals(final GenomeLocParser parser, final List locs, final int basePairs) { + List sorted = sortAndMergeIntervals(parser, locs, IntervalMergingRule.ALL).toList(); + + if (sorted.size() == 0) + return Collections.emptyList(); + + LinkedHashMap> locsByContig = splitByContig(sorted); + List expanded = new ArrayList(); + for (Map.Entry> contig: locsByContig.entrySet()) { + List contigLocs = contig.getValue(); + int contigLocsSize = contigLocs.size(); + + GenomeLoc startLoc, stopLoc; + + // Create loc at start of the list + startLoc = parser.createGenomeLocAtStart(contigLocs.get(0), basePairs); + if (startLoc != null) + expanded.add(startLoc); + + // Create locs between each loc[i] and loc[i+1] + for (int i = 0; i < contigLocsSize - 1; i++) { + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(i), basePairs); + startLoc = parser.createGenomeLocAtStart(contigLocs.get(i + 1), basePairs); + if (stopLoc.getStop() + 1 >= startLoc.getStart()) { + // NOTE: This is different than GenomeLoc.merge() + // merge() returns a loc which covers the entire range of stop and start, + // possibly returning positions inside loc(i) or loc(i+1) + // We want to make sure that the start of the stopLoc is used, and the stop of the startLoc + GenomeLoc merged = parser.createGenomeLoc( + stopLoc.getContig(), stopLoc.getStart(), 
startLoc.getStop()); + expanded.add(merged); + } else { + expanded.add(stopLoc); + expanded.add(startLoc); + } + } + + // Create loc at the end of the list + stopLoc = parser.createGenomeLocAtStop(contigLocs.get(contigLocsSize - 1), basePairs); + if (stopLoc != null) + expanded.add(stopLoc); + } + return expanded; + } + + /** + * Returns a list of intervals between the passed int locs. Does not extend UNMAPPED locs. + * @param parser A genome loc parser for creating the new intervals + * @param locs Original genome locs + * @param basePairs Number of base pairs on each side of loc + * @return The list of intervals between the locs + */ + public static List getIntervalsWithFlanks(final GenomeLocParser parser, final List locs, final int basePairs) { + + if (locs.size() == 0) + return Collections.emptyList(); + + final List expanded = new ArrayList(); + for ( final GenomeLoc loc : locs ) { + expanded.add(parser.createPaddedGenomeLoc(loc, basePairs)); + } + + return sortAndMergeIntervals(parser, expanded, IntervalMergingRule.ALL).toList(); + } + + private static ReferenceSequenceFile createReference(final File fastaFile) { + return CachingIndexedFastaSequenceFile.checkAndCreate(fastaFile); + } + + private static LinkedHashMap> splitByContig(List sorted) { + LinkedHashMap> splits = new LinkedHashMap>(); + GenomeLoc last = null; + List contigLocs = null; + for (GenomeLoc loc: sorted) { + if (GenomeLoc.isUnmapped(loc)) + continue; + if (last == null || !last.onSameContig(loc)) { + contigLocs = new ArrayList(); + splits.put(loc.getContig(), contigLocs); + } + contigLocs.add(loc); + last = loc; + } + return splits; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/io/FileExtension.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/io/FileExtension.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/io/FileExtension.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/io/FileExtension.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/io/HardThresholdingOutputStream.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/io/HardThresholdingOutputStream.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/io/HardThresholdingOutputStream.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/io/HardThresholdingOutputStream.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/io/IOUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/io/IOUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/io/IOUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/io/IOUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/io/Resource.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/io/Resource.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/io/Resource.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/io/Resource.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/iterators/GATKSAMIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/iterators/GATKSAMIterator.java new file mode 100644 index 000000000..b91eb2526 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/iterators/GATKSAMIterator.java @@ -0,0 +1,56 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights 
to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.iterators; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.CloseableIterator; +/** + * + * User: aaron + * Date: May 6, 2009 + * Time: 5:30:41 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + +/** + * @author aaron + * @version 1.0 + * @date May 6, 2009 + *

+ * Interface GATKSAMIterator + *

+ * This is the standard interface for all iterators in the GATK package that iterate over SAMRecords + */ +public interface GATKSAMIterator extends CloseableIterator, Iterable { +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/iterators/GATKSAMIteratorAdapter.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/iterators/GATKSAMIteratorAdapter.java new file mode 100644 index 000000000..7507e0897 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/iterators/GATKSAMIteratorAdapter.java @@ -0,0 +1,136 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.iterators; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.CloseableIterator; + +import java.util.Iterator; + +/** + * + * User: aaron + * Date: May 13, 2009 + * Time: 6:33:15 PM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date May 13, 2009 + *

+ * Class GATKSAMIteratorAdapter + *

+ * This class adapts other SAMRecord iterators to the GATKSAMIterator + */ +public class GATKSAMIteratorAdapter { + + public static GATKSAMIterator adapt(Iterator iter) { + return new PrivateStringSAMIterator(iter); + } + + public static GATKSAMIterator adapt(CloseableIterator iter) { + return new PrivateStringSAMCloseableIterator(iter); + } + +} + + +/** + * this class wraps iterators in a GATKSAMIterator, which means just adding the + * methods that implement the iterable<> interface and the close() method from CloseableIterator + */ +class PrivateStringSAMIterator implements GATKSAMIterator { + private Iterator iter = null; + + PrivateStringSAMIterator(Iterator iter) { + this.iter = iter; + } + + public void close() { + // do nothing, we can't close the iterator anyway. + } + + public boolean hasNext() { + return iter.hasNext(); + } + + public SAMRecord next() { + return iter.next(); + } + + public void remove() { + throw new UnsupportedOperationException("GATKSAMIterator's don't allow remove()ing"); + } + + public Iterator iterator() { + return iter; + } +} + + +/** + * this class wraps closeable iterators in a GATKSAMIterator, which means adding the + * methods that implement the iterable<> interface. 
+ */ +class PrivateStringSAMCloseableIterator implements GATKSAMIterator { + private CloseableIterator iter = null; + + PrivateStringSAMCloseableIterator(CloseableIterator iter) { + this.iter = iter; + } + + public void close() { + iter.close(); + } + + public boolean hasNext() { + return iter.hasNext(); + } + + public SAMRecord next() { + return iter.next(); + } + + public void remove() { + throw new UnsupportedOperationException("GATKSAMIterator's don't allow remove()ing"); + } + + public Iterator iterator() { + return iter; + } +} + diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/iterators/PushbackIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/iterators/PushbackIterator.java new file mode 100644 index 000000000..c4b867b55 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/iterators/PushbackIterator.java @@ -0,0 +1,82 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.iterators; + +import java.util.Iterator; + +public class PushbackIterator implements Iterator, Iterable { + Iterator underlyingIterator; + T pushedElement = null; + + public PushbackIterator(final Iterator underlyingIterator) { + this.underlyingIterator = underlyingIterator; + } + + public boolean hasNext() { + return pushedElement != null || underlyingIterator.hasNext(); + } + + public Iterator iterator() { + return this; + } + + /** + * Retrieves, but does not remove, the head of this iterator. + * @return T the next element in the iterator + */ + public T element() { + T x = next(); + pushback(x); + return x; + } + + /** + * @return the next element in the iteration. + */ + public T next() { + if (pushedElement != null) { + final T ret = pushedElement; + pushedElement = null; + return ret; + } else { + return underlyingIterator.next(); + } + } + + public void pushback(T elt) { + assert(pushedElement == null); + + pushedElement = elt; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + + public Iterator getUnderlyingIterator() { + return underlyingIterator; + } +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/clibrary/JNAUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/clibrary/JNAUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/clibrary/JNAUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/clibrary/JNAUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/clibrary/LibC.java 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/clibrary/LibC.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/clibrary/LibC.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/clibrary/LibC.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaJobInfo.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaJobInfo.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaJobInfo.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaJobInfo.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaJobTemplate.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaJobTemplate.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaJobTemplate.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaJobTemplate.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSession.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSession.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSession.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSession.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSessionFactory.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSessionFactory.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSessionFactory.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSessionFactory.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/LibDrmaa.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/LibDrmaa.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/LibDrmaa.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/LibDrmaa.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibBat.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibBat.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibBat.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibBat.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibLsf.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibLsf.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibLsf.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibLsf.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/AlignmentStateMachine.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/AlignmentStateMachine.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/AlignmentStateMachine.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/AlignmentStateMachine.java 
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSDownsamplingInfo.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSDownsamplingInfo.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSDownsamplingInfo.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSDownsamplingInfo.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java new file mode 100644 index 000000000..d4c22a6ad --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LIBSPerformance.java @@ -0,0 +1,193 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.locusiterator; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SAMReadGroupRecord; +import htsjdk.samtools.SAMRecordIterator; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.commandline.Argument; +import org.broadinstitute.gatk.utils.commandline.CommandLineProgram; +import org.broadinstitute.gatk.utils.commandline.Input; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecordIterator; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.gatk.utils.sam.GATKSamRecordFactory; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +/** + * Caliper microbenchmark of fragment pileup + */ +public class LIBSPerformance extends CommandLineProgram { + private static Logger logger = Logger.getLogger(LIBSPerformance.class); + + @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = true) + public File samFile = null; + + @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = true) + public File referenceFile = null; + + @Argument(fullName = "L", shortName = "L", doc = "Query location", required = false) + public String location = null; + + @Argument(fullName = "dt", shortName = "dt", doc = "Enable downsampling", required = false) + public boolean downsample = false; + + @Override + public int execute() throws IOException { + final IndexedFastaSequenceFile reference = new 
CachingIndexedFastaSequenceFile(referenceFile); + final GenomeLocParser genomeLocParser = new GenomeLocParser(reference); + + final SAMFileReader reader = new SAMFileReader(samFile); + reader.setSAMRecordFactory(new GATKSamRecordFactory()); + + SAMRecordIterator rawIterator; + if ( location == null ) + rawIterator = reader.iterator(); + else { + final GenomeLoc loc = genomeLocParser.parseGenomeLoc(location); + rawIterator = reader.query(loc.getContig(), loc.getStart(), loc.getStop(), false); + } + + final GATKSAMRecordIterator iterator = new GATKSAMRecordIterator(rawIterator); + + final Set samples = new HashSet(); + for ( final SAMReadGroupRecord rg : reader.getFileHeader().getReadGroups() ) + samples.add(rg.getSample()); + + final LIBSDownsamplingInfo ds = new LIBSDownsamplingInfo(downsample, 250); + + final LocusIteratorByState libs = + new LocusIteratorByState( + iterator, + ds, + true, + genomeLocParser, + samples, + false); + + final SimpleTimer timer = new SimpleTimer().start(); + int bp = 0; + double lastElapsed = 0; + while ( libs.hasNext() ) { + AlignmentContext context = libs.next(); + bp++; + if ( timer.getElapsedTime() - lastElapsed > 10 ) { + logger.info(bp + " iterations at " + context.getLocation()); + lastElapsed = timer.getElapsedTime(); + } + } + logger.info(String.format("runtime in seconds: %.2f", timer.getElapsedTime())); + + return 0; + } + +// private void syntheticTests() { +// final int readLength = 101; +// final int nReads = 10000; +// final int locus = 1; +// +// SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); +// final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); +// +// int nIterations = 0; +// for ( final String cigar : Arrays.asList("101M", "50M10I40M", "50M10D40M") ) { +// GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, readLength); +// read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); +// final byte[] quals = 
new byte[readLength]; +// for ( int i = 0; i < readLength; i++ ) +// quals[i] = (byte)(i % QualityUtils.MAX_SAM_QUAL_SCORE); +// read.setBaseQualities(quals); +// read.setCigarString(cigar); +// +// for ( int j = 0; j < nReads; j++ ) { +// for ( int i = 0; i < rep; i++ ) { +// switch ( op ) { +// case NEW_STATE: +// { +// final AlignmentStateMachine alignmentStateMachine = new AlignmentStateMachine(read); +// while ( alignmentStateMachine.stepForwardOnGenome() != null ) { +// nIterations++; +// } +// } +// break; +//// case OLD_STATE: +//// { +//// final SAMRecordAlignmentState alignmentStateMachine = new SAMRecordAlignmentState(read); +//// while ( alignmentStateMachine.stepForwardOnGenome() != null ) { +//// alignmentStateMachine.getRead(); +//// nIterations++; +//// } +//// } +//// break; +// case NEW_LIBS: +// { +// final List reads = Collections.nCopies(30, read); +// final org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState libs = +// new org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState( +// new LocusIteratorByStateBaseTest.FakeCloseableIterator(reads.iterator()), +// LocusIteratorByStateBaseTest.createTestReadProperties(), +// genomeLocParser, +// LocusIteratorByState.sampleListForSAMWithoutReadGroups()); +// +// while ( libs.hasNext() ) { +// AlignmentContext context = libs.next(); +// } +// } +// } +// } +// } +// } +// +// System.out.printf("iterations %d%n", nIterations); +// } + + /** + * Required main method implementation. + * @param argv Command-line argument text. + * @throws Exception on error. 
+ */ + public static void main(String[] argv) throws Exception { + int returnCode = 0; + try { + LIBSPerformance instance = new LIBSPerformance(); + start(instance, argv); + returnCode = 0; + } catch(Exception ex) { + returnCode = 1; + ex.printStackTrace(); + throw ex; + } finally { + System.exit(returnCode); + } + } + +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIterator.java new file mode 100644 index 000000000..ecbaaf670 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIterator.java @@ -0,0 +1,62 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.locusiterator; + +import htsjdk.samtools.util.CloseableIterator; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; + +import java.util.Iterator; + +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + */ +public abstract class LocusIterator implements Iterable, CloseableIterator { + public Iterator iterator() { + return this; + } + + public void close() { + //this.it.close(); + } + + public abstract boolean hasNext(); + public abstract AlignmentContext next(); + + /** + * Get, if possible, the underlying LocusIteratorByState from this LocusIterator. + * + * @throws UnsupportedOperationException if we don't support this operation + * + * @return a non-null locus iterator by state + */ + public LocusIteratorByState getLIBS() { + throw new UnsupportedOperationException("This locus iterator does not support getting the underlying LocusIteratorByState"); + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByState.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByState.java new file mode 100644 index 000000000..4857ed595 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByState.java @@ -0,0 +1,457 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following 
+* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.locusiterator; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.CloseableIterator; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecordIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.utils.pileup.PileupElement; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + * + * Produces AlignmentContext objects, that contain ReadBackedPileups of PileupElements. This + * class has its core job of converting an iterator of ordered SAMRecords into those + * RBPs. 
+ * + * There are a few constraints on required and ensured by LIBS: + * + * -- Requires the Iterator to returns reads in coordinate sorted order, consistent with the ordering + * defined by the SAM file format. That that for performance reasons this constraint isn't actually enforced. + * The behavior of LIBS is undefined in the case where the reads are badly ordered. + * -- The reads in the ReadBackedPileup are themselves in the order of appearance of the reads from the iterator. + * That is, the pileup is ordered in a way consistent with the SAM coordinate ordering + * -- Only aligned reads with at least one on-genomic cigar operator are passed on in the pileups. That is, + * unmapped reads or reads that are all insertions (10I) or soft clipped (10S) are not passed on. + * -- LIBS can perform per-sample downsampling of a variety of kinds. + * -- Because of downsampling there's no guarantee that: + * -- A read that could be aligned to a position will actually occur in the pileup (downsampled away) + * -- A read that appears in a previous pileup that could align to a future position will actually occur + * in that pileup. That is, a read might show up at position i but be downsampled away in the pileup at j + * -- LIBS can optionally capture all of the reads that come off the iterator, before any leveling downsampling + * occurs, if requested. 
This allows users of LIBS to see both a ReadBackedPileup view of the data as well as + * a stream of unique, sorted reads + */ +public final class LocusIteratorByState extends LocusIterator { + /** Indicates that we shouldn't do any downsampling */ + public final static LIBSDownsamplingInfo NO_DOWNSAMPLING = new LIBSDownsamplingInfo(false, -1); + + /** + * our log, which we want to capture anything from this class + */ + private final static Logger logger = Logger.getLogger(LocusIteratorByState.class); + + // ----------------------------------------------------------------------------------------------------------------- + // + // member fields + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Used to create new GenomeLocs as needed + */ + private final GenomeLocParser genomeLocParser; + + /** + * A complete list of all samples that may come out of the reads. Must be + * comprehensive. + */ + private final ArrayList samples; + + /** + * The system that maps incoming reads from the iterator to their pileup states + */ + private final ReadStateManager readStates; + + /** + * Should we include reads in the pileup which are aligned with a deletion operator to the reference? + */ + private final boolean includeReadsWithDeletionAtLoci; + + /** + * The next alignment context. A non-null value means that a + * context is waiting from hasNext() for sending off to the next next() call. 
A null + * value means that either hasNext() has not been called at all or that + * the underlying iterator is exhausted + */ + private AlignmentContext nextAlignmentContext; + + // ----------------------------------------------------------------------------------------------------------------- + // + // constructors and other basic operations + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Create a new LocusIteratorByState + * + * @param samIterator the iterator of reads to process into pileups. Reads must be ordered + * according to standard coordinate-sorted BAM conventions + * @param downsamplingMethod information about how to downsample the reads + * @param includeReadsWithDeletionAtLoci Include reads with deletion at loci + * @param keepUniqueReadListInLIBS Keep unique read list in LIBS + * @param genomeLocParser used to create genome locs + * @param samples a complete list of samples present in the read groups for the reads coming from samIterator. + * This is generally just the set of read group sample fields in the SAMFileHeader. This + * list of samples may contain a null element, and all reads without read groups will + * be mapped to this null sample + */ + public LocusIteratorByState(final Iterator samIterator, + final DownsamplingMethod downsamplingMethod, + final boolean includeReadsWithDeletionAtLoci, + final boolean keepUniqueReadListInLIBS, + final GenomeLocParser genomeLocParser, + final Collection samples) { + this(samIterator, + toDownsamplingInfo(downsamplingMethod), + includeReadsWithDeletionAtLoci, + genomeLocParser, + samples, + keepUniqueReadListInLIBS); + } + + /** + * Create a new LocusIteratorByState based on a SAMFileReader using reads in an iterator it + * + * Simple constructor that uses the samples in the reader, doesn't do any downsampling, + * and makes a new GenomeLocParser using the reader. 
This constructor will be slow(ish) + * if you continually invoke this constructor, but it's easy to make. + * + * @param reader a non-null reader + * @param it an iterator from reader that has the reads we want to use to create ReadBackPileups + */ + public LocusIteratorByState(final SAMFileReader reader, final CloseableIterator it) { + this(new GATKSAMRecordIterator(it), + new LIBSDownsamplingInfo(false, 0), + true, + new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()), + ReadUtils.getSAMFileSamples(reader.getFileHeader()), + false); + } + + /** + * Create a new LocusIteratorByState + * + * @param samIterator the iterator of reads to process into pileups. Reads must be ordered + * according to standard coordinate-sorted BAM conventions + * @param downsamplingInfo meta-information about how to downsampling the reads + * @param genomeLocParser used to create genome locs + * @param samples a complete list of samples present in the read groups for the reads coming from samIterator. + * This is generally just the set of read group sample fields in the SAMFileHeader. 
This + * list of samples may contain a null element, and all reads without read groups will + * be mapped to this null sample + * @param maintainUniqueReadsList if true, we will keep the unique reads from off the samIterator and make them + * available via the transferReadsFromAllPreviousPileups interface + */ + public LocusIteratorByState(final Iterator samIterator, + final LIBSDownsamplingInfo downsamplingInfo, + final boolean includeReadsWithDeletionAtLoci, + final GenomeLocParser genomeLocParser, + final Collection samples, + final boolean maintainUniqueReadsList) { + if ( samIterator == null ) throw new IllegalArgumentException("samIterator cannot be null"); + if ( downsamplingInfo == null ) throw new IllegalArgumentException("downsamplingInfo cannot be null"); + if ( genomeLocParser == null ) throw new IllegalArgumentException("genomeLocParser cannot be null"); + if ( samples == null ) throw new IllegalArgumentException("Samples cannot be null"); + + // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when + // there's no read data. So we need to throw this error only when samIterator.hasNext() is true + if (samples.isEmpty() && samIterator.hasNext()) { + throw new IllegalArgumentException("samples list must not be empty"); + } + + this.genomeLocParser = genomeLocParser; + this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci; + this.samples = new ArrayList(samples); + this.readStates = new ReadStateManager(samIterator, this.samples, downsamplingInfo, maintainUniqueReadsList); + } + + @Override + public Iterator iterator() { + return this; + } + + /** + * Get the current location (i.e., the bp of the center of the pileup) of the pileup, or null if not anywhere yet + * + * Assumes that read states is updated to reflect the current pileup position, but not advanced to the + * next location. 
+ * + * @return the location of the current pileup, or null if we're after all reads + */ + private GenomeLoc getLocation() { + return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // next() routine and associated collection operations + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Is there another pileup available? + * @return + */ + @Override + public boolean hasNext() { + lazyLoadNextAlignmentContext(); + return nextAlignmentContext != null; + } + + /** + * Get the next AlignmentContext available from the reads. + * + * @return a non-null AlignmentContext of the pileup after to the next genomic position covered by + * at least one read. + */ + @Override + public AlignmentContext next() { + lazyLoadNextAlignmentContext(); + if (!hasNext()) + throw new NoSuchElementException("LocusIteratorByState: out of elements."); + AlignmentContext currentAlignmentContext = nextAlignmentContext; + nextAlignmentContext = null; + return currentAlignmentContext; + } + + /** + * Move this LIBS until we are over position + * + * Will return null if cannot reach position (because we run out of data in the locus) + * + * @param position the start position of the AlignmentContext we want back + * @param stopAtFirstNonEmptySiteAfterPosition if true, we will stop as soon as we find a context with data with + * position >= position, otherwise we will return a null value + * and consume the data for the next position. 
This means that without + * specifying this value the LIBS will be in an indeterminate state + * after calling this function, and should be reconstructed from scratch + * for subsequent use + * @return a AlignmentContext at position, or null if this isn't possible + */ + public AlignmentContext advanceToLocus(final int position, final boolean stopAtFirstNonEmptySiteAfterPosition) { + while ( hasNext() ) { + final AlignmentContext context = next(); + + if ( context == null ) + // we ran out of data + return null; + + if ( context.getPosition() == position ) + return context; + + if ( context.getPosition() > position) + return stopAtFirstNonEmptySiteAfterPosition ? context : null; + } + + return null; + } + + /** + * Creates the next alignment context from the given state. Note that this is implemented as a + * lazy load method. nextAlignmentContext MUST BE null in order for this method to advance to the + * next entry. + */ + private void lazyLoadNextAlignmentContext() { + while (nextAlignmentContext == null && readStates.hasNext()) { + readStates.collectPendingReads(); + + final GenomeLoc location = getLocation(); + final Map fullPileup = new HashMap(); + + for (final Map.Entry sampleStatePair : readStates ) { + final String sample = sampleStatePair.getKey(); + final PerSampleReadStateManager readState = sampleStatePair.getValue(); + final Iterator iterator = readState.iterator(); + final List pile = new ArrayList(readState.size()); + + while (iterator.hasNext()) { + // state object with the read/offset information + final AlignmentStateMachine state = iterator.next(); + final GATKSAMRecord read = state.getRead(); + final CigarOperator op = state.getCigarOperator(); + + if (op == CigarOperator.N) // N's are never added to any pileup + continue; + + if (!dontIncludeReadInPileup(read, location.getStart())) { + if ( ! includeReadsWithDeletionAtLoci && op == CigarOperator.D ) { + continue; + } + + pile.add(state.makePileupElement()); + } + } + + if (! 
pile.isEmpty() ) // if this pileup added at least one base, add it to the full pileup + fullPileup.put(sample, new ReadBackedPileupImpl(location, pile)); + } + + readStates.updateReadStates(); // critical - must be called after we get the current state offsets and location + if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done + nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), false); + } + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // getting the list of reads + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Transfer current list of all unique reads that have ever been used in any pileup, clearing old list + * + * This list is guaranteed to only contain unique reads, even across calls to the this function. It is + * literally the unique set of reads ever seen. + * + * The list occurs in the same order as they are encountered in the underlying iterator. + * + * Takes the maintained list of submitted reads, and transfers it to the caller of this + * function. The old list of set to a new, cleanly allocated list so the caller officially + * owns the list returned by this call. This is the only way to clear the tracking + * of submitted reads, if enabled. + * + * The purpose of this function is allow users of LIBS to keep track of all of the reads pulled off the + * underlying GATKSAMRecord iterator and that appeared at any point in the list of SAMRecordAlignmentState for + * any reads. This function is intended to allow users to efficiently reconstruct the unique set of reads + * used across all pileups. This is necessary for LIBS to handle because attempting to do + * so from the pileups coming out of LIBS is extremely expensive. 
+ * + * This functionality is only available if LIBS was created with the argument to track the reads + * + * @throws UnsupportedOperationException if called when keepingSubmittedReads is false + * + * @return the current list + */ + @Ensures("result != null") + public List transferReadsFromAllPreviousPileups() { + return readStates.transferSubmittedReads(); + } + + /** + * Get the underlying list of tracked reads. For testing only + * @return a non-null list + */ + @Ensures("result != null") + protected List getReadsFromAllPreviousPileups() { + return readStates.getSubmittedReads(); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // utility functions + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Should this read be excluded from the pileup? + * + * Generic place to put per-base filters appropriate to LocusIteratorByState + * + * @param rec the read to potentially exclude + * @param pos the genomic position of the current alignment + * @return true if the read should be excluded from the pileup, false otherwise + */ + @Requires({"rec != null", "pos > 0"}) + private boolean dontIncludeReadInPileup(final GATKSAMRecord rec, final long pos) { + return ReadUtils.isBaseInsideAdaptor(rec, pos); + } + + /** + * Create a LIBSDownsamplingInfo object from the requested info in DownsamplingMethod + * + * LIBS will invoke the Reservoir and Leveling downsamplers on the read stream if we're + * downsampling to coverage by sample. SAMDataSource will have refrained from applying + * any downsamplers to the read stream in this case, in the expectation that LIBS will + * manage the downsampling. The reason for this is twofold: performance (don't have to + * split/re-assemble the read stream in SAMDataSource), and to enable partial downsampling + * of reads (eg., using half of a read, and throwing the rest away). 
+ * + * @param downsamplingMethod downsampling information about what should be done to the reads + * @return a LIBS specific info holder about downsampling only + */ + @Requires("downsamplingMethod != null") + @Ensures("result != null") + private static LIBSDownsamplingInfo toDownsamplingInfo(final DownsamplingMethod downsamplingMethod) { + final boolean performDownsampling = downsamplingMethod != null && + downsamplingMethod.type == DownsampleType.BY_SAMPLE && + downsamplingMethod.toCoverage != null; + final int coverage = performDownsampling ? downsamplingMethod.toCoverage : 0; + + return new LIBSDownsamplingInfo(performDownsampling, coverage); + } + + /** + * Create a pileup element for read at offset + * + * offset must correspond to a valid read offset given the read's cigar, or an IllegalStateException will be throw + * + * @param read a read + * @param offset the offset into the bases we'd like to use in the pileup + * @return a valid PileupElement with read and at offset + */ + @Ensures("result != null") + public static PileupElement createPileupForReadAndOffset(final GATKSAMRecord read, final int offset) { + if ( read == null ) throw new IllegalArgumentException("read cannot be null"); + if ( offset < 0 || offset >= read.getReadLength() ) throw new IllegalArgumentException("Invalid offset " + offset + " outside of bounds 0 and " + read.getReadLength()); + + final AlignmentStateMachine stateMachine = new AlignmentStateMachine(read); + + while ( stateMachine.stepForwardOnGenome() != null ) { + if ( stateMachine.getReadOffset() == offset ) + return stateMachine.makePileupElement(); + } + + throw new IllegalStateException("Tried to create a pileup for read " + read + " with offset " + offset + + " but we never saw such an offset in the alignment state machine"); + } + + /** + * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list + * for the system. 
+ */ + public static List sampleListForSAMWithoutReadGroups() { + List samples = new ArrayList(); + samples.add(null); + return samples; + } +} \ No newline at end of file diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/PerSampleReadStateManager.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/PerSampleReadStateManager.java new file mode 100644 index 000000000..8e16c1771 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/PerSampleReadStateManager.java @@ -0,0 +1,261 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.locusiterator; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import htsjdk.samtools.CigarOperator; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.downsampling.Downsampler; +import org.broadinstitute.gatk.utils.downsampling.LevelingDownsampler; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * ReadStateManager for a single sample + * + * User: depristo + * Date: 1/13/13 + * Time: 12:28 PM + */ +@Invariant({ + "readStartsAreWellOrdered()", + "! isDownsampling() || downsamplingTarget > 0", + "nSites >= 0", + "nSitesNeedingDownsampling >= 0", + "nSitesNeedingDownsampling <= nSites" +}) +final class PerSampleReadStateManager implements Iterable { + private final static Logger logger = Logger.getLogger(ReadStateManager.class); + private final static boolean CAPTURE_DOWNSAMPLING_STATS = false; + + /** + * A list (potentially empty) of alignment state machines. + * + * The state machines must be ordered by the alignment start of their underlying reads, with the + * lowest alignment starts on the left, and the largest on the right + */ + private LinkedList readStatesByAlignmentStart = new LinkedList(); + + private final Downsampler> levelingDownsampler; + private final int downsamplingTarget; + + /** + * The number of sites where downsampling has been invoked + */ + private int nSitesNeedingDownsampling = 0; + + /** + * The number of sites we've visited + */ + private int nSites = 0; + + /** + * Create a new PerSampleReadStateManager with downsampling parameters as requested by LIBSDownsamplingInfo + * @param LIBSDownsamplingInfo the downsampling params we want to use + */ + public PerSampleReadStateManager(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { + this.downsamplingTarget = LIBSDownsamplingInfo.isPerformDownsampling() ? 
LIBSDownsamplingInfo.getToCoverage() : -1; + this.levelingDownsampler = LIBSDownsamplingInfo.isPerformDownsampling() + ? new LevelingDownsampler, AlignmentStateMachine>(LIBSDownsamplingInfo.getToCoverage()) + : null; + } + + /** + * Group the underlying readStatesByAlignmentStart into a list of list of alignment state machines, + * where each list contains machines with a unique genome site. The outer list is ordered + * by alignment start. + * + * For example, if the flat list has alignment starts [10, 10, 11, 12, 12, 13] then + * the resulting grouping will be [[10, 10], [11], [12, 12], [13]]. + * + * @return a non-null list of lists + */ + @Ensures("result != null") + private List> groupByAlignmentStart() { + final LinkedList> grouped = new LinkedList>(); + + AlignmentStateMachine last = null; + for ( final AlignmentStateMachine stateMachine : readStatesByAlignmentStart ) { + if ( last == null || stateMachine.getGenomeOffset() != last.getGenomeOffset() ) { + // we've advanced to a place where the state machine has a different state, + // so start a new list + grouped.add(new LinkedList()); + last = stateMachine; + } + grouped.getLast().add(stateMachine); + } + + return grouped; + } + + /** + * Flattens the grouped list of list of alignment state machines into a single list in order + * @return a non-null list contains the state machines + */ + @Ensures("result != null") + private LinkedList flattenByAlignmentStart(final List> grouped) { + final LinkedList flat = new LinkedList(); + for ( final List l : grouped ) + flat.addAll(l); + return flat; + } + + /** + * Test that the reads are ordered by their alignment starts + * @return true if well ordered, false otherwise + */ + private boolean readStartsAreWellOrdered() { + int lastStart = -1; + for ( final AlignmentStateMachine machine : readStatesByAlignmentStart ) { + if ( lastStart > machine.getRead().getAlignmentStart() ) + return false; + lastStart = machine.getRead().getAlignmentStart(); + } + return true; + } 
+ + /** + * Assumes it can just keep the states linked lists without making a copy + * @param states the new states to add to this manager + * @return The change in the number of states, after including states and potentially downsampling. Note + * that this return result might be negative, if downsampling is enabled, as we might drop + * more sites than have been added by the downsampler + */ + @Requires("states != null") + public int addStatesAtNextAlignmentStart(final LinkedList states) { + if ( states.isEmpty() ) { + return 0; + } + + readStatesByAlignmentStart.addAll(states); + int nStatesAdded = states.size(); + + if ( isDownsampling() && readStatesByAlignmentStart.size() > downsamplingTarget ) { + // only go into the downsampling branch if we are downsampling and the coverage > the target + captureDownsamplingStats(); + levelingDownsampler.submit(groupByAlignmentStart()); + levelingDownsampler.signalEndOfInput(); + + nStatesAdded -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = flattenByAlignmentStart(levelingDownsampler.consumeFinalizedItems()); + levelingDownsampler.resetStats(); + } + + return nStatesAdded; + } + + /** + * Is downsampling enabled for this manager? + * @return true if we are downsampling, false otherwise + */ + private boolean isDownsampling() { + return levelingDownsampler != null; + } + + /** + * Get the leftmost alignment state machine, or null if the read states is empty + * @return a potentially null AlignmentStateMachine + */ + public AlignmentStateMachine getFirst() { + return isEmpty() ? 
null : readStatesByAlignmentStart.getFirst(); + } + + /** + * Capture some statistics about the behavior of the downsampling, but only if CAPTURE_DOWNSAMPLING_STATS is true + */ + @Requires("isDownsampling()") + private void captureDownsamplingStats() { + if ( CAPTURE_DOWNSAMPLING_STATS ) { + nSites++; + final int loc = getFirst().getGenomePosition(); + String message = "Pass through"; + final boolean downsampling = size() > downsamplingTarget; + if ( downsampling ) { + nSitesNeedingDownsampling++; + message = "Downsampling"; + } + + if ( downsampling || nSites % 10000 == 0 ) + logger.info(String.format("%20s at %s: coverage=%d, max=%d, fraction of downsampled sites=%.2e", + message, loc, size(), downsamplingTarget, (1.0 * nSitesNeedingDownsampling / nSites))); + } + } + + /** + * Is there at least one alignment for this sample in this manager? + * @return true if there's at least one alignment, false otherwise + */ + public boolean isEmpty() { + return readStatesByAlignmentStart.isEmpty(); + } + + /** + * Get the number of read states currently in this manager + * @return the number of read states + */ + @Ensures("result >= 0") + public int size() { + return readStatesByAlignmentStart.size(); + } + + /** + * Advances all read states forward by one element, removing states that are + * no long aligned to the current position. + * @return the number of states we're removed after advancing + */ + public int updateReadStates() { + int nRemoved = 0; + final Iterator it = iterator(); + while (it.hasNext()) { + final AlignmentStateMachine state = it.next(); + final CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. 
+ it.remove(); // we've stepped off the end of the object + nRemoved++; + } + } + + return nRemoved; + } + + /** + * Iterate over the AlignmentStateMachine in this manager in alignment start order. + * @return a valid iterator + */ + @Ensures("result != null") + public Iterator iterator() { + return readStatesByAlignmentStart.iterator(); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/ReadStateManager.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/ReadStateManager.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/locusiterator/ReadStateManager.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/ReadStateManager.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/SamplePartitioner.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/SamplePartitioner.java new file mode 100644 index 000000000..46a88588f --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/locusiterator/SamplePartitioner.java @@ -0,0 +1,172 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.locusiterator; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.downsampling.Downsampler; +import org.broadinstitute.gatk.utils.downsampling.PassThroughDownsampler; +import org.broadinstitute.gatk.utils.downsampling.ReservoirDownsampler; + +import java.util.*; + +/** + * Divides reads by sample and (if requested) does a preliminary downsampling pass + * with a ReservoirDownsampler. + * + * Note: stores reads by sample ID string, not by sample object + */ +class SamplePartitioner { + /** + * Map from sample name (as a string) to a downsampler of reads for that sample + */ + final private Map> readsBySample; + + /** + * Are we in a state where we're done submitting reads and have semi-finalized the + * underlying per sample downsampler? + */ + boolean doneSubmittingReads = false; + + /** + * Create a new SamplePartitioner capable of splitting reads up into buckets of reads for + * each sample in samples, and perform a preliminary downsampling of these reads + * (separately for each sample) if downsampling is requested in LIBSDownsamplingInfo + * + * Note that samples must be comprehensive, in that all reads every submitted to this + * partitioner must come from one of the samples provided here. If not, submitRead + * will throw an exception. 
Duplicates in the list of samples will be ignored + * + * @param LIBSDownsamplingInfo do we want to downsample, and if so to what coverage? + * @param samples the complete list of samples we're going to partition reads into. Can be + * empty, but in that case this code cannot function properly if you + * attempt to add data to it. + */ + @Ensures({ + "readsBySample != null", + "readsBySample.size() == new HashSet(samples).size()" + }) + public SamplePartitioner(final LIBSDownsamplingInfo LIBSDownsamplingInfo, final List samples) { + if ( LIBSDownsamplingInfo == null ) throw new IllegalArgumentException("LIBSDownsamplingInfo cannot be null"); + if ( samples == null ) throw new IllegalArgumentException("samples must be a non-null list"); + + readsBySample = new LinkedHashMap>(samples.size()); + for ( final String sample : samples ) { + readsBySample.put(sample, createDownsampler(LIBSDownsamplingInfo)); + } + } + + /** + * Create a new, ready to use downsampler based on the parameters in LIBSDownsamplingInfo + * @param LIBSDownsamplingInfo the parameters to use in creating the downsampler + * @return a downsampler appropriate for LIBSDownsamplingInfo. If no downsampling is requested, + * uses the PassThroughDownsampler, which does nothing at all. + */ + @Requires("LIBSDownsamplingInfo != null") + @Ensures("result != null") + private Downsampler createDownsampler(final LIBSDownsamplingInfo LIBSDownsamplingInfo) { + return LIBSDownsamplingInfo.isPerformDownsampling() + ? new ReservoirDownsampler(LIBSDownsamplingInfo.getToCoverage(), true) + : new PassThroughDownsampler(); + } + + /** + * Offer this read to the partitioner, putting it into the bucket of reads for the sample + * of read (obtained via the read's read group). 
+ * + * If the read group is missing, uses the special "null" read group + * + * @throws IllegalStateException if the sample of read wasn't present in the original + * set of samples provided to this SamplePartitioner at construction + * + * @param read the read to add to the sample's list of reads + */ + @Requires("read != null") + @Ensures("doneSubmittingReads == false") + public void submitRead(final T read) { + final String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + final Downsampler downsampler = readsBySample.get(sampleName); + if ( downsampler == null ) + throw new IllegalStateException("Offered read with sample name " + sampleName + " to SamplePartitioner " + + "but this sample wasn't provided as one of possible samples at construction"); + + downsampler.submit(read); + doneSubmittingReads = false; + } + + /** + * Tell this partitioner that all reads in this cycle have been submitted, so that we + * can finalize whatever downsampling is required by each sample. + * + * Note that we *must* call this function before getReadsForSample, or else that + * function will exception out. + */ + @Ensures("doneSubmittingReads == true") + public void doneSubmittingReads() { + for ( final Downsampler downsampler : readsBySample.values() ) { + downsampler.signalEndOfInput(); + } + doneSubmittingReads = true; + } + + /** + * Get the final collection of reads for this sample for this cycle + * + * The cycle is defined as all of the reads that occur between + * the first call to submitRead until doneSubmittingReads is called. At that + * point additional downsampling may occur (depending on construction arguments) + * and that set of reads is returned here. + * + * Note that this function can only be called once per cycle, as underlying + * collection of reads is cleared. 
+ * + * @param sampleName the sample we want reads for, must be present in the original samples + * @return a non-null collection of reads for sample in this cycle + */ + @Ensures("result != null") + public Collection getReadsForSample(final String sampleName) { + if ( ! doneSubmittingReads ) throw new IllegalStateException("getReadsForSample called before doneSubmittingReads was called"); + + final Downsampler downsampler = readsBySample.get(sampleName); + if ( downsampler == null ) throw new NoSuchElementException("Sample name not found"); + + return downsampler.consumeFinalizedItems(); + } + + /** + * Resets this SamplePartitioner, indicating that we're starting a new + * cycle of adding reads to each underlying downsampler. + */ + @Ensures("doneSubmittingReads == false") + public void reset() { + for ( final Downsampler downsampler : readsBySample.values() ) { + downsampler.clearItems(); + downsampler.resetStats(); + } + doneSubmittingReads = false; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/EOFMarkedValue.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/EOFMarkedValue.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/EOFMarkedValue.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/EOFMarkedValue.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/InputProducer.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/InputProducer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/InputProducer.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/InputProducer.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResult.java 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResult.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResult.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResult.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResultsQueue.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResultsQueue.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResultsQueue.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResultsQueue.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSMapFunction.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSMapFunction.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSMapFunction.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSMapFunction.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSProgressFunction.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSProgressFunction.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSProgressFunction.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSProgressFunction.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSReduceFunction.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSReduceFunction.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSReduceFunction.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NSReduceFunction.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoScheduler.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoScheduler.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoScheduler.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoScheduler.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/Reducer.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/Reducer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/Reducer.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/nanoScheduler/Reducer.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/BatchPairHMM.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/BatchPairHMM.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/BatchPairHMM.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/BatchPairHMM.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/Log10PairHMM.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/Log10PairHMM.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/Log10PairHMM.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/Log10PairHMM.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/N2MemoryPairHMM.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/N2MemoryPairHMM.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/N2MemoryPairHMM.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/N2MemoryPairHMM.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java new file mode 100644 index 000000000..b84afcfdc --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMM.java @@ -0,0 +1,359 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.pairhmm; + +import com.google.java.contract.Requires; +import htsjdk.variant.variantcontext.Allele; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.genotyper.ReadLikelihoods; +import org.broadinstitute.gatk.utils.haplotype.Haplotype; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Map; +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * + * User: rpoplin + * Date: 10/16/12 + */ +public abstract class PairHMM { + protected final static Logger logger = Logger.getLogger(PairHMM.class); + + protected boolean constantsAreInitialized = false; + + protected byte[] previousHaplotypeBases; + protected int hapStartIndex; + + public static final byte BASE_QUALITY_SCORE_THRESHOLD = (byte) 18; // Base quals less than this value are squashed down to min possible qual + + public enum HMM_IMPLEMENTATION { + /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ + EXACT, + /* PairHMM as implemented for the UnifiedGenotyper. 
Uses log10 sum functions accurate to only 1E-4 */ + ORIGINAL, + /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ + LOGLESS_CACHING, + /* Optimized AVX implementation of LOGLESS_CACHING called through JNI */ + VECTOR_LOGLESS_CACHING, + /* Debugging for vector implementation of LOGLESS_CACHING */ + DEBUG_VECTOR_LOGLESS_CACHING, + /* Logless caching PairHMM that stores computations in 1D arrays instead of matrices, and which proceeds diagonally over the (read x haplotype) intersection matrix */ + ARRAY_LOGLESS + } + + protected int maxHaplotypeLength, maxReadLength; + protected int paddedMaxReadLength, paddedMaxHaplotypeLength; + protected int paddedReadLength, paddedHaplotypeLength; + protected boolean initialized = false; + + // only used for debugging purposes + protected boolean doNotUseTristateCorrection = false; + protected void doNotUseTristateCorrection() { doNotUseTristateCorrection = true; } + + //debug array + protected double[] mLikelihoodArray; + + //profiling information + protected static Boolean doProfiling = true; + protected static long pairHMMComputeTime = 0; + protected long threadLocalPairHMMComputeTimeDiff = 0; + protected long startTime = 0; + + /** + * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths + * + * Note: Do not worry about padding, just provide the true max length of the read and haplotype. The HMM will take care of the padding. 
+ * + * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM + * @param readMaxLength the max length of reads we want to use with this PairHMM + */ + public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { + if ( readMaxLength <= 0 ) throw new IllegalArgumentException("READ_MAX_LENGTH must be > 0 but got " + readMaxLength); + if ( haplotypeMaxLength <= 0 ) throw new IllegalArgumentException("HAPLOTYPE_MAX_LENGTH must be > 0 but got " + haplotypeMaxLength); + + maxHaplotypeLength = haplotypeMaxLength; + maxReadLength = readMaxLength; + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + paddedMaxReadLength = readMaxLength + 1; + paddedMaxHaplotypeLength = haplotypeMaxLength + 1; + + previousHaplotypeBases = null; + + constantsAreInitialized = false; + initialized = true; + } + + /** + * Called at the end of PairHMM for a region - mostly used by the JNI implementations + */ + public void finalizeRegion() + { + ; + } + + /** + * Initialize this PairHMM, making it suitable to run against a read and haplotype with given lengths + * This function is used by the JNI implementations to transfer all data once to the native code + * @param haplotypes the list of haplotypes + * @param perSampleReadList map from sample name to list of reads + * @param haplotypeMaxLength the max length of haplotypes we want to use with this PairHMM + * @param readMaxLength the max length of reads we want to use with this PairHMM + */ + public void initialize( final List haplotypes, final Map> perSampleReadList, final int readMaxLength, final int haplotypeMaxLength ) { + initialize(readMaxLength, haplotypeMaxLength); + } + + private int findMaxReadLength(final GATKSAMRecord ... 
reads) { + int max = 0; + for (final GATKSAMRecord read : reads) { + final int readLength = read.getReadLength(); + if (max < readLength) + max = readLength; + } + return max; + } + + private int findMaxAlleleLength(final List alleles) { + int max = 0; + for (final Allele allele : alleles) { + final int alleleLength = allele.length(); + if (max < alleleLength) + max = alleleLength; + } + return max; + } + + protected int findMaxReadLength(final List reads) { + int listMaxReadLength = 0; + for(GATKSAMRecord read : reads){ + final int readLength = read.getReadLength(); + if( readLength > listMaxReadLength ) { listMaxReadLength = readLength; } + } + return listMaxReadLength; + } + + protected int findMaxHaplotypeLength(final Collection haplotypes) { + int listMaxHaplotypeLength = 0; + for( final Haplotype h : haplotypes) { + final int haplotypeLength = h.getBases().length; + if( haplotypeLength > listMaxHaplotypeLength ) { listMaxHaplotypeLength = haplotypeLength; } + } + return listMaxHaplotypeLength; + } + + /** + * Given a list of reads and haplotypes, for every read compute the total probability of said read arising from + * each haplotype given base substitution, insertion, and deletion probabilities. + * + * @param processedReads reads to analyze instead of the ones present in the destination read-likelihoods. + * @param likelihoods where to store the likelihoods where position [a][r] is reserved for the likelihood of {@code reads[r]} + * conditional to {@code alleles[a]}. + * @param gcp penalty for gap continuations base array map for processed reads. + * + * @throws IllegalArgumentException + * + * @return never {@code null}. 
+ */ + public void computeLikelihoods(final ReadLikelihoods.Matrix likelihoods, + final List processedReads, + final Map gcp) { + if (processedReads.isEmpty()) + return; + if(doProfiling) + startTime = System.nanoTime(); + // (re)initialize the pairHMM only if necessary + final int readMaxLength = findMaxReadLength(processedReads); + final int haplotypeMaxLength = findMaxAlleleLength(likelihoods.alleles()); + if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) + initialize(readMaxLength, haplotypeMaxLength); + + final int readCount = processedReads.size(); + final List alleles = likelihoods.alleles(); + final int alleleCount = alleles.size(); + mLikelihoodArray = new double[readCount * alleleCount]; + int idx = 0; + int readIndex = 0; + for(final GATKSAMRecord read : processedReads){ + final byte[] readBases = read.getReadBases(); + final byte[] readQuals = read.getBaseQualities(); + final byte[] readInsQuals = read.getBaseInsertionQualities(); + final byte[] readDelQuals = read.getBaseDeletionQualities(); + final byte[] overallGCP = gcp.get(read); + + // peak at the next haplotype in the list (necessary to get nextHaplotypeBases, which is required for caching in the array implementation) + final boolean isFirstHaplotype = true; + for (int a = 0; a < alleleCount; a++) { + final Allele allele = alleles.get(a); + final byte[] alleleBases = allele.getBases(); + final byte[] nextAlleleBases = a == alleles.size() - 1 ? 
null : alleles.get(a + 1).getBases(); + final double lk = computeReadLikelihoodGivenHaplotypeLog10(alleleBases, + readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextAlleleBases); + likelihoods.set(a, readIndex, lk); + mLikelihoodArray[idx++] = lk; + } + readIndex++; + } + if(doProfiling) { + threadLocalPairHMMComputeTimeDiff = (System.nanoTime() - startTime); + //synchronized(doProfiling) + { + pairHMMComputeTime += threadLocalPairHMMComputeTimeDiff; + } + } + } + + /** + * Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion + * probabilities. + * + * Note on using hapStartIndex. This allows you to compute the exact true likelihood of a full haplotypes + * given a read, assuming that the previous calculation read over a full haplotype, recaching the read values, + * starting only at the place where the new haplotype bases and the previous haplotype bases different. This + * index is 0-based, and can be computed with findFirstPositionWhereHaplotypesDiffer given the two haplotypes. + * Note that this assumes that the read and all associated quals values are the same. + * + * @param haplotypeBases the full sequence (in standard SAM encoding) of the haplotype, must be >= than read bases in length + * @param readBases the bases (in standard encoding) of the read, must be <= haplotype bases in length + * @param readQuals the phred-scaled per base substitution quality scores of read. Must be the same length as readBases + * @param insertionGOP the phred-scaled per base insertion quality scores of read. Must be the same length as readBases + * @param deletionGOP the phred-scaled per base deletion quality scores of read. Must be the same length as readBases + * @param overallGCP the phred-scaled gap continuation penalties scores of read. 
Must be the same length as readBases + * @param recacheReadValues if false, we don't recalculate any cached results, assuming that readBases and its associated + * parameters are the same, and only the haplotype bases are changing underneath us + * @return the log10 probability of read coming from the haplotype under the provided error model + */ + protected final double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final boolean recacheReadValues, + final byte[] nextHaploytpeBases) { + + if ( ! initialized ) throw new IllegalStateException("Must call initialize before calling computeReadLikelihoodGivenHaplotypeLog10"); + if ( haplotypeBases == null ) throw new IllegalArgumentException("haplotypeBases cannot be null"); + if ( haplotypeBases.length > maxHaplotypeLength ) throw new IllegalArgumentException("Haplotype bases is too long, got " + haplotypeBases.length + " but max is " + maxHaplotypeLength); + if ( readBases == null ) throw new IllegalArgumentException("readBases cannot be null"); + if ( readBases.length > maxReadLength ) throw new IllegalArgumentException("readBases is too long, got " + readBases.length + " but max is " + maxReadLength); + if ( readQuals.length != readBases.length ) throw new IllegalArgumentException("Read bases and read quals aren't the same size: " + readBases.length + " vs " + readQuals.length); + if ( insertionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read insertion quals aren't the same size: " + readBases.length + " vs " + insertionGOP.length); + if ( deletionGOP.length != readBases.length ) throw new IllegalArgumentException("Read bases and read deletion quals aren't the same size: " + readBases.length + " vs " + deletionGOP.length); + if ( overallGCP.length != readBases.length ) throw new IllegalArgumentException("Read bases 
and overall GCP aren't the same size: " + readBases.length + " vs " + overallGCP.length); + + paddedReadLength = readBases.length + 1; + paddedHaplotypeLength = haplotypeBases.length + 1; + + hapStartIndex = (recacheReadValues) ? 0 : hapStartIndex; + + // Pre-compute the difference between the current haplotype and the next one to be run + // Looking ahead is necessary for the ArrayLoglessPairHMM implementation + final int nextHapStartIndex = (nextHaploytpeBases == null || haplotypeBases.length != nextHaploytpeBases.length) ? 0 : findFirstPositionWhereHaplotypesDiffer(haplotypeBases, nextHaploytpeBases); + + double result = subComputeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, hapStartIndex, recacheReadValues, nextHapStartIndex); + + if ( result > 0.0) + throw new IllegalStateException("PairHMM Log Probability cannot be greater than 0: " + String.format("haplotype: %s, read: %s, result: %f, PairHMM: %s", new String(haplotypeBases), new String(readBases), result, this.getClass().getSimpleName())); + else if (!MathUtils.goodLog10Probability(result)) + throw new IllegalStateException("Invalid Log Probability: " + result); + + // Warning: Careful if using the PairHMM in parallel! (this update has to be taken care of). + // Warning: This assumes no downstream modification of the haplotype bases (saves us from copying the array). It is okay for the haplotype caller and the Unified Genotyper. + previousHaplotypeBases = haplotypeBases; + + // For the next iteration, the hapStartIndex for the next haploytpe becomes the index for the current haplotype + // The array implementation has to look ahead to the next haplotype to store caching info. It cannot do this if nextHapStart is before hapStart + hapStartIndex = (nextHapStartIndex < hapStartIndex) ? 
0: nextHapStartIndex; + + return result; + } + + /** + * To be overloaded by subclasses to actually do calculation for #computeReadLikelihoodGivenHaplotypeLog10 + */ + @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", + "readBases.length == overallGCP.length", "matchMatrix!=null", "insertionMatrix!=null", "deletionMatrix!=null"}) + protected abstract double subComputeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues, + final int nextHapStartIndex); + + /** + * Compute the first position at which two haplotypes differ + * + * If the haplotypes are exact copies of each other, returns the min length of the two haplotypes. + * + * @param haplotype1 the first haplotype1 + * @param haplotype2 the second haplotype1 + * @return the index of the first position in haplotype1 and haplotype2 where the byte isn't the same + */ + public static int findFirstPositionWhereHaplotypesDiffer(final byte[] haplotype1, final byte[] haplotype2) { + if ( haplotype1 == null || haplotype1.length == 0 ) throw new IllegalArgumentException("Haplotype1 is bad " + Arrays.toString(haplotype1)); + if ( haplotype2 == null || haplotype2.length == 0 ) throw new IllegalArgumentException("Haplotype2 is bad " + Arrays.toString(haplotype2)); + + for( int iii = 0; iii < haplotype1.length && iii < haplotype2.length; iii++ ) { + if( haplotype1[iii] != haplotype2[iii] ) { + return iii; + } + } + + return Math.min(haplotype1.length, haplotype2.length); + } + + /** + * Use number of threads to set doProfiling flag - doProfiling iff numThreads == 1 + * This function should be called only during initialization phase - single thread phase of HC + */ + public static void setNumberOfThreads(final int numThreads) + { + 
doProfiling = (numThreads == 1); + if(numThreads > 1) + logger.info("Performance profiling for PairHMM is disabled because HaplotypeCaller is being run with multiple threads (-nct>1) option\nProfiling is enabled only when running in single thread mode\n"); + } + + /** + * Return the results of the computeLikelihoods function + */ + public double[] getLikelihoodArray() { return mLikelihoodArray; } + /** + * Called at the end of the program to close files, print profiling information etc + */ + public void close() + { + if(doProfiling) + System.out.println("Total compute time in PairHMM computeLikelihoods() : "+(pairHMMComputeTime*1e-9)); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMModel.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMModel.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMModel.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMModel.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMReadyHaplotypes.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMReadyHaplotypes.java new file mode 100644 index 000000000..8728bb5de --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pairhmm/PairHMMReadyHaplotypes.java @@ -0,0 +1,182 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above 
copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.pairhmm; + +import java.util.*; + +/** + * Collection of haplotypes sorted in a conveniently way to be run efficiently by the PairHMM. + * + * TODO not yet in use but likely to be as part of making graph-base likelihood run faster. + * TODO this could be extended to the classical PairHMM implementation simplifyling the PairHMM API. + */ +public class PairHMMReadyHaplotypes implements Iterable { + + + public class Entry { + + private final byte[] bases; + + private double likelihood = Double.NaN; + + protected Entry(final byte[] bases) { + this.bases = bases; + } + + public byte[] getBases() { + return bases; + } + + public void setLikelihood(final double lk) { + likelihood = lk; + } + + public double getLikelihood() { + return likelihood; + } + + } + + private Map> commonPrefixLength; + + private SortedSet entries; + + private int capacity; + + private final Comparator comparator = new Comparator() { + @Override + public int compare(final Entry o1, final Entry o2) { + final byte[] b1 = o1.bases; + final byte[] b2 = o2.bases; + Map b1map = commonPrefixLength.get(o1); + if (b1map == null) + commonPrefixLength.put(o1, b1map = new HashMap<>(capacity)); + Map b2map = commonPrefixLength.get(o2); + if (b2map == null) + commonPrefixLength.put(o2, b2map = new HashMap<>(capacity)); + final Integer previousI = b1map.get(o2) == null ? 
null : b1map.get(o2); + int i; + int result; + final int iLimit = Math.min(b1.length,b2.length); + if (previousI == null) { + for (i = 0; i < iLimit; i++) + if (b1[i] != b2[i]) + break; + b1map.put(o2,i); + b2map.put(o1,i); + } else + i = previousI; + + if (i < iLimit) + result = Byte.compare(b1[i],b2[i]); + else if (b1.length == b2.length) + result = 0; + else + result = b1.length < b2.length ? -1 : 1; + return result; + } + }; + + public PairHMMReadyHaplotypes(final int capacity) { + commonPrefixLength = new HashMap<>(capacity); + entries = new TreeSet<>(comparator); + } + + public void add(final byte[] bases) { + final Entry entry = new Entry(bases); + entries.add(entry); + } + + public int size() { + return entries.size(); + } + + @Override + public Iterator iterator() { + return new Iterator(); + } + + public class Iterator implements java.util.Iterator { + + private java.util.Iterator actualIterator; + private Entry previousEntry; + private Entry currentEntry; + private int startIndex; + private int cmp; + + private Iterator() { + actualIterator = entries.iterator(); + } + + public boolean hasNext() { + return actualIterator.hasNext(); + } + + public Entry next() { + previousEntry = currentEntry; + final Entry result = currentEntry = actualIterator.next(); + startIndex = -1; + return result; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + public byte[] bases() { + if (currentEntry == null) + throw new NoSuchElementException(); + return currentEntry.bases; + } + + public int startIndex() { + if (startIndex >= 0) + return startIndex; + else if (previousEntry == null) + return startIndex = 0; + else { + // The comparator will make sure the common-prefix-length is updated. + // The result in a field so that we avoid dead code elimination. + // perhaps I a bit paranohic but it does not harm to prevent. 
+ cmp = comparator.compare(previousEntry,currentEntry); + return startIndex = commonPrefixLength.get(previousEntry).get(currentEntry); + } + } + + @Override + public String toString() { + return super.toString() + " cmp = " + cmp; + } + + public void setLikelihood(final double likelihood) { + if (currentEntry == null) + throw new NoSuchElementException(); + currentEntry.setLikelihood(likelihood); + } + } + +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/MergingPileupElementIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/MergingPileupElementIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/MergingPileupElementIterator.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/MergingPileupElementIterator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElement.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElement.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElement.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElement.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElementFilter.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElementFilter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElementFilter.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElementFilter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElementTracker.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElementTracker.java similarity 
index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElementTracker.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/PileupElementTracker.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileup.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileup.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileup.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileup.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileupImpl.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileupImpl.java new file mode 100644 index 000000000..7c19b715b --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileupImpl.java @@ -0,0 +1,1040 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.pileup; + +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.fragments.FragmentCollection; +import org.broadinstitute.gatk.utils.fragments.FragmentUtils; +import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.broadinstitute.gatk.utils.BaseUtils; + +import java.util.*; + +public class ReadBackedPileupImpl implements ReadBackedPileup { + protected final GenomeLoc loc; + protected final PileupElementTracker pileupElementTracker; + + private final static int UNINITIALIZED_CACHED_INT_VALUE = -1; + + /** + * Different then number of elements due to reduced reads + */ + private int depthOfCoverage = UNINITIALIZED_CACHED_INT_VALUE; + private int nDeletions = UNINITIALIZED_CACHED_INT_VALUE; // cached value of the number of deletions + private int nMQ0Reads = UNINITIALIZED_CACHED_INT_VALUE; // cached value of the number of MQ0 reads + + /** + * Create a new version of a read backed pileup at loc, using the reads and their corresponding + * offsets. This pileup will contain a list, in order of the reads, of the piled bases at + * reads[i] for all i in offsets. Does not make a copy of the data, so it's not safe to + * go changing the reads. 
+ * + * @param loc The genome loc to associate reads wotj + * @param reads + * @param offsets + */ + public ReadBackedPileupImpl(GenomeLoc loc, List reads, List offsets) { + this.loc = loc; + this.pileupElementTracker = readsOffsets2Pileup(reads, offsets); + } + + + /** + * Create a new version of a read backed pileup at loc without any aligned reads + */ + public ReadBackedPileupImpl(GenomeLoc loc) { + this(loc, new UnifiedPileupElementTracker()); + } + + /** + * Create a new version of a read backed pileup at loc, using the reads and their corresponding + * offsets. This lower level constructure assumes pileup is well-formed and merely keeps a + * pointer to pileup. Don't go changing the data in pileup. + */ + public ReadBackedPileupImpl(GenomeLoc loc, List pileup) { + if (loc == null) throw new ReviewedGATKException("Illegal null genomeloc in ReadBackedPileup"); + if (pileup == null) throw new ReviewedGATKException("Illegal null pileup in ReadBackedPileup"); + + this.loc = loc; + this.pileupElementTracker = new UnifiedPileupElementTracker(pileup); + } + + /** + * Optimization of above constructor where all of the cached data is provided + * + * @param loc + * @param pileup + */ + @Deprecated + public ReadBackedPileupImpl(GenomeLoc loc, List pileup, int size, int nDeletions, int nMQ0Reads) { + this(loc, pileup); + } + + protected ReadBackedPileupImpl(GenomeLoc loc, PileupElementTracker tracker) { + this.loc = loc; + this.pileupElementTracker = tracker; + } + + public ReadBackedPileupImpl(GenomeLoc loc, Map pileupsBySample) { + this.loc = loc; + PerSamplePileupElementTracker tracker = new PerSamplePileupElementTracker(); + for (Map.Entry pileupEntry : pileupsBySample.entrySet()) { + tracker.addElements(pileupEntry.getKey(), pileupEntry.getValue().pileupElementTracker); + } + this.pileupElementTracker = tracker; + } + + public ReadBackedPileupImpl(GenomeLoc loc, List reads, int offset) { + this.loc = loc; + this.pileupElementTracker = readsOffsets2Pileup(reads, 
offset); + } + + /** + * Helper routine for converting reads and offset lists to a PileupElement list. + * + * @param reads + * @param offsets + * @return + */ + private PileupElementTracker readsOffsets2Pileup(List reads, List offsets) { + if (reads == null) throw new ReviewedGATKException("Illegal null read list in UnifiedReadBackedPileup"); + if (offsets == null) throw new ReviewedGATKException("Illegal null offsets list in UnifiedReadBackedPileup"); + if (reads.size() != offsets.size()) + throw new ReviewedGATKException("Reads and offset lists have different sizes!"); + + UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); + for (int i = 0; i < reads.size(); i++) { + GATKSAMRecord read = reads.get(i); + int offset = offsets.get(i); + pileup.add(createNewPileupElement(read, offset)); // only used to create fake pileups for testing so ancillary information is not important + } + + return pileup; + } + + /** + * Helper routine for converting reads and a single offset to a PileupElement list. 
+ * + * @param reads + * @param offset + * @return + */ + private PileupElementTracker readsOffsets2Pileup(List reads, int offset) { + if (reads == null) throw new ReviewedGATKException("Illegal null read list in UnifiedReadBackedPileup"); + if (offset < 0) throw new ReviewedGATKException("Illegal offset < 0 UnifiedReadBackedPileup"); + + UnifiedPileupElementTracker pileup = new UnifiedPileupElementTracker(); + for (GATKSAMRecord read : reads) { + pileup.add(createNewPileupElement(read, offset)); // only used to create fake pileups for testing so ancillary information is not important + } + + return pileup; + } + + protected ReadBackedPileupImpl createNewPileup(GenomeLoc loc, PileupElementTracker tracker) { + return new ReadBackedPileupImpl(loc, tracker); + } + + protected PileupElement createNewPileupElement(GATKSAMRecord read, int offset) { + return LocusIteratorByState.createPileupForReadAndOffset(read, offset); + } + + // -------------------------------------------------------- + // + // Special 'constructors' + // + // -------------------------------------------------------- + + /** + * Returns a new ReadBackedPileup that is free of deletion spanning reads in this pileup. Note that this + * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy + * of the pileup (just returns this) if there are no deletions in the pileup. 
+ * + * @return + */ + @Override + public ReadBackedPileupImpl getPileupWithoutDeletions() { + if (getNumberOfDeletions() > 0) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupWithoutDeletions(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return createNewPileup(loc, filteredTracker); + + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : tracker) { + if (!p.isDeletion()) { + filteredTracker.add(p); + } + } + return createNewPileup(loc, filteredTracker); + } + } else { + return this; + } + } + + /** + * Returns a new ReadBackedPileup where only one read from an overlapping read + * pair is retained. If the two reads in question disagree to their basecall, + * neither read is retained. If they agree on the base, the read with the higher + * base quality observation is retained + * + * @return the newly filtered pileup + */ + @Override + public ReadBackedPileup getOverlappingFragmentFilteredPileup() { + return getOverlappingFragmentFilteredPileup(true, true); + } + + /** + * Returns a new ReadBackedPileup where only one read from an overlapping read + * pair is retained. If discardDiscordant and the two reads in question disagree to their basecall, + * neither read is retained. 
Otherwise, the read with the higher + * quality (base or mapping, depending on baseQualNotMapQual) observation is retained + * + * @return the newly filtered pileup + */ + @Override + public ReadBackedPileupImpl getOverlappingFragmentFilteredPileup(boolean discardDiscordant, boolean baseQualNotMapQual) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return createNewPileup(loc, filteredTracker); + } else { + Map filteredPileup = new HashMap(); + + for (PileupElement p : pileupElementTracker) { + String readName = p.getRead().getReadName(); + + // if we've never seen this read before, life is good + if (!filteredPileup.containsKey(readName)) { + filteredPileup.put(readName, p); + } else { + PileupElement existing = filteredPileup.get(readName); + + // if the reads disagree at this position, throw them both out. 
Otherwise + // keep the element with the higher quality score + if (discardDiscordant && existing.getBase() != p.getBase()) { + filteredPileup.remove(readName); + } else { + if (baseQualNotMapQual) { + if (existing.getQual() < p.getQual()) + filteredPileup.put(readName, p); + } + else { + if (existing.getMappingQual() < p.getMappingQual()) + filteredPileup.put(readName, p); + } + } + } + } + + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement filteredElement : filteredPileup.values()) + filteredTracker.add(filteredElement); + + return createNewPileup(loc, filteredTracker); + } + } + + + /** + * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this + * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy + * of the pileup (just returns this) if there are no MQ0 reads in the pileup. + * + * @return + */ + @Override + public ReadBackedPileupImpl getPileupWithoutMappingQualityZeroReads() { + if (getNumberOfMappingQualityZeroReads() > 0) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupWithoutMappingQualityZeroReads(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return createNewPileup(loc, filteredTracker); + + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : tracker) { + if (p.getRead().getMappingQuality() > 0) { + 
filteredTracker.add(p); + } + } + return createNewPileup(loc, filteredTracker); + } + } else { + return this; + } + } + + public ReadBackedPileupImpl getPositiveStrandPileup() { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPositiveStrandPileup(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : tracker) { + if (!p.getRead().getReadNegativeStrandFlag()) { + filteredTracker.add(p); + } + } + return createNewPileup(loc, filteredTracker); + } + } + + /** + * Gets the pileup consisting of only reads on the negative strand. + * + * @return A read-backed pileup consisting only of reads on the negative strand. 
+ */ + public ReadBackedPileupImpl getNegativeStrandPileup() { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getNegativeStrandPileup(); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : tracker) { + if (p.getRead().getReadNegativeStrandFlag()) { + filteredTracker.add(p); + } + } + return createNewPileup(loc, filteredTracker); + } + } + + /** + * Gets a pileup consisting of all those elements passed by a given filter. + * + * @param filter Filter to use when testing for elements. + * @return a pileup without the given filtered elements. 
+ */ + public ReadBackedPileupImpl getFilteredPileup(PileupElementFilter filter) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getFilteredPileup(filter); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + + return createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : pileupElementTracker) { + if (filter.allow(p)) + filteredTracker.add(p); + } + + return createNewPileup(loc, filteredTracker); + } + } + + /** + * Returns subset of this pileup that contains only bases with quality >= minBaseQ, coming from + * reads with mapping qualities >= minMapQ. This method allocates and returns a new instance of ReadBackedPileup. 
+ * + * @param minBaseQ + * @param minMapQ + * @return + */ + @Override + public ReadBackedPileupImpl getBaseAndMappingFilteredPileup(int minBaseQ, int minMapQ) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getBaseAndMappingFilteredPileup(minBaseQ, minMapQ); + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + + return createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + for (PileupElement p : pileupElementTracker) { + if (p.getRead().getMappingQuality() >= minMapQ && (p.isDeletion() || p.getQual() >= minBaseQ)) { + filteredTracker.add(p); + } + } + + return createNewPileup(loc, filteredTracker); + } + } + + /** + * Returns subset of this pileup that contains only bases with quality >= minBaseQ. + * This method allocates and returns a new instance of ReadBackedPileup. + * + * @param minBaseQ + * @return + */ + @Override + public ReadBackedPileup getBaseFilteredPileup(int minBaseQ) { + return getBaseAndMappingFilteredPileup(minBaseQ, -1); + } + + /** + * Returns subset of this pileup that contains only bases coming from reads with mapping quality >= minMapQ. + * This method allocates and returns a new instance of ReadBackedPileup. + * + * @param minMapQ + * @return + */ + @Override + public ReadBackedPileup getMappingFilteredPileup(int minMapQ) { + return getBaseAndMappingFilteredPileup(-1, minMapQ); + } + + /** + * Gets a list of the read groups represented in this pileup. 
+ * + * @return + */ + @Override + public Collection getReadGroups() { + Set readGroups = new HashSet(); + for (PileupElement pileupElement : this) + readGroups.add(pileupElement.getRead().getReadGroup().getReadGroupId()); + return readGroups; + } + + /** + * Gets the pileup for a given read group. Horrendously inefficient at this point. + * + * @param targetReadGroupId Identifier for the read group. + * @return A read-backed pileup containing only the reads in the given read group. + */ + @Override + public ReadBackedPileupImpl getPileupForReadGroup(String targetReadGroupId) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroup(targetReadGroupId); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement p : pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if (targetReadGroupId != null) { + if (read.getReadGroup() != null && targetReadGroupId.equals(read.getReadGroup().getReadGroupId())) + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } + } + + /** + * Gets the pileup for a set of read groups. Horrendously inefficient at this point. + * + * @param rgSet List of identifiers for the read groups. 
+ * @return A read-backed pileup containing only the reads in the given read groups. + */ + @Override + public ReadBackedPileupImpl getPileupForReadGroups(final HashSet rgSet) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForReadGroups(rgSet); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement p : pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if (rgSet != null && !rgSet.isEmpty()) { + if (read.getReadGroup() != null && rgSet.contains(read.getReadGroup().getReadGroupId())) + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size() > 0 ? 
createNewPileup(loc, filteredTracker) : null; + } + } + + @Override + public ReadBackedPileupImpl getPileupForLane(String laneID) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + ReadBackedPileupImpl pileup = createNewPileup(loc, perSampleElements).getPileupForLane(laneID); + if (pileup != null) + filteredTracker.addElements(sample, pileup.pileupElementTracker); + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement p : pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if (laneID != null) { + if (read.getReadGroup() != null && + (read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different + (read.getReadGroup().getReadGroupId().equals(laneID))) // in case there is no sample identifier, they have to be exactly the same + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getReadGroupId() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } + } + + public Collection getSamples() { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + return new HashSet(tracker.getSamples()); + } else { + Collection sampleNames = new HashSet(); + for (PileupElement p : this) { + GATKSAMRecord read = p.getRead(); + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + sampleNames.add(sampleName); + } + return sampleNames; + } + } + + /** + * Returns a pileup randomly downsampled to the desiredCoverage. + * + * TODO: delete this once the experimental downsampler stabilizes + * + * @param desiredCoverage + * @return + */ + @Override + public ReadBackedPileup getDownsampledPileup(int desiredCoverage) { + if (getNumberOfElements() <= desiredCoverage) + return this; + + // randomly choose numbers corresponding to positions in the reads list + TreeSet positions = new TreeSet(); + for (int i = 0; i < desiredCoverage; /* no update */) { + if (positions.add(Utils.getRandomGenerator().nextInt(getNumberOfElements()))) + i++; + } + + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); + + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + + int current = 0; + UnifiedPileupElementTracker filteredPileup = new UnifiedPileupElementTracker(); + for (PileupElement p : perSampleElements) { + if (positions.contains(current)) + filteredPileup.add(p); + current++; + + } + filteredTracker.addElements(sample, filteredPileup); + } + + return createNewPileup(loc, filteredTracker); + } else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + + Iterator positionIter = positions.iterator(); + + while (positionIter.hasNext()) { + int nextReadToKeep = (Integer) positionIter.next(); + filteredTracker.add(tracker.get(nextReadToKeep)); + } + + return createNewPileup(getLocation(), filteredTracker); + } + } + + @Override + public ReadBackedPileup getPileupForSamples(Collection sampleNames) { + if (pileupElementTracker 
instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PileupElementTracker filteredElements = tracker.getElements(sampleNames); + return filteredElements != null ? createNewPileup(loc, filteredElements) : null; + } else { + HashSet hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement p : pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if (sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. + if (read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size() > 0 ? 
createNewPileup(loc, filteredTracker) : null; + } + } + + @Override + public Map getPileupsForSamples(Collection sampleNames) { + Map result = new HashMap(); + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + for (String sample : sampleNames) { + PileupElementTracker filteredElements = tracker.getElements(sample); + if (filteredElements != null) + result.put(sample, createNewPileup(loc, filteredElements)); + } + } else { + Map> trackerMap = new HashMap>(); + + for (String sample : sampleNames) { // initialize pileups for each sample + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + trackerMap.put(sample, filteredTracker); + } + for (PileupElement p : pileupElementTracker) { // go through all pileup elements only once and add them to the respective sample's pileup + GATKSAMRecord read = p.getRead(); + if (read.getReadGroup() != null) { + String sample = read.getReadGroup().getSample(); + UnifiedPileupElementTracker tracker = trackerMap.get(sample); + if (tracker != null) // we only add the pileup the requested samples. Completely ignore the rest + tracker.add(p); + } + } + for (Map.Entry> entry : trackerMap.entrySet()) // create the ReadBackedPileup for each sample + result.put(entry.getKey(), createNewPileup(loc, entry.getValue())); + } + return result; + } + + + @Override + public ReadBackedPileup getPileupForSample(String sampleName) { + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + PileupElementTracker filteredElements = tracker.getElements(sampleName); + return filteredElements != null ? 
createNewPileup(loc, filteredElements) : null; + } else { + UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); + for (PileupElement p : pileupElementTracker) { + GATKSAMRecord read = p.getRead(); + if (sampleName != null) { + if (read.getReadGroup() != null && sampleName.equals(read.getReadGroup().getSample())) + filteredTracker.add(p); + } else { + if (read.getReadGroup() == null || read.getReadGroup().getSample() == null) + filteredTracker.add(p); + } + } + return filteredTracker.size() > 0 ? createNewPileup(loc, filteredTracker) : null; + } + } + + // -------------------------------------------------------- + // + // iterators + // + // -------------------------------------------------------- + + /** + * The best way to access PileupElements where you only care about the bases and quals in the pileup. + *

+ * for (PileupElement p : this) { doSomething(p); } + *

+ * Provides efficient iteration of the data. + * + * @return + */ + @Override + public Iterator iterator() { + return new Iterator() { + private final Iterator wrappedIterator = pileupElementTracker.iterator(); + + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public PileupElement next() { + return wrappedIterator.next(); + } + + public void remove() { + throw new UnsupportedOperationException("Cannot remove from a pileup element iterator"); + } + }; + } + + /** + * The best way to access PileupElements where you only care not only about bases and quals in the pileup + * but also need access to the index of the pileup element in the pile. + * + * for (ExtendedPileupElement p : this) { doSomething(p); } + * + * Provides efficient iteration of the data. + * + * @return + */ + + /** + * Simple useful routine to count the number of deletion bases in this pileup + * + * @return + */ + @Override + public int getNumberOfDeletions() { + if ( nDeletions == UNINITIALIZED_CACHED_INT_VALUE ) { + nDeletions = 0; + for (PileupElement p : pileupElementTracker.unorderedIterable() ) { + if (p.isDeletion()) { + nDeletions++; + } + } + } + return nDeletions; + } + + @Override + public int getNumberOfMappingQualityZeroReads() { + if ( nMQ0Reads == UNINITIALIZED_CACHED_INT_VALUE ) { + nMQ0Reads = 0; + + for (PileupElement p : pileupElementTracker.unorderedIterable()) { + if (p.getRead().getMappingQuality() == 0) { + nMQ0Reads++; + } + } + } + + return nMQ0Reads; + } + + /** + * @return the number of physical elements in this pileup + */ + @Override + public int getNumberOfElements() { + return pileupElementTracker.size(); + } + + /** + * @return the number of abstract elements in this pileup + */ + @Override + public int depthOfCoverage() { + if (depthOfCoverage == UNINITIALIZED_CACHED_INT_VALUE) { + depthOfCoverage = pileupElementTracker.size(); + } + return depthOfCoverage; + } + + /** + * @return true if there are 0 elements in the pileup, false otherwise + 
*/ + @Override + public boolean isEmpty() { + return getNumberOfElements() == 0; + } + + + /** + * @return the location of this pileup + */ + @Override + public GenomeLoc getLocation() { + return loc; + } + + /** + * Get counts of A, C, G, T in order, which returns a int[4] vector with counts according + * to BaseUtils.simpleBaseToBaseIndex for each base. + * + * @return + */ + @Override + public int[] getBaseCounts() { + int[] counts = new int[4]; + + // TODO -- can be optimized with .unorderedIterable() + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + for (final String sample : tracker.getSamples()) { + int[] countsBySample = createNewPileup(loc, tracker.getElements(sample)).getBaseCounts(); + for (int i = 0; i < counts.length; i++) + counts[i] += countsBySample[i]; + } + } else { + for (PileupElement pile : this) { + // skip deletion sites + if (!pile.isDeletion()) { + int index = BaseUtils.simpleBaseToBaseIndex((char) pile.getBase()); + if (index != -1) + counts[index]++; + } + } + } + + return counts; + } + + @Override + public String getPileupString(Character ref) { + // In the pileup format, each line represents a genomic position, consisting of chromosome name, + // coordinate, reference base, read bases, read qualities and alignment mapping qualities. + return String.format("%s %s %c %s %s", + getLocation().getContig(), getLocation().getStart(), // chromosome name and coordinate + ref, // reference base + new String(getBases()), + getQualsString()); + } + + // -------------------------------------------------------- + // + // Convenience functions that may be slow + // + // -------------------------------------------------------- + + /** + * Returns a list of the reads in this pileup. 
Note this call costs O(n) and allocates fresh lists each time + * + * @return + */ + @Override + public List getReads() { + List reads = new ArrayList(getNumberOfElements()); + for (PileupElement pile : this) { + reads.add(pile.getRead()); + } + return reads; + } + + @Override + public int getNumberOfDeletionsAfterThisElement() { + int count = 0; + for (PileupElement p : pileupElementTracker.unorderedIterable()) { + if (p.isBeforeDeletionStart()) + count++; + } + return count; + } + + @Override + public int getNumberOfInsertionsAfterThisElement() { + int count = 0; + for (PileupElement p : pileupElementTracker.unorderedIterable()) { + if (p.isBeforeInsertion()) + count++; + } + return count; + + } + /** + * Returns a list of the offsets in this pileup. Note this call costs O(n) and allocates fresh lists each time + * + * @return + */ + @Override + public List getOffsets() { + List offsets = new ArrayList(getNumberOfElements()); + for (PileupElement pile : pileupElementTracker.unorderedIterable()) { + offsets.add(pile.getOffset()); + } + return offsets; + } + + /** + * Returns an array of the bases in this pileup. Note this call costs O(n) and allocates fresh array each time + * + * @return + */ + @Override + public byte[] getBases() { + byte[] v = new byte[getNumberOfElements()]; + int pos = 0; + for (PileupElement pile : pileupElementTracker) { + v[pos++] = pile.getBase(); + } + return v; + } + + /** + * Returns an array of the quals in this pileup. 
Note this call costs O(n) and allocates fresh array each time + * + * @return + */ + @Override + public byte[] getQuals() { + byte[] v = new byte[getNumberOfElements()]; + int pos = 0; + for (PileupElement pile : pileupElementTracker) { + v[pos++] = pile.getQual(); + } + return v; + } + + /** + * Get an array of the mapping qualities + * + * @return + */ + @Override + public int[] getMappingQuals() { + final int[] v = new int[getNumberOfElements()]; + int pos = 0; + for ( final PileupElement pile : pileupElementTracker ) { + v[pos++] = pile.getRead().getMappingQuality(); + } + return v; + } + + static String quals2String(byte[] quals) { + StringBuilder qualStr = new StringBuilder(); + for (int qual : quals) { + qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea + char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 + qualStr.append(qualChar); + } + + return qualStr.toString(); + } + + private String getQualsString() { + return quals2String(getQuals()); + } + + /** + * Returns a new ReadBackedPileup that is sorted by start coordinate of the reads. + * + * @return + */ + @Override + public ReadBackedPileup getStartSortedPileup() { + + final TreeSet sortedElements = new TreeSet(new Comparator() { + @Override + public int compare(PileupElement element1, PileupElement element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? 
difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + for (PileupElement pile : perSampleElements) + sortedElements.add(pile); + } + } + else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + for (PileupElement pile : tracker) + sortedElements.add(pile); + } + + UnifiedPileupElementTracker sortedTracker = new UnifiedPileupElementTracker(); + for (PileupElement pile : sortedElements) + sortedTracker.add(pile); + + return createNewPileup(loc, sortedTracker); + } + + @Override + public FragmentCollection toFragments() { + return FragmentUtils.create(this); + } + + @Override + public ReadBackedPileup copy() { + return new ReadBackedPileupImpl(loc, pileupElementTracker.copy()); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup2/Notes b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup2/Notes similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/pileup2/Notes rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/pileup2/Notes diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeter.java diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemon.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterData.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterData.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterData.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterData.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/recalibration/EventType.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/recalibration/EventType.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/recalibration/EventType.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/recalibration/EventType.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/RODRecordListImpl.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/RODRecordListImpl.java new file mode 100644 index 000000000..79631a244 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/RODRecordListImpl.java @@ -0,0 +1,129 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* 
restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata; + +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.HasGenomeLocation; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Sep 10, 2009 + * Time: 6:10:48 PM + * To change this template use File | Settings | File Templates. + */ +public class RODRecordListImpl extends AbstractList implements Comparable, Cloneable, RODRecordList, HasGenomeLocation { + private List records; + private GenomeLoc location = null; + private String name = null; + + public RODRecordListImpl(String name) { + records = new ArrayList(); + this.name = name; + } + + /** + * Fully qualified constructor: instantiates a new GATKFeatureRecordList object with specified GATKFeature track name, location on the + * reference, and list of associated GATKFeatures. 
This is a knee-deep COPY constructor: passed name, loc, and data element + * objects will be referenced from the created GATKFeatureRecordList (so that changing them from outside will affect data + * in this object), however, the data elements will be copied into a newly + * allocated list, so that the 'data' collection argument can be modified afterwards without affecting the state + * of this record list. WARNING: this constructor is (semi-)validating: passed name and location + * are allowed to be nulls (although it maybe unsafe, use caution), but if they are not nulls, then passed non-null GATKFeature data + * elements must have same track name, and their locations must overlap with the passed 'location' argument. Null + * data elements or null 'data' collection argument are allowed as well. + * @param name the name of the track + * @param data the collection of features at this location + * @param loc the location + */ + public RODRecordListImpl(String name, Collection data, GenomeLoc loc) { + this.records = new ArrayList(data==null?0:data.size()); + this.name = name; + this.location = loc; + if ( data == null || data.size() == 0 ) return; // empty dataset, nothing to do + for ( GATKFeature r : data ) { + records.add(r); + if ( r == null ) continue; + if ( ! this.name.equals(r.getName() ) ) { + throw new ReviewedGATKException("Attempt to add GATKFeature with non-matching name "+r.getName()+" to the track "+name); + } + if ( location != null && ! 
location.overlapsP(r.getLocation()) ) { + throw new ReviewedGATKException("Attempt to add GATKFeature that lies outside of specified interval "+location+"; offending GATKFeature:\n"+r.toString()); + } + } + } + + + public GenomeLoc getLocation() { return location; } + public String getName() { return name; } + public Iterator iterator() { return records.iterator() ; } + public void clear() { records.clear(); } + public boolean isEmpty() { return records.isEmpty(); } + + public boolean add(GATKFeature record) { add(record, false); return true;} + + @Override + public GATKFeature get(int i) { + return records.get(i); + } + + public void add(GATKFeature record, boolean allowNameMismatch) { + if ( record != null ) { + if ( ! allowNameMismatch && ! name.equals(record.getName() ) ) + throw new ReviewedGATKException("Attempt to add GATKFeature with non-matching name "+record.getName()+" to the track "+name); + } + records.add(record); + } + + public void add(RODRecordList records ) { add( records, false ); } + + public void add(RODRecordList records, boolean allowNameMismatch) { + for ( GATKFeature record : records ) + add(record, allowNameMismatch); + } + + public int size() { return records.size() ; } + + /** + * Compares this object with the specified object for order. Returns a + * negative integer, zero, or a positive integer as this object is less + * than, equal to, or greater than the specified object. + * + * @param that the object to be compared. + * @return a negative integer, zero, or a positive integer as this object + * is less than, equal to, or greater than the specified object. + * @throws ClassCastException if the specified object's type prevents it + * from being compared to this object. + */ + public int compareTo(RODRecordList that) { + return getLocation().compareTo(that.getLocation()); //To change body of implemented methods use File | Settings | File Templates. 
+ } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/RefMetaDataTracker.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/RefMetaDataTracker.java new file mode 100644 index 000000000..f4fd40f7d --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/RefMetaDataTracker.java @@ -0,0 +1,497 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.refdata; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.commandline.RodBinding; +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.util.*; + +/** + * This class represents the Reference Metadata available at a particular site in the genome. It can be + * used to conveniently lookup the RMDs at this site, as well just getting a list of all of the RMDs + * + * The standard interaction model is: + * + * Traversal system arrives at a site, which has a bunch of RMDs covering it + * Traversal passes creates a tracker and passes it to the walker + * walker calls get(rodBinding) to obtain the RMDs values at this site for the track + * associated with rodBinding. + * + * Note that this is an immutable class. 
Once created the underlying data structures + * cannot be modified + * + * User: mdepristo + * Date: Apr 3, 2009 + * Time: 3:05:23 PM + */ +public class RefMetaDataTracker { + // TODO: this should be a list, not a bindings, actually + private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY"); + + final Map bindings; + final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); + public final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker(); + + // ------------------------------------------------------------------------------------------ + // + // + // Special ENGINE interaction functions + // + // + // ------------------------------------------------------------------------------------------ + + /** + * Create an tracker with no bindings + */ + public RefMetaDataTracker() { + bindings = Collections.emptyMap(); + } + + public RefMetaDataTracker(final Collection allBindings) { + // set up the bindings + if ( allBindings.isEmpty() ) + bindings = Collections.emptyMap(); + else { + final Map tmap = new HashMap(allBindings.size()); + for ( RODRecordList rod : allBindings ) { + if ( rod != null && ! rod.isEmpty() ) + tmap.put(canonicalName(rod.getName()), rod); + } + + // ensure that no one modifies the bindings itself + bindings = Collections.unmodifiableMap(tmap); + } + } + + // ------------------------------------------------------------------------------------------ + // + // + // Generic accessors + // + // + // ------------------------------------------------------------------------------------------ + + /** + * Gets all of the Tribble features spanning this locus, returning them as a list of specific + * type T extending Feature. This function looks across all tracks to find the Features, so + * if you have two tracks A and B each containing 1 Feature, then getValues will return + * a list containing both features. 
+ * + * Note that this function assumes that all of the bound features are instances of or + * subclasses of T. A ClassCastException will occur if this isn't the case. If you want + * to get all Features without any danger of such an exception use the root Tribble + * interface Feature. + * + * @param type The type of the underlying objects bound here + * @param as above + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"type != null"}) + @Ensures("result != null") + public List getValues(final Class type) { + return addValues(bindings.keySet(), type, new ArrayList(), null, false, false); + } + + /** + * Provides the same functionality as @link #getValues(Class) but will only include + * Features that start as the GenomeLoc provide onlyAtThisLoc. + * + * @param type The type of the underlying objects bound here + * @param onlyAtThisLoc + * @param as above + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"type != null", "onlyAtThisLoc != null"}) + @Ensures("result != null") + public List getValues(final Class type, final GenomeLoc onlyAtThisLoc) { + return addValues(bindings.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); + } + + /** + * Uses the same logic as @link #getValues(Class) but arbitrary select one of the resulting + * elements of the list to return. That is, if there would be two elements in the result of + * @link #getValues(Class), one of these two is selected, and which one it will be isn't + * specified. Consequently, this method is only really safe if (1) you absolutely know + * that only one binding will meet the constraints of @link #getValues(Class) or (2) + * you truly don't care which of the multiple bindings available you are going to examine. 
+ * + * If there are no bindings here, getFirstValue() return null + * + * @param type The type of the underlying objects bound here + * @param as above + * @return A random single element the RODs bound here, or null if none are bound. + */ + @Requires({"type != null"}) + public T getFirstValue(final Class type) { + return safeGetFirst(getValues(type)); + } + + /** + * Uses the same logic as @link #getValue(Class,GenomeLoc) to determine the list + * of eligible Features and @link #getFirstValue(Class) to select a single + * element from the interval list. + * + * @param type The type of the underlying objects bound here + * @param as above + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A random single element the RODs bound here starting at onlyAtThisLoc, or null if none are bound. + */ + @Requires({"type != null", "onlyAtThisLoc != null"}) + public T getFirstValue(final Class type, final GenomeLoc onlyAtThisLoc) { + return safeGetFirst(getValues(type, onlyAtThisLoc)); + } + + /** + * Same logic as @link #getFirstValue(RodBinding, boolean) but prioritizes records from prioritizeThisLoc if available + * + * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @param prioritizeThisLoc only Features starting at this site are considered + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. 
+ */ + @Requires({"rodBindings != null", "prioritizeThisLoc != null"}) + @Ensures("result != null") + public List getPrioritizedValue(final Collection> rodBindings, final GenomeLoc prioritizeThisLoc) { + final List results = new ArrayList<>(); + + for ( final RodBinding rodBinding : rodBindings ) { + + // if there's a value at the prioritized location, take it + T value = getFirstValue(rodBinding, prioritizeThisLoc); + + // otherwise, grab any one + if ( value == null ) + value = getFirstValue(rodBinding); + + // add if not null + if ( value != null ) + results.add(value); + } + + return results; + } + + /** + * Gets all of the Tribble features bound to RodBinding spanning this locus, returning them as + * a list of specific type T extending Feature. + * + * Note that this function assumes that all of the bound features are instances of or + * subclasses of T. A ClassCastException will occur if this isn't the case. + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"rodBinding != null"}) + @Ensures("result != null") + public List getValues(final RodBinding rodBinding) { + return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), null, false, false); + } + + /** + * Gets all of the Tribble features bound to any RodBinding in rodBindings, + * spanning this locus, returning them as a list of specific type T extending Feature. + * + * Note that this function assumes that all of the bound features are instances of or + * subclasses of T. A ClassCastException will occur if this isn't the case. 
+ * + * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"rodBindings != null"}) + @Ensures("result != null") + public List getValues(final Collection> rodBindings) { + List results = new ArrayList(1); + for ( RodBinding rodBinding : rodBindings ) + results.addAll(getValues(rodBinding)); + return results; + } + + /** + * The same logic as @link #getValues(RodBinding) but enforces that each Feature start at onlyAtThisLoc + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) + @Ensures("result != null") + public List getValues(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { + return addValues(rodBinding.getName(), rodBinding.getType(), new ArrayList(1), getTrackDataByName(rodBinding), onlyAtThisLoc, true, false); + } + + /** + * The same logic as @link #getValues(List) but enforces that each Feature start at onlyAtThisLoc + * + * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. 
+ */ + @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) + @Ensures("result != null") + public List getValues(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { + List results = new ArrayList(1); + for ( RodBinding rodBinding : rodBindings ) + results.addAll(getValues(rodBinding, onlyAtThisLoc)); + return results; + } + + /** + * Uses the same logic as @getValues(RodBinding) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param as above + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBinding != null"}) + public T getFirstValue(final RodBinding rodBinding) { + return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), null, false, true)); + } + + /** + * Uses the same logic as @getValues(RodBinding, GenomeLoc) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. + * + * @param rodBinding Only Features coming from the track associated with this rodBinding are fetched + * @param as above + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBinding != null", "onlyAtThisLoc != null"}) + public T getFirstValue(final RodBinding rodBinding, final GenomeLoc onlyAtThisLoc) { + return safeGetFirst(addValues(rodBinding.getName(), rodBinding.getType(), null, getTrackDataByName(rodBinding), onlyAtThisLoc, true, true)); + } + + /** + * Uses the same logic as @getValues(List) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. 
+ * + * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched + * @param as above + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBindings != null"}) + public T getFirstValue(final Collection> rodBindings) { + for ( RodBinding rodBinding : rodBindings ) { + T val = getFirstValue(rodBinding); + if ( val != null ) + return val; + } + return null; + } + + /** + * Uses the same logic as @getValues(RodBinding,GenomeLoc) to determine the list + * of eligible Features and select a single element from the resulting set + * of eligible features. + * + * @param rodBindings Only Features coming from the tracks associated with these rodBindings are fetched + * @param as above + * @param onlyAtThisLoc only Features starting at this site are considered + * @return A random single element the eligible Features found, or null if none are bound. + */ + @Requires({"rodBindings != null", "onlyAtThisLoc != null"}) + public T getFirstValue(final Collection> rodBindings, final GenomeLoc onlyAtThisLoc) { + for ( RodBinding rodBinding : rodBindings ) { + T val = getFirstValue(rodBinding, onlyAtThisLoc); + if ( val != null ) + return val; + } + return null; + } + + /** + * Is there a binding at this site to a ROD/track with the specified name? + * + * @param rodBinding the rod binding we want to know about + * @return true if any Features are bound in this tracker to rodBinding + */ + @Requires({"rodBinding != null"}) + public boolean hasValues(final RodBinding rodBinding) { + return bindings.containsKey(canonicalName(rodBinding.getName())); + } + + /** + * Get all of the RMD tracks at the current site. Each track is returned as a single compound + * object (RODRecordList) that may contain multiple RMD records associated with the current site. 
+ * + * @return List of all tracks + */ + public List getBoundRodTracks() { + return new ArrayList(bindings.values()); + } + + /** + * The number of tracks with at least one value bound here + * @return the number of tracks with at least one bound Feature + */ + public int getNTracksWithBoundFeatures() { + return bindings.size(); + } + + // ------------------------------------------------------------------------------------------ + // Protected accessors using strings for unit testing + // ------------------------------------------------------------------------------------------ + + protected boolean hasValues(final String name) { + return bindings.containsKey(canonicalName(name)); + } + + protected List getValues(final Class type, final String name) { + return addValues(name, type, new ArrayList(), getTrackDataByName(name), null, false, false); + } + + protected List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); + } + + protected T getFirstValue(final Class type, final String name) { + return safeGetFirst(getValues(type, name)); + } + + protected T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + return safeGetFirst(getValues(type, name, onlyAtThisLoc)); + } + + // ------------------------------------------------------------------------------------------ + // + // + // Private utility functions + // + // + // ------------------------------------------------------------------------------------------ + + /** + * Helper function for getFirst() operations that takes a list of and + * returns the first element, or null if no such element exists. + * + * @param l + * @param + * @return + */ + @Requires({"l != null"}) + private T safeGetFirst(final List l) { + return l.isEmpty() ? 
null : l.get(0); + } + + private List addValues(final Collection names, + final Class type, + List values, + final GenomeLoc curLocation, + final boolean requireStartHere, + final boolean takeFirstOnly ) { + for ( String name : names ) { + RODRecordList rodList = getTrackDataByName(name); // require that the name is an exact match + values = addValues(name, type, values, rodList, curLocation, requireStartHere, takeFirstOnly ); + if ( takeFirstOnly && ! values.isEmpty() ) + break; + } + + return values; + } + + + + private List addValues(final String name, + final Class type, + List values, + final RODRecordList rodList, + final GenomeLoc curLocation, + final boolean requireStartHere, + final boolean takeFirstOnly ) { + for ( GATKFeature rec : rodList ) { + if ( ! requireStartHere || rec.getLocation().getStart() == curLocation.getStart() ) { // ok, we are going to keep this thing + Object obj = rec.getUnderlyingObject(); + if (!(type.isAssignableFrom(obj.getClass()))) + throw new UserException.CommandLineException("Unable to cast track named " + name + " to type of " + type.toString() + + " it's of type " + obj.getClass()); + + T objT = (T)obj; + if ( takeFirstOnly ) { + if ( values == null ) + values = Arrays.asList(objT); + else + values.add(objT); + + break; + } else { + if ( values == null ) + values = new ArrayList(); + values.add(objT); + } + } + } + + return values == null ? Collections.emptyList() : values; + } + + /** + * Finds the reference metadata track named 'name' and returns all ROD records from that track associated + * with the current site as a RODRecordList List object. If no data track with specified name is available, + * returns defaultValue wrapped as RODRecordList object. 
NOTE: if defaultValue is null, it will be wrapped up + * with track name set to 'name' and location set to null; otherwise the wrapper object will have name and + * location set to defaultValue.getID() and defaultValue.getLocation(), respectively (use caution, + * defaultValue.getLocation() may be not equal to what RODRecordList's location would be expected to be otherwise: + * for instance, on locus traversal, location is usually expected to be a single base we are currently looking at, + * regardless of the presence of "extended" RODs overlapping with that location). + * @param name track name + * @return track data for the given rod + */ + private RODRecordList getTrackDataByName(final String name) { + final String luName = canonicalName(name); + RODRecordList l = bindings.get(luName); + return l == null ? EMPTY_ROD_RECORD_LIST : l; + } + + private RODRecordList getTrackDataByName(final RodBinding binding) { + return getTrackDataByName(binding.getName()); + } + + /** + * Returns the canonical name of the rod name (lowercases it) + * @param name the name of the rod + * @return canonical name of the rod + */ + private String canonicalName(final String name) { + // todo -- remove me after switch to RodBinding syntax + return name.toLowerCase(); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/ReferenceDependentFeatureCodec.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/ReferenceDependentFeatureCodec.java new file mode 100644 index 000000000..d8cbbd6be --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/ReferenceDependentFeatureCodec.java @@ -0,0 +1,42 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, 
modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata; + +import org.broadinstitute.gatk.utils.GenomeLocParser; + +/** + * An interface marking that a given Tribble feature/codec is actually dependent on context within the + * reference, rather than having a dependency only on the contig, start, and stop of the given feature. + * A HACK. Tribble should contain all the information in needs to decode the unqualified position of + * a feature. + */ +public interface ReferenceDependentFeatureCodec { + /** + * Sets the appropriate GenomeLocParser, providing additional context when decoding larger and more variable features. + * @param genomeLocParser The parser to supply. 
+ */ + public void setGenomeLocParser(GenomeLocParser genomeLocParser); +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/ReferenceOrderedDatum.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/ReferenceOrderedDatum.java new file mode 100644 index 000000000..daa0a3cbe --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/ReferenceOrderedDatum.java @@ -0,0 +1,66 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata; + +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.HasGenomeLocation; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; + +/** + * Created by IntelliJ IDEA. 
+ * User: mdepristo + * Date: Feb 27, 2009 + * Time: 10:49:47 AM + * To change this template use File | Settings | File Templates. + */ +public interface ReferenceOrderedDatum extends Comparable, HasGenomeLocation { + public String getName(); + public boolean parseLine(final Object header, final String[] parts) throws IOException; + public String toString(); + public String toSimpleString(); + public String repl(); + + /** + * Used by the ROD system to determine how to split input lines + * @return Regex string delimiter separating fields + */ + public String delimiterRegex(); + + public GenomeLoc getLocation(); + public int compareTo( ReferenceOrderedDatum that ); + + /** + * Backdoor hook to read header, meta-data, etc. associated with the file. Will be + * called by the ROD system before streaming starts + * + * @param source source data file on disk from which this rod stream will be pulled + * @return a header object that will be passed to parseLine command + */ + public Object initialize(final File source) throws FileNotFoundException; +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/SeekableRODIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/SeekableRODIterator.java new file mode 100644 index 000000000..9eb4b34e9 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/SeekableRODIterator.java @@ -0,0 +1,412 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this 
permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.util.CloseableIterator; +import org.broadinstitute.gatk.utils.iterators.PushbackIterator; +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.refdata.utils.LocationAwareSeekableRODIterator; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +/** + * Wrapper class for iterators over ROD objects. It is assumed that the underlying iterator can only + * perform standard next() operation, which advances it to the next ROD in the stream (i.e. reads the data file + * line by line). This iterator 1) shifts the focus from record-based traversal to position-based traversal, + * and 2) adds querying seekForward() method. + * + * Namely, this iterator's next() method advances not to the next ROD in the underlying stream, but to the next + * genomic position covered by (at least one) ROD, and returns all RODs overlapping with that position as a RODRecordList + * collection-like object. 
Similarly, when seekForward(interval) is called, this iterator skips all the RODs from the + * underlying stream, until it reaches specified genomic interval, and returns the list of all RODs overlapping with that interval. + * + * NOTE: this iterator has a STATE: next() operation is not allowed after a seekForward() to a non-point (extended) interval + * of length > 1. Such a call would leave the iterator in an inconsistent state. seekForward() can always be called after + * either seekForward() or next() (as long as usual ordering criteria are satisfied: the query interval location can neither + * start before the current position, nor end before the previous query end). seekForward to an interval of length 1 + * reenables next() operation. + * + * Created by IntelliJ IDEA. + * User: asivache + * Date: Sep 10, 2009 + * Time: 6:20:46 PM + * To change this template use File | Settings | File Templates. + */ +public class SeekableRODIterator implements LocationAwareSeekableRODIterator { + /** + * Header for the datasource backing this iterator. + */ + private final Object header; + + /** + * The parser, used to construct new genome locs. + */ + private final GenomeLocParser parser; + + private final SAMSequenceDictionary sequenceDictionary; + + private PushbackIterator it; + List records = null; // here we will keep a pile of records overlapping with current position; when we iterate + // and step out of record's scope, we purge it from the list + String name = null; // name of the ROD track wrapped by this iterator. Will be pulled from underlying iterator. + + int curr_position = 0; // where the iterator is currently positioned on the genome + int max_position = 0; // the rightmost stop position of currently loaded records + String curr_contig = null; // what contig the iterator is currently on + boolean next_is_allowed = true; // see discussion below. next() is illegal after seek-forward queries of length > 1 + + // the stop position of the last query. 
We can query only in forward direction ("seek forward"); + // it is not only the start position of every successive query that can not be before the start + // of the previous one (curr_start), but it is also illegal for a query interval to *end* before + // the end of previous query, otherwise we can end up in an inconsistent state + int curr_query_end = -1; + + // EXAMPLE of inconsistency curr_query_end guards against: + // record 1 record 2 + // ---------- ----------- + // -------------------------------------------------- REF + // ------------------------- query 1 (interval 1) + // ---------- query 2 (interval 2) + // --------------- query 3 + // + // If we query first for interval 1, both record 1 and record 2 will be loaded. + // Query for interval 2, on the other hand, should return only record 1, but after + // query 1 was performed, record 2 is already loaded from the file. If, on the other hand, + // we try to un-load it from memory, we won't be able to read it again. Hence query 2 is not + // allowed after query 1. Note also, that curr_query_end is not equivalent to max_position: + // the latter only tracks where currently loaded records end (and hence helps to re-load records); + // after query 1 is performed, max_position will be the end of record 2, but query 3 is still + // perfectly legal after query 1. + // + // IMPORTANT NOTE: it follows from the above discussion and example that next() is illegal after ANY + // seek-forward query EXCEPT those that are performed with length-1 intervals (queryInterval.start=queryinteval.stop). + // Indeed, in the example above, after, e.g., query 1 is performed, the iterator is "located" at the start + // of interval 1, but record1 and record 2 are already loaded. On the other hand, a subsequent call to next() would + // need to shift iterator's position by 1 base and return only record 1. 
+ // + // This implementation tracks the query history and makes next() illegal after a seekforward query of length > 1, + // but re-enables next() again after a length-1 query. + + public SeekableRODIterator(Object header,SAMSequenceDictionary rodDictionary,SAMSequenceDictionary referenceDictionary,GenomeLocParser parser,CloseableIterator it) { + this.header = header; + this.parser = parser; + this.sequenceDictionary = rodDictionary; + this.it = new PushbackIterator(it); + records = new LinkedList(); + // the following is a trick: we would like the iterator to know the actual name assigned to + // the ROD implementing object we are working with. But the only way to do that is to + // get an instance of that ROD and query it for its name. Now, the only generic way we have at this point to instantiate + // the ROD is to make the underlying stream iterator to do it for us. So we are reading (or rather peeking into) + // the first line of the track data file just to get the ROD object created. + GATKFeature r = null; + if (this.it.hasNext()) r = this.it.element(); + name = (r==null?null:r.getName()); + + curr_contig = referenceDictionary.getSequence(0).getSequenceName(); + } + + /** + * Gets the header associated with the backing input stream. + * @return the ROD header. + */ + @Override + public Object getHeader() { + return header; + } + + /** + * Gets the sequence dictionary associated with the backing input stream. + * @return sequence dictionary from the ROD header. + */ + @Override + public SAMSequenceDictionary getSequenceDictionary() { + return sequenceDictionary; + } + + + /** + * Returns true if the data we iterate over has records associated with (any, not necessarily adjacent) + * genomic position farther along the reference. 
+ * @return + */ + public boolean hasNext() { + + // if we did not walk to the very end of the interval(s) covered by currently loaded + // annotations (records), then we definitely have data for next genomic location + if ( curr_position < max_position ) return true; + + // we are past currently loaded stuff; we have next if there are more lines to load: + return it.hasNext(); + } + + // Returns point location (i.e. genome loc of length 1) on the reference, to which this iterator will advance + // upon next call to next(). + public GenomeLoc peekNextLocation() { + if ( curr_position + 1 <= max_position ) return parser.createGenomeLoc(curr_contig,curr_position+1); + + // sorry, next reference position is not covered by the RODs we are currently holding. In this case, + // the location we will jump to upon next call to next() is the start of the next ROD record that we did + // not read yet: + if ( it.hasNext() ) { + GATKFeature r = it.element(); // peek, do not load! + return parser.createGenomeLoc(r.getLocation().getContig(),r.getLocation().getStart()); + } + return null; // underlying iterator has no more records, there is no next location! + } + + /** Advances iterator to the next genomic position that has ROD record(s) associated with it, + * and returns all the records overlapping with that position as a RODList. The location of the whole + * RODList object will be set to the smallest interval subsuming genomic intervals of all returned records. + * Note that next() is disabled (will throw an exception) after seekForward() operation with query length > 1. + * @return list of all RODs overlapping with the next "covered" genomic position + */ + public RODRecordList next() { + if ( ! 
next_is_allowed ) + throw new ReviewedGATKException("Illegal use of iterator: Can not advance iterator with next() after seek-forward query of length > 1"); + + curr_position++; + // curr_query_end = -1; + + if ( curr_position <= max_position ) { + + // we still have bases covered by at least one currently loaded record; + // we have to purge only subset of records, on which we moved past the end + purgeOutOfScopeRecords(); + } else { + // ooops, we are past the end of all loaded records - kill them all at once, + // load next record and reinitialize by fastforwarding current position to the start of next record + records.clear(); + GATKFeature r = it.next(); // if hasNext() previously returned true, we are guaranteed that this call to it.next() is safe + records.add( r ); + curr_contig = r.getLocation().getContig(); + curr_position = r.getLocation().getStart(); + max_position = r.getLocation().getStop(); + } + + // current position is set and at this point 'records' only keeps those annotations, on which we did not reach the end yet + // (we might have reloaded records completely if it was necessary); but we are not guaranteed yet that we + // hold ALL the records overlapping with the current position. Time to check if we just walked into the interval(s) + // covered by new records, so we need to load them too: + + while ( it.hasNext() ) { + GATKFeature r = it.element(); + if ( r == null ) { + it.next(); + continue; + } + + GenomeLoc currentContig = parser.createOverEntireContig(curr_contig); + GenomeLoc thatContig = r.getLocation(); + + if ( currentContig.isPast(thatContig) ) + throw new UserException("LocationAwareSeekableRODIterator: contig " +r.getLocation().getContig() + + " occurs out of order in track " + r.getName() ); + if ( currentContig.isBefore(thatContig) ) break; // next record is on a higher contig, we do not need it yet... 
+ + if ( r.getLocation().getStart() < curr_position ) + throw new UserException("LocationAwareSeekableRODIterator: track "+r.getName() + + " is out of coordinate order on contig "+r.getLocation() + " compared to " + curr_contig + ":" + curr_position); + + if ( r.getLocation().getStart() > curr_position ) break; // next record starts after the current position; we do not need it yet + + r = it.next(); // we got here only if we do need next record, time to load it for real + + int stop = r.getLocation().getStop(); + if ( stop < curr_position ) throw new ReviewedGATKException("DEBUG: encountered contig that should have been loaded earlier"); // this should never happen + if ( stop > max_position ) max_position = stop; // max_position keeps the rightmost stop position across all loaded records + records.add(r); + } + + // 'records' and current position are fully updated. Last, we need to set the location of the whole track + // (collection of ROD records) to the genomic site we are currently looking at, and return the list + + return new RODRecordListImpl(name,records, parser.createGenomeLoc(curr_contig,curr_position)); + } + + /** + * Removes from the underlying collection the last element returned by the + * iterator (optional operation). This method can be called only once per + * call to next. The behavior of an iterator is unspecified if + * the underlying collection is modified while the iteration is in + * progress in any way other than by calling this method. + * + * @throws UnsupportedOperationException if the remove + * operation is not supported by this Iterator. + * @throws IllegalStateException if the next method has not + * yet been called, or the remove method has already + * been called after the last call to the next + * method. + */ + public void remove() { + throw new UnsupportedOperationException("LocationAwareSeekableRODIterator does not implement remove() operation"); + } + + + /** + * Returns the current "position" (not location!! 
;) ) of this iterator. This method is used by the sharding + * system when it searches for available iterators in the pool that can be reused to resume traversal. + * When iterator is advanced using next(), current position + * is the same as 'location'. However, after a seekForward() query with extended interval, returned position + * will be set to the last position of the query interval, to disable (illegal) attempts to roll the iterator + * back and re-start traversal from current location. + * @return Current ending position of the iterator, or null if no position exists. + */ + public GenomeLoc position() { + if ( curr_contig == null ) return null; + if ( curr_query_end > curr_position ) { + // do not attempt to reuse this iterator if the position we need it for lies before the end of last query performed + return parser.createGenomeLoc(curr_contig,curr_query_end,curr_query_end); + } + else { + return parser.createGenomeLoc(curr_contig,curr_position); + } + } + + /** + * Seeks forward through the file until the specified interval is reached. + * The location object interval can be either a single point or an extended interval. All + * ROD records overlapping with the whole interval will be returned, or null if no such records exist. + * + * Query interval must start at or after the iterator's current location, or exception will be thrown. + * + * Query interval must end at or after the stop position of the previous query, if any, or an exception will + * be thrown: subsequent queries that end before the stop of previous ones are illegal. + * + * If seekForward() is performed to an extended (length > 1 i.e. start != stop) interval, next() operation becomes + * illegal (the iterator changes state). Only seekForward() calls are allowed thereafter, until a seekForward() call + * to a length-1 interval is performed, which re-enables next(). 
seekForward() queries with length-1 intervals can + * always be safely intermixed with next() (as long as ordering is respected and query intervals are at or after the + * current position). + * + * Note that in contrast to + * next() (which always advances current position of the iterator on the reference), this method scrolls + * forward ONLY if the specified interval is ahead of the current location of + * the iterator. However, if called again with the same 'interval' argument as before, seekForward will NOT + * advance, but will simply return the same ROD list as before. + * + * + * @param interval point-like genomic location to fastforward to. + * @return ROD object at (or overlapping with) the specified position, or null if no such ROD exists. + */ + public RODRecordList seekForward(GenomeLoc interval) { + + if ( interval.isBefore(parser.createOverEntireContig(curr_contig)) && + !(interval.getStart() == 0 && interval.getStop() == 0 && interval.getContig().equals(curr_contig)) ) // This criteria is syntactic sugar for 'seek to right before curr_contig' + throw new ReviewedGATKException("Out of order query: query contig "+interval.getContig()+" is located before "+ + "the iterator's current contig"); + if ( interval.getContig().equals(curr_contig) ) { + if ( interval.getStart() < curr_position ) + throw new ReviewedGATKException("Out of order query: query position "+interval +" is located before "+ + "the iterator's current position "+curr_contig + ":" + curr_position); + if ( interval.getStop() < curr_query_end ) + throw new ReviewedGATKException("Unsupported querying sequence: current query interval " + + interval+" ends before the end of previous query interval ("+curr_query_end+")"); + } + + curr_position = interval.getStart(); + curr_query_end = interval.getStop(); + + next_is_allowed = ( curr_position == curr_query_end ); // we can call next() later only if interval length is 1 + + if ( interval.getContig().equals(curr_contig) && curr_position <= 
max_position ) { + // some of the intervals we are currently keeping do overlap with the query interval + + purgeOutOfScopeRecords(); + } else { + // clean up and get ready for fast-forwarding towards the requested position + records.clear(); + max_position = -1; + curr_contig = interval.getContig(); + } + + // curr_contig and curr_position are set to where we asked to scroll to + + while ( it.hasNext() ) { + GATKFeature r = it.next(); + if ( r == null ) continue; + + GenomeLoc currentContig = parser.createOverEntireContig(curr_contig); + GenomeLoc thatContig = r.getLocation(); + + if ( currentContig.isPast(thatContig) ) continue; // did not reach requested contig yet + if ( currentContig.isBefore(thatContig) ) { + it.pushback(r); // next record is on the higher contig, we do not need it yet... + break; + } + + // we get here if we are on the requested contig: + + if ( r.getLocation().getStop() < curr_position ) continue; // did not reach the requested interval yet + + if ( r.getLocation().getStart() > curr_query_end ) { + // past the query interval + it.pushback(r); + break; + } + + // we get here only if interval of the record r overlaps with query interval, so the record should be loaded + if ( r.getLocation().getStop() > max_position ) max_position = r.getLocation().getStop(); + records.add(r); + } + + if ( records.size() > 0 ) { + return new RODRecordListImpl(name,records,interval); + } else { + return null; + } + + } + + /** + * Removes records that end before the curr_position from the list of currently kept records. This is a + * convenience (private) shortcut that does not perform extensive checking. In particular, it assumes that + * curr_position <= max_position, as well as that we are still on the same contig. 
+ */ + private void purgeOutOfScopeRecords() { + Iterator i = records.iterator(); + while ( i.hasNext() ) { + GATKFeature r = i.next(); + if ( r.getLocation().getStop() < curr_position ) { + i.remove(); // we moved past the end of interval the record r is associated with, purge the record forever + } + } + + } + + @Override + public void close() { + if (this.it != null) ((CloseableIterator)this.it.getUnderlyingIterator()).close(); + } + +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java new file mode 100644 index 000000000..d2c11407a --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/VariantContextAdaptors.java @@ -0,0 +1,399 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.refdata; + +import htsjdk.samtools.util.SequenceUtil; +import htsjdk.tribble.Feature; +import htsjdk.tribble.annotation.Strand; +import htsjdk.tribble.dbsnp.OldDbSNPFeature; +import htsjdk.tribble.gelitext.GeliTextFeature; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.classloader.PluginManager; +import org.broadinstitute.gatk.utils.codecs.hapmap.RawHapMapFeature; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; +import htsjdk.variant.variantcontext.*; + +import java.util.*; + +/** + * A terrible but temporary approach to converting objects to VariantContexts. If you want to add a converter, + * you need to create an adaptor object here and register a converter from your class to this object. When tribble arrives, + * we'll use a better approach. + * + * To add a new converter: + * + * create a subclass of VCAdaptor, overloading the convert operator + * add it to the static map from input type -> converter where the input type is the object.class you want to convert + * + * That's it + * + * @author depristo@broadinstitute.org + */ +public class VariantContextAdaptors { + // -------------------------------------------------------------------------------------------------------------- + // + // Generic support routines. 
Do not modify + // + // -------------------------------------------------------------------------------------------------------------- + + private static Map,VCAdaptor> adaptors = new HashMap,VCAdaptor>(); + + static { + PluginManager vcAdaptorManager = new PluginManager(VCAdaptor.class); + List adaptorInstances = vcAdaptorManager.createAllTypes(); + for(VCAdaptor adaptor: adaptorInstances) + adaptors.put(adaptor.getAdaptableFeatureType(),adaptor); + } + + public static boolean canBeConvertedToVariantContext(Object variantContainingObject) { + return adaptors.containsKey(variantContainingObject.getClass()); + } + + /** generic superclass */ + public interface VCAdaptor { + /** + * Gets the type of feature that this adaptor can 'adapt' into a VariantContext. + * @return Type of adaptable feature. Must be a Tribble feature class. + */ + Class getAdaptableFeatureType(); + VariantContext convert(String name, Object input, ReferenceContext ref); + } + + public static VariantContext toVariantContext(String name, Object variantContainingObject, ReferenceContext ref) { + if ( ! adaptors.containsKey(variantContainingObject.getClass()) ) + return null; + else { + return adaptors.get(variantContainingObject.getClass()).convert(name, variantContainingObject, ref); + } + } + + // -------------------------------------------------------------------------------------------------------------- + // + // From here below you can add adaptor classes for new rods (or other types) to convert to VC + // + // -------------------------------------------------------------------------------------------------------------- + private static class VariantContextAdaptor implements VCAdaptor { + /** + * 'Null' adaptor; adapts variant contexts to variant contexts. + * @return VariantContext. 
+ */ + @Override + public Class getAdaptableFeatureType() { return VariantContext.class; } + + // already a VC, just cast and return it + @Override + public VariantContext convert(String name, Object input, ReferenceContext ref) { + return (VariantContext)input; + } + } + + // -------------------------------------------------------------------------------------------------------------- + // + // dbSNP to VariantContext + // + // -------------------------------------------------------------------------------------------------------------- + + private static class DBSnpAdaptor implements VCAdaptor { + private static boolean isSNP(OldDbSNPFeature feature) { + return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact"); + } + + private static boolean isMNP(OldDbSNPFeature feature) { + return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range"); + } + + private static boolean isInsertion(OldDbSNPFeature feature) { + return feature.getVariantType().contains("insertion"); + } + + private static boolean isDeletion(OldDbSNPFeature feature) { + return feature.getVariantType().contains("deletion"); + } + + private static boolean isIndel(OldDbSNPFeature feature) { + return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature); + } + + public static boolean isComplexIndel(OldDbSNPFeature feature) { + return feature.getVariantType().contains("in-del"); + } + + /** + * gets the alternate alleles. This method should return all the alleles present at the location, + * NOT including the reference base. This is returned as a string list with no guarantee ordering + * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest + * frequency). 
+ * + * @return an alternate allele list + */ + public static List getAlternateAlleleList(OldDbSNPFeature feature) { + List ret = new ArrayList(); + for (String allele : getAlleleList(feature)) + if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele); + return ret; + } + + /** + * gets the alleles. This method should return all the alleles present at the location, + * including the reference base. The first allele should always be the reference allele, followed + * by an unordered list of alternate alleles. + * + * @return an alternate allele list + */ + public static List getAlleleList(OldDbSNPFeature feature) { + List alleleList = new ArrayList(); + // add ref first + if ( feature.getStrand() == Strand.POSITIVE ) + alleleList = Arrays.asList(feature.getObserved()); + else + for (String str : feature.getObserved()) + alleleList.add(SequenceUtil.reverseComplement(str)); + if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase()) + && !alleleList.get(0).equals(feature.getNCBIRefBase()) ) + Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0); + + return alleleList; + } + + /** + * Converts non-VCF formatted dbSNP records to VariantContext. + * @return OldDbSNPFeature. 
+ */ + @Override + public Class getAdaptableFeatureType() { return OldDbSNPFeature.class; } + + @Override + public VariantContext convert(String name, Object input, ReferenceContext ref) { + OldDbSNPFeature dbsnp = (OldDbSNPFeature)input; + + int index = dbsnp.getStart() - ref.getWindow().getStart() - 1; + if ( index < 0 ) + return null; // we weren't given enough reference context to create the VariantContext + + final byte refBaseForIndel = ref.getBases()[index]; + final boolean refBaseIsDash = dbsnp.getNCBIRefBase().equals("-"); + + boolean addPaddingBase; + if ( isSNP(dbsnp) || isMNP(dbsnp) ) + addPaddingBase = false; + else if ( isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) + addPaddingBase = refBaseIsDash || GATKVariantContextUtils.requiresPaddingBase(stripNullDashes(getAlleleList(dbsnp))); + else + return null; // can't handle anything else + + Allele refAllele; + if ( refBaseIsDash ) + refAllele = Allele.create(refBaseForIndel, true); + else if ( ! Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) ) + return null; + else + refAllele = Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + dbsnp.getNCBIRefBase(), true); + + final List alleles = new ArrayList(); + alleles.add(refAllele); + + // add all of the alt alleles + for ( String alt : getAlternateAlleleList(dbsnp) ) { + if ( Allele.wouldBeNullAllele(alt.getBytes())) + alt = ""; + else if ( ! Allele.acceptableAlleleBases(alt) ) + return null; + + alleles.add(Allele.create((addPaddingBase ? (char)refBaseForIndel : "") + alt, false)); + } + + final VariantContextBuilder builder = new VariantContextBuilder(); + builder.source(name).id(dbsnp.getRsID()); + builder.loc(dbsnp.getChr(), dbsnp.getStart() - (addPaddingBase ? 1 : 0), dbsnp.getEnd() - (addPaddingBase && refAllele.length() == 1 ? 
1 : 0)); + builder.alleles(alleles); + return builder.make(); + } + + private static List stripNullDashes(final List alleles) { + final List newAlleles = new ArrayList(alleles.size()); + for ( final String allele : alleles ) { + if ( allele.equals("-") ) + newAlleles.add(""); + else + newAlleles.add(allele); + } + return newAlleles; + } + } + + // -------------------------------------------------------------------------------------------------------------- + // + // GELI to VariantContext + // + // -------------------------------------------------------------------------------------------------------------- + + private static class GeliTextAdaptor implements VCAdaptor { + /** + * Converts Geli text records to VariantContext. + * @return GeliTextFeature. + */ + @Override + public Class getAdaptableFeatureType() { return GeliTextFeature.class; } + + /** + * convert to a Variant Context, given: + * @param name the name of the ROD + * @param input the Rod object, in this case a RodGeliText + * @param ref the reference context + * @return a VariantContext object + */ + @Override + public VariantContext convert(String name, Object input, ReferenceContext ref) { + GeliTextFeature geli = (GeliTextFeature)input; + if ( ! Allele.acceptableAlleleBases(String.valueOf(geli.getRefBase())) ) + return null; + Allele refAllele = Allele.create(String.valueOf(geli.getRefBase()), true); + + // make sure we can convert it + if ( geli.getGenotype().isHet() || !geli.getGenotype().containsBase(geli.getRefBase())) { + // add the reference allele + List alleles = new ArrayList(); + List genotypeAlleles = new ArrayList(); + // add all of the alt alleles + for ( char alt : geli.getGenotype().toString().toCharArray() ) { + if ( ! 
Allele.acceptableAlleleBases(String.valueOf(alt)) ) { + return null; + } + Allele allele = Allele.create(String.valueOf(alt), false); + if (!alleles.contains(allele) && !refAllele.basesMatch(allele.getBases())) alleles.add(allele); + + // add the allele, first checking if it's reference or not + if (!refAllele.basesMatch(allele.getBases())) genotypeAlleles.add(allele); + else genotypeAlleles.add(refAllele); + } + + Map attributes = new HashMap(); + Collection genotypes = new ArrayList(); + Genotype call = GenotypeBuilder.create(name, genotypeAlleles); + + // add the call to the genotype list, and then use this list to create a VariantContext + genotypes.add(call); + alleles.add(refAllele); + GenomeLoc loc = ref.getGenomeLocParser().createGenomeLoc(geli.getChr(),geli.getStart()); + return new VariantContextBuilder(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles).genotypes(genotypes).log10PError(-1 * geli.getLODBestToReference()).attributes(attributes).make(); + } else + return null; // can't handle anything else + } + } + + // -------------------------------------------------------------------------------------------------------------- + // + // HapMap to VariantContext + // + // -------------------------------------------------------------------------------------------------------------- + + private static class HapMapAdaptor implements VCAdaptor { + /** + * Converts HapMap records to VariantContext. + * @return HapMapFeature. 
+ */ + @Override + public Class getAdaptableFeatureType() { return RawHapMapFeature.class; } + + /** + * convert to a Variant Context, given: + * @param name the name of the ROD + * @param input the Rod object, in this case a RawHapMapFeature + * @param ref the reference context + * @return a VariantContext object + */ + @Override + public VariantContext convert(String name, Object input, ReferenceContext ref) { + if ( ref == null ) + throw new UnsupportedOperationException("Conversion from HapMap to VariantContext requires a reference context"); + + RawHapMapFeature hapmap = (RawHapMapFeature)input; + + int index = hapmap.getStart() - ref.getWindow().getStart(); + if ( index < 0 ) + return null; // we weren't given enough reference context to create the VariantContext + + HashSet alleles = new HashSet(); + Allele refSNPAllele = Allele.create(ref.getBase(), true); + int deletionLength = -1; + + Map alleleMap = hapmap.getActualAlleles(); + // use the actual alleles, if available + if ( alleleMap != null ) { + alleles.addAll(alleleMap.values()); + Allele deletionAllele = alleleMap.get(RawHapMapFeature.INSERTION); // yes, use insertion here (since we want the reference bases) + if ( deletionAllele != null && deletionAllele.isReference() ) + deletionLength = deletionAllele.length(); + } else { + // add the reference allele for SNPs + alleles.add(refSNPAllele); + } + + // make a mapping from sample to genotype + String[] samples = hapmap.getSampleIDs(); + String[] genotypeStrings = hapmap.getGenotypes(); + + GenotypesContext genotypes = GenotypesContext.create(samples.length); + for ( int i = 0; i < samples.length; i++ ) { + // ignore bad genotypes + if ( genotypeStrings[i].contains("N") ) + continue; + + String a1 = genotypeStrings[i].substring(0,1); + String a2 = genotypeStrings[i].substring(1); + ArrayList myAlleles = new ArrayList(2); + + // use the mapping to actual alleles, if available + if ( alleleMap != null ) { + myAlleles.add(alleleMap.get(a1)); + 
myAlleles.add(alleleMap.get(a2)); + } else { + // ignore indels (which we can't handle without knowing the alleles) + if ( genotypeStrings[i].contains("I") || genotypeStrings[i].contains("D") ) + continue; + + Allele allele1 = Allele.create(a1, refSNPAllele.basesMatch(a1)); + Allele allele2 = Allele.create(a2, refSNPAllele.basesMatch(a2)); + + myAlleles.add(allele1); + myAlleles.add(allele2); + alleles.add(allele1); + alleles.add(allele2); + } + + Genotype g = GenotypeBuilder.create(samples[i], myAlleles); + genotypes.add(g); + } + + long end = hapmap.getEnd(); + if ( deletionLength > 0 ) + end += (deletionLength - 1); + VariantContext vc = new VariantContextBuilder(name, hapmap.getChr(), hapmap.getStart(), end, alleles).id(hapmap.getName()).genotypes(genotypes).make(); + return vc; + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/package-info.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/package-info.java new file mode 100644 index 000000000..bc444b784 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/package-info.java @@ -0,0 +1,26 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata; \ No newline at end of file diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManager.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManager.java new file mode 100644 index 000000000..d69a37476 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManager.java @@ -0,0 +1,280 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata.tracks; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.tribble.Feature; +import htsjdk.tribble.FeatureCodec; +import htsjdk.tribble.NameAwareCodec; +import org.broadinstitute.gatk.utils.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.classloader.PluginManager; +import htsjdk.variant.vcf.AbstractVCFCodec; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.help.GATKDocUtils; + +import java.io.File; +import java.util.*; + + +/** + * Class for managing Tribble Feature readers available to the GATK. The features + * are dynamically determined via a PluginManager. This class provides convenient + * getter methods for obtaining FeatureDescriptor objects that collect all of the + * useful information about the Tribble Codec, Feature, and name in one place. 
+ * + * @author depristo + */ +public class FeatureManager { + public static class FeatureDescriptor implements Comparable { + final String name; + final FeatureCodec codec; + + public FeatureDescriptor(final String name, final FeatureCodec codec) { + this.name = name; + this.codec = codec; + } + + public String getName() { + return name; + } + public String getSimpleFeatureName() { return getFeatureClass().getSimpleName(); } + public FeatureCodec getCodec() { + return codec; + } + public Class getCodecClass() { return codec.getClass(); } + public Class getFeatureClass() { return codec.getFeatureType(); } + + @Override + public String toString() { + return String.format("FeatureDescriptor name=%s codec=%s feature=%s", + getName(), getCodecClass().getName(), getFeatureClass().getName()); + } + + @Override + public int compareTo(FeatureDescriptor o) { + return getName().compareTo(o.getName()); + } + } + + private final PluginManager pluginManager; + private final Collection featureDescriptors = new TreeSet(); + private final boolean lenientVCFProcessing; + + /** + * Construct a FeatureManager without a master VCF header + */ + public FeatureManager() { + this(false); + } + + public FeatureManager(final boolean lenientVCFProcessing) { + this.lenientVCFProcessing = lenientVCFProcessing; + pluginManager = new PluginManager(FeatureCodec.class, "Codecs", "Codec"); + + for (final String rawName: pluginManager.getPluginsByName().keySet()) { + FeatureCodec codec = pluginManager.createByName(rawName); + String name = rawName.toUpperCase(); + FeatureDescriptor featureDescriptor = new FeatureDescriptor(name, codec); + featureDescriptors.add(featureDescriptor); + } + } + + /** + * Return the FeatureDescriptor whose getCodecClass().equals(codecClass). 
+ * + * @param codecClass + * @return A FeatureDescriptor or null if none is found + */ + @Requires("codecClass != null") + public FeatureDescriptor getByCodec(Class codecClass) { + for ( FeatureDescriptor descriptor : featureDescriptors ) + if ( descriptor.getCodecClass().equals(codecClass) ) + return descriptor; + return null; + } + + /** + * Returns a collection of FeatureDescriptors that emit records of type featureClass + * + * @param featureClass + * @return A FeatureDescriptor or null if none is found + */ + @Requires("featureClass != null") + public Collection getByFeature(Class featureClass) { + Set consistentDescriptors = new TreeSet(); + + if (featureClass == null) + throw new IllegalArgumentException("trackRecordType value is null, please pass in an actual class object"); + + for ( FeatureDescriptor descriptor : featureDescriptors ) { + if ( featureClass.isAssignableFrom(descriptor.getFeatureClass())) + consistentDescriptors.add(descriptor); + } + return consistentDescriptors; + } + + /** + * Return the FeatureDescriptor with getID().equals(name) + * + * @param name + * @return A FeatureDescriptor or null if none is found + */ + @Requires("name != null") + public FeatureDescriptor getByName(String name) { + for ( FeatureDescriptor descriptor : featureDescriptors ) + if ( descriptor.getName().equalsIgnoreCase(name) ) + return descriptor; + return null; + } + + /** + * Returns the FeatureDescriptor that can read the contents of File file, if one can be determined + * + * @param file + * @return A FeatureDescriptor or null if none is found + */ + @Requires({"file != null", "file.isFile()", "file.canRead()"}) + public FeatureDescriptor getByFiletype(File file) { + List canParse = new ArrayList(); + for ( FeatureDescriptor descriptor : featureDescriptors ) + if ( descriptor.getCodec().canDecode(file.getPath()) ) { + canParse.add(descriptor); + } + + if ( canParse.size() == 0 ) + return null; + else if ( canParse.size() > 1 ) + throw new 
ReviewedGATKException("BUG: multiple feature descriptors can read file " + file + ": " + canParse); + else + return canParse.get(0); + } + + /** + * Returns the FeatureDescriptor associated with the type described by triplet, or null if none is found + * @param triplet + * @return + */ + @Requires("triplet != null") + public FeatureDescriptor getByTriplet(RMDTriplet triplet) { + return getByName(triplet.getType()); + } + + /** + * @return all of the FeatureDescriptors available to the GATK. Never null + */ + @Ensures("result != null") + public Collection getFeatureDescriptors() { + return Collections.unmodifiableCollection(featureDescriptors); + } + + + /** + * Returns a list of the available tribble track names (vcf,dbsnp,etc) that we can load + * @return + */ + @Ensures("result != null") + public String userFriendlyListOfAvailableFeatures() { + return userFriendlyListOfAvailableFeatures(Feature.class); + } + + /** + * Returns a list of the available tribble track names (vcf,dbsnp,etc) that we can load + * restricted to only Codecs producing Features consistent with the requiredFeatureType + * @return + */ + @Ensures("result != null") + public String userFriendlyListOfAvailableFeatures(Class requiredFeatureType) { + final String nameHeader="Name", featureHeader = "FeatureType", docHeader="Documentation"; + + int maxNameLen = nameHeader.length(), maxFeatureNameLen = featureHeader.length(); + for ( final FeatureDescriptor descriptor : featureDescriptors ) { + if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) { + maxNameLen = Math.max(maxNameLen, descriptor.getName().length()); + maxFeatureNameLen = Math.max(maxFeatureNameLen, descriptor.getSimpleFeatureName().length()); + } + } + + StringBuilder docs = new StringBuilder(); + String format = "%" + maxNameLen + "s %" + maxFeatureNameLen + "s %s%n"; + docs.append(String.format(format, nameHeader, featureHeader, docHeader)); + for ( final FeatureDescriptor descriptor : featureDescriptors ) { + 
if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) { + final String DocURL = GATKDocUtils.helpLinksToGATKDocs(descriptor.getCodecClass()); + final String oneDoc; + if ( DocURL.contains("_sting_") ) { + oneDoc = String.format(format, + descriptor.getName(), + descriptor.getSimpleFeatureName(), + DocURL); + } else { + oneDoc = String.format(format, + descriptor.getName(), + descriptor.getSimpleFeatureName(), + "(this is an external codec and is not documented within GATK)"); + } + + docs.append(oneDoc); + } + } + + return docs.toString(); + } + + /** + * Create a new FeatureCodec of the type described in descriptor, assigning it the + * name (if possible) and providing it the genomeLocParser (where necessary) + * + * @param descriptor FeatureDescriptor of the Tribble FeatureCodec we want to create + * @param name the name to assign this codec + * @param genomeLocParser GenomeLocParser for ReferenceDependentFeatureCodecs + * @param remappedSampleName replacement sample name for single-sample vcfs, or null if we're not performing + * sample name remapping + * @return the feature codec itself + */ + @Requires({"descriptor != null", "name != null", "genomeLocParser != null"}) + @Ensures("result != null") + public FeatureCodec createCodec(final FeatureDescriptor descriptor, final String name, final GenomeLocParser genomeLocParser, + final String remappedSampleName) { + FeatureCodec codex = pluginManager.createByType(descriptor.getCodecClass()); + if ( codex instanceof NameAwareCodec ) + ((NameAwareCodec)codex).setName(name); + if ( codex instanceof ReferenceDependentFeatureCodec ) + ((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser); + if ( codex instanceof AbstractVCFCodec ) { + if ( lenientVCFProcessing ) { + ((AbstractVCFCodec)codex).disableOnTheFlyModifications(); + } + if ( remappedSampleName != null ) { + ((AbstractVCFCodec)codex).setRemappedSampleName(remappedSampleName); + } + } + + return codex; + } +} diff --git 
a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/IndexDictionaryUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/IndexDictionaryUtils.java new file mode 100644 index 000000000..a0473c8a4 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/IndexDictionaryUtils.java @@ -0,0 +1,114 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.refdata.tracks; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.apache.log4j.Logger; +import htsjdk.tribble.index.Index; +import htsjdk.tribble.index.MutableIndex; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.utils.SequenceDictionaryUtils; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +/** + * Utilities for working with Sequence Dictionaries embedded in tribble indices + * + * @author Your Name + * @since Date created + */ +public class IndexDictionaryUtils { + private final static Logger logger = Logger.getLogger(IndexDictionaryUtils.class); + + // a constant we use for marking sequence dictionary entries in the Tribble index property list + public static final String SequenceDictionaryPropertyPredicate = "DICT:"; + + /** + * get the sequence dictionary from the track, if available. If not, make it from the contig list that is always in the index + * @param index the index file to use + * @return a SAMSequenceDictionary if available, null if unavailable + */ + public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + for (Map.Entry entry : index.getProperties().entrySet()) { + if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) + dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), + Integer.valueOf(entry.getValue()))); + } + return dict; + } + + /** + * create the sequence dictionary with the contig list; a backup approach + * @param index the index file to use + * @param dict the sequence dictionary to add contigs to + * @return the filled-in sequence dictionary + */ + static SAMSequenceDictionary createSequenceDictionaryFromContigList(final Index index, final 
SAMSequenceDictionary dict) { + final List seqNames = index.getSequenceNames(); + if (seqNames == null) { + return dict; + } + for (final String name : seqNames) { + SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); + dict.addSequence(seq); + } + return dict; + } + + /** + * Sets the sequence dictionary of the given index. THE INDEX MUST BE MUTABLE (i.e. not Tabix). + * + * @param index the (mutable) index file to use + * @param dict the dictionary to use + */ + public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { + for ( SAMSequenceRecord seq : dict.getSequences() ) { + final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName(); + final String length = String.valueOf(seq.getSequenceLength()); + ((MutableIndex)index).addProperty(contig, length); + } + } + + public static void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict, + final ValidationExclusion.TYPE validationExclusionType ) { + // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation + if (trackDict == null || trackDict.size() == 0) + logger.warn("Track " + trackName + " doesn't have a sequence dictionary built in, skipping dictionary validation"); + else { + Set trackSequences = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) + trackSequences.add(dictionaryEntry.getSequenceName()); + SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict, false, null); + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrack.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrack.java new file mode 100644 index 000000000..ef8b27dcc --- /dev/null +++ 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrack.java @@ -0,0 +1,147 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata.tracks; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.util.CloseableIterator; +import org.apache.log4j.Logger; +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.CloseableTribbleIterator; +import htsjdk.tribble.Feature; +import htsjdk.tribble.FeatureCodec; +import org.broadinstitute.gatk.utils.refdata.utils.FeatureToGATKFeatureIterator; +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.File; +import java.io.IOException; + + +/** + * @author aaron + *

+ * Class RMDTrack + *

+ * the basics of what a reference metadata track must contain. + */ +public class RMDTrack { + private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); + + // the basics of a track: + private final Class type; // our type + private final String name; // the name + private final File file; // the associated file we create the reader from + + // our feature reader - allows queries + private AbstractFeatureReader reader; + + // our sequence dictionary, which can be null + private final SAMSequenceDictionary dictionary; + + /** + * Parser to use when creating/parsing GenomeLocs. + */ + private final GenomeLocParser genomeLocParser; + + // our codec type + private final FeatureCodec codec; + + public Class getType() { + return type; + } + + public String getName() { + return name; + } + + public File getFile() { + return file; + } + + /** + * Create a track + * + * @param type the type of track, used for track lookup + * @param name the name of this specific track + * @param file the associated file, for reference or recreating the reader + * @param reader the feature reader to use as the underlying data source + * @param dict the sam sequence dictionary + * @param codec the feature codec we use to decode this type + */ + public RMDTrack(Class type, String name, File file, AbstractFeatureReader reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) { + this.type = type; + this.name = name; + this.file = file; + this.reader = reader; + this.dictionary = dict; + this.genomeLocParser = genomeLocParser; + this.codec = codec; + } + + /** + * @return how to get an iterator of the underlying data. 
This is all a track has to support, + * but other more advanced tracks support the query interface + */ + public CloseableIterator getIterator() { + try { + return new FeatureToGATKFeatureIterator(genomeLocParser,reader.iterator(),this.getName()); + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(getFile(), "Unable to read from file", e); + } + } + + public CloseableIterator query(GenomeLoc interval) throws IOException { + CloseableTribbleIterator iter = reader.query(interval.getContig(),interval.getStart(),interval.getStop()); + return new FeatureToGATKFeatureIterator(genomeLocParser, iter, this.getName()); + } + + public void close() { + try { + reader.close(); + } catch (IOException e) { + throw new UserException.MalformedFile("Unable to close reader " + reader.toString(),e); + } + reader = null; + } + + /** + * get the sequence dictionary from the track, if available + * @return a SAMSequenceDictionary if available, null if unavailable + */ + public SAMSequenceDictionary getSequenceDictionary() { + return dictionary; + } + + public Object getHeader() { + return reader.getHeader(); + } + + public FeatureCodec getCodec() { + return codec; + } +} \ No newline at end of file diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilder.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilder.java new file mode 100644 index 000000000..86a561ade --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilder.java @@ -0,0 +1,429 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of 
the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata.tracks; + +import htsjdk.samtools.SAMSequenceDictionary; +import org.apache.log4j.Logger; +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.FeatureCodec; +import htsjdk.tribble.Tribble; +import htsjdk.tribble.TribbleException; +import htsjdk.tribble.index.Index; +import htsjdk.tribble.index.IndexFactory; +import htsjdk.tribble.util.LittleEndianOutputStream; +import org.broadinstitute.gatk.utils.commandline.ArgumentTypeDescriptor; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.ValidationExclusion; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet.RMDStorageType; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.file.FSLockWithShared; +import org.broadinstitute.gatk.utils.instrumentation.Sizeof; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Map; + + +/** 
+ * + * @author aaron + * ` + * Class RMDTrackBuilder + * + * This class keeps track of the available codecs, and knows how to put together a track of + * that gets iterators from the FeatureReader using Tribble. + * + */ +public class RMDTrackBuilder { // extends PluginManager { + /** + * our log, which we use to capture anything from this class + */ + private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); + + // private sequence dictionary we use to set our tracks with + private final SAMSequenceDictionary dict; + + /** + * Private genome loc parser to use when building out new locs. + */ + private final GenomeLocParser genomeLocParser; + + /** + * Validation exclusions, for validating the sequence dictionary. + */ + private ValidationExclusion.TYPE validationExclusionType; + + private final FeatureManager featureManager; + + // If true, do not attempt to create index files if they don't exist or are outdated, and don't + // make any file lock acquisition calls on the index files. + private final boolean disableAutoIndexCreation; + + // Map of file name -> new sample name used when performing on-the-fly sample renaming + private final Map sampleRenameMap; + + /** + * Construct an RMDTrackerBuilder, allowing the user to define tracks to build after-the-fact. This is generally + * used when walkers want to directly manage the ROD system for whatever reason. Before using this constructor, + * please talk through your approach with the SE team. + * @param dict Sequence dictionary to use. + * @param genomeLocParser Location parser to use. + * @param validationExclusionType Types of validations to exclude, for sequence dictionary verification. + * @param disableAutoIndexCreation Do not auto-create index files, and do not use file locking when accessing index files. + * UNSAFE in general (because it causes us not to lock index files before reading them) -- + * suitable only for test suite use. 
+ * @param sampleRenameMap Map of file name -> new sample name used when performing on-the-fly sample renaming + */ + public RMDTrackBuilder(final SAMSequenceDictionary dict, + final GenomeLocParser genomeLocParser, + final ValidationExclusion.TYPE validationExclusionType, + final boolean disableAutoIndexCreation, + final Map sampleRenameMap) { + this.dict = dict; + this.validationExclusionType = validationExclusionType; + this.genomeLocParser = genomeLocParser; + this.featureManager = new FeatureManager(ValidationExclusion.lenientVCFProcessing(validationExclusionType)); + this.disableAutoIndexCreation = disableAutoIndexCreation; + this.sampleRenameMap = sampleRenameMap; + } + + /** + * Return the feature manager this RMDTrackBuilder is using to create tribble tracks + * + * @return the FeatureManager used by this builder to create tribble tracks + */ + public FeatureManager getFeatureManager() { + return featureManager; + } + + /** + * create a RMDTrack of the specified type + * + * @param fileDescriptor a description of the type of track to build. 
+ * + * @return an instance of the track + */ + public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { + String name = fileDescriptor.getName(); + File inputFile = new File(fileDescriptor.getFile()); + + FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByTriplet(fileDescriptor); + if (descriptor == null) + throw new UserException.BadArgumentValue("-B",fileDescriptor.getType()); + + // return a feature reader track + Pair pair; + if (ArgumentTypeDescriptor.isCompressed(inputFile.toString())) + pair = createTabixIndexedFeatureSource(descriptor, name, inputFile); + else + pair = getFeatureSource(descriptor, name, inputFile, fileDescriptor.getStorageType()); + if (pair == null) throw new UserException.CouldNotReadInputFile(inputFile, "Unable to make the feature reader for input file"); + return new RMDTrack(descriptor.getCodecClass(), name, inputFile, pair.first, pair.second, genomeLocParser, createCodec(descriptor, name, inputFile)); + } + + /** + * Convenience method simplifying track creation. Assume unnamed track based on a file rather than a stream. + * @param codecClass Type of Tribble codec class to build. + * @param inputFile Input file type to use. + * @return An RMDTrack, suitable for accessing reference metadata. + */ + public RMDTrack createInstanceOfTrack(Class codecClass, File inputFile) { + final FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByCodec(codecClass); + + if (descriptor == null) + throw new ReviewedGATKException("Unable to find type name for codec class " + codecClass.getName()); + + return createInstanceOfTrack(new RMDTriplet("anonymous",descriptor.getName(),inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags())); + } + + /** + * create a feature reader, without assuming there exists an index. This code assumes the feature + * reader of the appropriate type will figure out what the right index type is, and determine if it + * exists. 
+ * + * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create + * @param name the name of the track + * @param inputFile the file to load + * @return a feature reader implementation + */ + private Pair createTabixIndexedFeatureSource(FeatureManager.FeatureDescriptor descriptor, String name, File inputFile) { + // we might not know the index type, try loading with the default reader constructor + logger.debug("Attempting to load " + inputFile + " as a tabix indexed file without validating it"); + try { + // getFeatureReader will detect that it's Tabix + return new Pair<>(AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name, inputFile)), null); + } catch (TribbleException e) { + throw new UserException(e.getMessage(), e); + } + } + + /** + * add a name to the codec, if it takes one + * @param descriptor the class to create a codec for + * @param name the name to assign this codec + * @param inputFile input file that we will be decoding + * @return the feature codec itself + */ + private FeatureCodec createCodec(final FeatureManager.FeatureDescriptor descriptor, final String name, final File inputFile) { + // The remappedSampleName will be null if either no on-the-fly sample renaming was requested, + // or the user's sample rename map file didn't contain an entry for this file: + final String remappedSampleName = sampleRenameMap != null ? sampleRenameMap.get(inputFile.getAbsolutePath()) : null; + + return featureManager.createCodec(descriptor, name, genomeLocParser, remappedSampleName); + } + + /** + * create a feature source object given: + * @param descriptor the FeatureDescriptor describing the FeatureCodec we want to create + * @param name the name of the codec + * @param inputFile the tribble file to parse + * @param storageType How the RMD is streamed into the input file. 
+ * @return the input file as a FeatureReader + */ + private Pair getFeatureSource(FeatureManager.FeatureDescriptor descriptor, + String name, + File inputFile, + RMDStorageType storageType) { + // Feature source and sequence dictionary to use as the ultimate reference + AbstractFeatureReader featureSource = null; + SAMSequenceDictionary sequenceDictionary = null; + + // Detect whether or not this source should be indexed. + boolean canBeIndexed = (storageType == RMDStorageType.FILE); + + if(canBeIndexed) { + try { + Index index = loadIndex(inputFile, createCodec(descriptor, name, inputFile)); + try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } + catch (ReviewedGATKException e) { } + + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); + + // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match + if (sequenceDictionary.size() == 0 && dict != null) { + validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); + + if ( ! 
disableAutoIndexCreation ) { + File indexFile = Tribble.indexFile(inputFile); + try { // re-write the index + writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); + } catch (IOException e) { + logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not affect your run of the GATK"); + } + } + + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); + } + + featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name, inputFile), index); + } + catch (TribbleException e) { + throw new UserException(e.getMessage()); + } + catch (IOException e) { + throw new UserException("I/O error loading or writing tribble index file for " + inputFile.getAbsolutePath(), e); + } + } + else { + featureSource = AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name, inputFile), false); + } + + return new Pair(featureSource,sequenceDictionary); + } + + /** + * create an index for the input file + * @param inputFile the input file + * @param codec the codec to use + * @return a linear index for the specified type + * @throws IOException if we cannot write the index file + */ + public synchronized Index loadIndex( final File inputFile, final FeatureCodec codec) throws IOException { + final File indexFile = Tribble.indexFile(inputFile); + final FSLockWithShared lock = new FSLockWithShared(indexFile); + Index idx = null; + + // If the index file exists and is readable, attempt to load it from disk. We'll get null back + // if a problem was discovered with the index file when it was inspected, and we'll get an + // in-memory index back in the case where the index file could not be locked. + if (indexFile.canRead()) { + idx = disableAutoIndexCreation ? 
loadFromDisk(inputFile, indexFile) // load without locking if we're in disableAutoIndexCreation mode + : attemptToLockAndLoadIndexFromDisk(inputFile, codec, indexFile, lock); + } + + // If we have an index, it means we either loaded it from disk without issue or we created an in-memory + // index due to not being able to acquire a lock. + if (idx != null) return idx; + + // We couldn't read the file, or we discovered a problem with the index file, so continue on to making a new index + idx = createIndexInMemory(inputFile, codec); + if ( ! disableAutoIndexCreation ) { + writeIndexToDisk(idx, indexFile, lock); + } + return idx; + } + + /** + * Attempt to acquire a shared lock and then load the index from disk. Returns an in-memory index if + * a lock could not be obtained. Returns null if a problem was discovered with the index file when it + * was examined (eg., it was out-of-date). + * + * @param inputFile the input file + * @param codec the codec to read from + * @param indexFile the index file itself + * @param lock the lock file + * @return an index, or null if we couldn't load one + * @throws IOException if we fail for FS issues + */ + protected Index attemptToLockAndLoadIndexFromDisk( final File inputFile, final FeatureCodec codec, final File indexFile, final FSLockWithShared lock ) throws IOException { + boolean locked = false; + Index idx = null; + + try { + locked = lock.sharedLock(); + + if ( ! 
locked ) { // can't lock file + logger.info(String.format("Could not acquire a shared lock on index file %s, falling back to using an in-memory index for this GATK run.", + indexFile.getAbsolutePath())); + idx = createIndexInMemory(inputFile, codec); + } + else { + idx = loadFromDisk(inputFile, indexFile); + } + } finally { + if (locked) lock.unlock(); + } + return idx; + } + + /** + * load the index from disk, checking for out of date indexes and old versions (both of which are deleted) + * @param inputFile the input file + * @param indexFile the input file, plus the index extension + * @return an Index, or null if we're unable to load + */ + protected Index loadFromDisk( final File inputFile, final File indexFile ) { + logger.debug("Loading Tribble index from disk for file " + inputFile); + Index index = IndexFactory.loadIndex(indexFile.getAbsolutePath()); + + // check if the file is up-to date (filestamp and version check) + if (index.isCurrentVersion() && indexFile.lastModified() >= inputFile.lastModified()) + return index; + else if (indexFile.lastModified() < inputFile.lastModified()) + logger.warn("Index file " + indexFile + " is out of date (index older than input file), " + + (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); + else // we've loaded an old version of the index, we want to remove it <-- currently not used, but may re-enable + logger.warn("Index file " + indexFile + " is out of date (old version), " + + (disableAutoIndexCreation ? "falling back to an in-memory index" : "deleting and updating the index file")); + + if ( ! 
disableAutoIndexCreation ) { + boolean deleted = indexFile.delete(); + if (!deleted) logger.warn("Index file " + indexFile + " is out of date, but could not be removed; it will not be trusted (we'll try to rebuild an in-memory copy)"); + } + + return null; + } + + + /** + * attempt to write the index to disk + * @param index the index to write to disk + * @param indexFile the index file location + * @param lock the locking object + * @throws IOException when unable to create the new index + */ + private void writeIndexToDisk( final Index index, final File indexFile, final FSLockWithShared lock ) throws IOException { + if ( disableAutoIndexCreation ) { + return; + } + + boolean locked = false; + + try { + locked = lock.exclusiveLock(); + + if (locked) { + logger.info("Writing Tribble index to disk for file " + indexFile); + LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(indexFile)); + index.write(stream); + stream.close(); + } + else // we can't write it to disk, just store it in memory, tell them this + logger.warn("Unable to write to " + indexFile + " for the index file, creating index in memory only"); + + try { logger.info(String.format(" Index for %s has size in bytes %d", indexFile, Sizeof.getObjectGraphSize(index))); } + catch ( ReviewedGATKException e) { } + } + finally { + if (locked) lock.unlock(); + } + + } + + /** + * create the index in memory, given the input file and feature codec + * @param inputFile the input file + * @param codec the codec + * @return a LinearIndex, given the file location + * @throws IOException when unable to create the index in memory + */ + protected Index createIndexInMemory(File inputFile, FeatureCodec codec) { + // this can take a while, let them know what we're doing + logger.debug("Creating Tribble index in memory for file " + inputFile); + Index idx = IndexFactory.createDynamicIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + 
validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); + return idx; + } + + /** + * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. + * (that each contig in the index is in the sequence dictionary). + * @param inputFile for proper error message formatting. + * @param dict the sequence dictionary + * @param index the index file + */ + public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) { + if (dict == null) throw new ReviewedGATKException("BUG: dict cannot be null"); + + // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set + final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); + validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict); + + // actually update the dictionary in the index + IndexDictionaryUtils.setIndexSequenceDictionary(index, dict); + } + + public void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict ) { + IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIterator.java new file mode 100644 index 000000000..013a6c2ad --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIterator.java @@ -0,0 +1,74 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files 
(the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata.utils; + +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.tribble.CloseableTribbleIterator; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.GenomeLocParser; + + +/** + * + * @author aaron + * + * Class FeatureToGATKFeatureIterator + * + * a wrapper on Tribble feature iterators so that they produce GATKFeatures (which produce GenomeLocs) + */ +public class FeatureToGATKFeatureIterator implements CloseableIterator { + private final GenomeLocParser genomeLocParser; + private final CloseableTribbleIterator iterator; + private final String name; + + public FeatureToGATKFeatureIterator(GenomeLocParser genomeLocParser,CloseableTribbleIterator iter, String name) { + this.genomeLocParser = genomeLocParser; + this.name = name; + this.iterator = iter; + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public GATKFeature next() { + return new GATKFeature.TribbleGATKFeature(genomeLocParser,iterator.next(),name); + } + + 
@Override + public void remove() { + throw new UnsupportedOperationException("Why does Iterator have this method? We always throw an exception here"); + } + + @Override + public void close() { + // The private adapted iterator may not be passed on by the method constructing this object, + // leaving only this adapter to close the wrapped iterator. + iterator.close(); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/FlashBackIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/FlashBackIterator.java new file mode 100644 index 000000000..73ebf3cc8 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/FlashBackIterator.java @@ -0,0 +1,221 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.refdata.utils; + +import htsjdk.samtools.SAMSequenceDictionary; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.HasGenomeLocation; + +import java.util.Comparator; +import java.util.LinkedList; + + +/** + * + * @author aaron + * + * Class FlashBackIterator + * + * better than acid washed jeans...more like a Delorean that flies through time + * + * This iterator buffers a certain amount of ROD data to 'flash back' to. This + * is needed for using ROD's in read traversals, because between shards we sometimes + * (actually often) need to go back to before the current iterators location and + * get RODs that overlap the current read. + */ +public class FlashBackIterator implements LocationAwareSeekableRODIterator { + private LocationAwareSeekableRODIterator iterator; + private LinkedList pastQueue = new LinkedList(); + private LinkedList aheadQueue = new LinkedList(); + private int MAX_QUEUE = 200; + + /** + * create a flashback iterator + * @param iterator given a LocationAwareSeekableRODIterator + */ + public FlashBackIterator(LocationAwareSeekableRODIterator iterator) { + this.iterator = iterator; + } + + /** + * Gets the header associated with the backing input stream. + * @return the ROD header. + */ + @Override + public Object getHeader() { + return iterator.getHeader(); + } + + /** + * Gets the sequence dictionary associated with the backing input stream. + * @return sequence dictionary from the ROD header. + */ + @Override + public SAMSequenceDictionary getSequenceDictionary() { + return iterator.getSequenceDictionary(); + } + + + /** + * peek at the next location + * @return + */ + @Override + public GenomeLoc peekNextLocation() { + return (aheadQueue.size() > 0) ? aheadQueue.getFirst().getLocation() : iterator.peekNextLocation(); + } + + /** + * get the position of this iterator + * @return + */ + @Override + public GenomeLoc position() { + return (aheadQueue.size() > 0) ? 
aheadQueue.getFirst().getLocation() : iterator.position(); + } + + /** + * seek forward on the iterator + * @param interval the interval to seek to + * @return a RODRecordList at that location, null otherwise + */ + @Override + public RODRecordList seekForward(GenomeLoc interval) { + + RODRecordList lt = iterator.seekForward(interval); + createPastRecord(lt); + return lt; + } + + /** + * do we have a next record + * @return true if we have another record + */ + @Override + public boolean hasNext() { + return (aheadQueue.size() > 0 || iterator.hasNext()); + } + + /** + * get the next record + * @return a RODRecordList + */ + @Override + public RODRecordList next() { + return getNext(); + } + + /** + * we don't support remove + */ + @Override + public void remove() { + throw new UnsupportedOperationException("We don't support remove"); + } + + /** + * get the next record, either from the queue or from the iterator + * @return a RODRecordList + */ + private RODRecordList getNext() { + if (aheadQueue.size() > 0) { + RODRecordList ret = aheadQueue.getFirst().getList(); + aheadQueue.removeFirst(); + return ret; + } else { + RODRecordList ret = iterator.next(); + createPastRecord(ret); + return ret; + } + } + + private void createPastRecord(RODRecordList ret) { + ComparableList rec = new ComparableList(ret); + if (rec.getLocation() != null) pastQueue.addLast(new ComparableList(ret)); + if (pastQueue.size() > this.MAX_QUEUE) pastQueue.removeFirst(); + } + + /** + * can we flash back to the specified location? + * + * @param location the location to try and flash back to + * + * @return true if we can, false otherwise + */ + public boolean canFlashBackTo(GenomeLoc location) { + GenomeLoc farthestBack = (pastQueue.size() > 0) ? pastQueue.getFirst().getLocation() : iterator.peekNextLocation(); + return (!farthestBack.isPast(location)); + } + + /** + * flashback! 
Throws an unsupported operation exception + * + * @param location where to flash back to + */ + public void flashBackTo(GenomeLoc location) { + if (!canFlashBackTo(location)) throw new UnsupportedOperationException("we can't flash back to " + location); + if (pastQueue.size()==0) return; // the iterator can do it alone + while (pastQueue.size() > 0 && !pastQueue.getLast().getLocation().isBefore(location)) { + aheadQueue.addFirst(pastQueue.getLast()); + pastQueue.removeLast(); + } + } + + public void close() { + this.aheadQueue.clear(); + this.pastQueue.clear(); + } +} + +/** + * a list that buffers the location for this rod + */ +class ComparableList implements Comparator, HasGenomeLocation { + private RODRecordList list; + private GenomeLoc location = null; + public ComparableList(RODRecordList list) { + this.list = list; + if (list != null && list.size() != 0) + location = list.getLocation(); + } + + @Override + public int compare(ComparableList list1, ComparableList list2) { + if (list1.location == null && list2.location == null) + return 0; + if (list1.location == null) return 1; + if (list2.location == null) return -1; + return (list1.location.compareTo(list2.location)); + } + + public GenomeLoc getLocation() { + return location; + } + + public RODRecordList getList() { + return list; + } +} \ No newline at end of file diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/GATKFeature.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/GATKFeature.java new file mode 100644 index 000000000..e638ab48e --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/GATKFeature.java @@ -0,0 +1,108 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without 
limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata.utils; + +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.HasGenomeLocation; + + +/** + * + * @author aaron + * + * Class GATKFeature + * + * This wraps a Tribble feature or a RODatum so that both present the same interface: a genome loc for position and a + * way of retrieving the track name. 
+ */ +public abstract class GATKFeature implements Feature, HasGenomeLocation { + + public GATKFeature(String name) { + this.name = name; + } + + String name; + + protected void setName(String name) { + this.name = name; + } + + public String getName() { + return name; + } + + public abstract GenomeLoc getLocation(); + + // TODO: this should be a Feature + public abstract Object getUnderlyingObject(); + + /** + * wrapping a Tribble feature in a GATK friendly interface + */ + public static class TribbleGATKFeature extends GATKFeature { + private final GenomeLocParser genomeLocParser; + private final Feature feature; + private GenomeLoc position = null; + + public TribbleGATKFeature(GenomeLocParser genomeLocParser,Feature f, String name) { + super(name); + this.genomeLocParser = genomeLocParser; + feature = f; + } + public GenomeLoc getLocation() { + if (position == null) position = genomeLocParser.createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd()); + return position; + } + + /** Return the features reference sequence name, e.g chromosome or contig */ + @Override + public String getChr() { + return feature.getChr(); + } + + /** Return the start position in 1-based coordinates (first base is 1) */ + @Override + public int getStart() { + return feature.getStart(); + } + + /** + * Return the end position following 1-based fully closed conventions. 
The length of a feature is + * end - start + 1; + */ + @Override + public int getEnd() { + return feature.getEnd(); + } + + // TODO: this should be a Feature, actually + public Object getUnderlyingObject() { + return feature; + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/LocationAwareSeekableRODIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/LocationAwareSeekableRODIterator.java new file mode 100644 index 000000000..42fa9ffb1 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/LocationAwareSeekableRODIterator.java @@ -0,0 +1,49 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.refdata.utils; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.util.CloseableIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; + +/** + * @author aaron + *

+ * Interface LocationAwareSeekableRODIterator + *

+ * combine iteration with a position aware interface + */ +public interface LocationAwareSeekableRODIterator extends CloseableIterator { + public Object getHeader(); + + public SAMSequenceDictionary getSequenceDictionary(); + + public GenomeLoc peekNextLocation(); + + public GenomeLoc position(); + + public RODRecordList seekForward(GenomeLoc interval); +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/RMDTriplet.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/RMDTriplet.java new file mode 100644 index 000000000..3c79fc5fd --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/RMDTriplet.java @@ -0,0 +1,92 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.refdata.utils; + + +import org.broadinstitute.gatk.utils.commandline.Tags; + +/** + * a helper class to manage our triplets of data for the -B command line option (name, type, file) + * TODO: The presence of four datapoints here suggests that this class' name isn't sufficient to describe its function. Rename. + */ +public class RMDTriplet { + public enum RMDStorageType { FILE, STREAM }; + + private final String name; + private final String type; + private final String file; + private final RMDStorageType storageType; + private final Tags tags; + + public RMDTriplet(final String name, final String type, final String file, final RMDStorageType storageType, final Tags tags) { + this.name = name; + this.type = type; + this.file = file; + this.storageType = storageType; + this.tags = tags; + } + + /** + * Gets the name of this track. RefMetaDataTrackers can use this identifier to retrieve data of a certain type. + * @return Name associated with this track. + */ + public String getName() { + return name; + } + + /** + * Gets the type of this track. Informs the GATK how to parse this file type. + * @return Type associated with this track. + */ + public String getType() { + return type; + } + + /** + * Gets the filename representing this track. Data is loaded from this file. + * @return Filename of the RMD. + */ + public String getFile() { + return file; + } + + /** + * The type of storage being used for this metadata track. Right now, can be either a + * file type (can be indexed) or a stream type (can't be indexed). + * @return Storage type for this RMD 'triplet'. + */ + public RMDStorageType getStorageType() { + return storageType; + } + + /** + * Gets the key=value tags associated with this track + * @return Tags associated with this track. 
+ */ + public Tags getTags() { + return tags; + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/RODRecordList.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/RODRecordList.java new file mode 100644 index 000000000..025835275 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/refdata/utils/RODRecordList.java @@ -0,0 +1,45 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata.utils; + +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.HasGenomeLocation; + +import java.util.List; + + +/** + * @author aaron + *

+ * Class RODRecordList + *

+ * make the RODRecord list an interface, so we can stub in other implementations + * during testing. + */ +public interface RODRecordList extends List, Comparable, HasGenomeLocation { + public GenomeLoc getLocation(); + public String getName(); +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReport.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReport.java new file mode 100644 index 000000000..056581351 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReport.java @@ -0,0 +1,376 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.report; + +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.*; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +/** + * Container class for GATK report tables + */ +public class GATKReport { + public static final String GATKREPORT_HEADER_PREFIX = "#:GATKReport."; + public static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V1_1; + private static final String SEPARATOR = ":"; + private GATKReportVersion version = LATEST_REPORT_VERSION; + + private final TreeMap tables = new TreeMap(); + + /** + * Create a new, empty GATKReport. + */ + public GATKReport() { + } + + /** + * Create a new GATKReport with the contents of a GATKReport on disk. + * + * @param filename the path to the file to load + */ + public GATKReport(String filename) { + this(new File(filename)); + } + + /** + * Create a new GATKReport with the contents of a GATKReport on disk. + * + * @param file the file to load + */ + public GATKReport(File file) { + loadReport(file); + } + + /** + * Create a new GATK report from GATK report tables + * @param tables Any number of tables that you want to add to the report + */ + public GATKReport(GATKReportTable... tables) { + for( GATKReportTable table: tables) + addTable(table); + } + + /** + * Load a GATKReport file from disk + * + * @param file the file to load + */ + private void loadReport(File file) { + BufferedReader reader; + String reportHeader; + try { + reader = new BufferedReader(new FileReader(file)); + reportHeader = reader.readLine(); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotReadInputFile(file, "it does not exist"); + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + + + // Read the first line for the version and number of tables. 
+ version = GATKReportVersion.fromHeader(reportHeader); + if (version.equals(GATKReportVersion.V0_1) || + version.equals(GATKReportVersion.V0_2)) + throw new UserException("The GATK no longer supports reading legacy GATK Reports. Please use v1.0 or newer."); + + int nTables = Integer.parseInt(reportHeader.split(":")[2]); + + // Read each table according ot the number of tables + for (int i = 0; i < nTables; i++) { + addTable(new GATKReportTable(reader, version)); + } + } + + /** + * Add a new, empty table to the report + * + * @param tableName the name of the table + * @param tableDescription the description of the table + * @param numColumns the number of columns in this table + */ + public void addTable(final String tableName, final String tableDescription, final int numColumns) { + addTable(tableName, tableDescription, numColumns, GATKReportTable.TableSortingWay.DO_NOT_SORT); + } + + /** + * Add a new, empty table to the report + * + * @param tableName the name of the table + * @param tableDescription the description of the table + * @param numColumns the number of columns in this table + * @param sortingWay way to sort table + */ + public void addTable(final String tableName, final String tableDescription, final int numColumns, final GATKReportTable.TableSortingWay sortingWay) { + GATKReportTable table = new GATKReportTable(tableName, tableDescription, numColumns, sortingWay); + tables.put(tableName, table); + } + + /** + * Adds a table, empty or populated, to the report + * + * @param table the table to add + */ + public void addTable(GATKReportTable table) { + tables.put(table.getTableName(), table); + } + + public void addTables(List gatkReportTableV2s) { + for ( GATKReportTable table : gatkReportTableV2s ) + addTable(table); + } + + /** + * Return true if table with a given name exists + * + * @param tableName the name of the table + * @return true if the table exists, false otherwise + */ + public boolean hasTable(String tableName) { + return 
tables.containsKey(tableName); + } + + /** + * Return a table with a given name + * + * @param tableName the name of the table + * @return the table object + */ + public GATKReportTable getTable(String tableName) { + GATKReportTable table = tables.get(tableName); + if (table == null) + throw new ReviewedGATKException("Table is not in GATKReport: " + tableName); + return table; + } + + /** + * Print all tables contained within this container to a PrintStream + * + * @param out the PrintStream to which the tables should be written + */ + public void print(PrintStream out) { + out.println(GATKREPORT_HEADER_PREFIX + getVersion().toString() + SEPARATOR + getTables().size()); + for (GATKReportTable table : tables.values()) + table.write(out); + } + + public Collection getTables() { + return tables.values(); + } + + /** + * This is the main function is charge of gathering the reports. It checks that the reports are compatible and then + * calls the table gathering functions. + * + * @param input another GATKReport of the same format + */ + public void concat(GATKReport input) { + + if ( !isSameFormat(input) ) { + throw new ReviewedGATKException("Failed to combine GATKReport, format doesn't match!"); + } + + for ( Map.Entry table : tables.entrySet() ) { + table.getValue().concat(input.getTable(table.getKey())); + } + } + + public GATKReportVersion getVersion() { + return version; + } + + /** + * Returns whether or not the two reports have the same format, from columns, to tables, to reports, and everything + * in between. This does not check if the data inside is the same. This is the check to see if the two reports are + * gatherable or reduceable. 
+ * + * @param report another GATK report + * @return true if the the reports are gatherable + */ + public boolean isSameFormat(GATKReport report) { + if (!version.equals(report.version)) { + return false; + } + if (!tables.keySet().equals(report.tables.keySet())) { + return false; + } + for (String tableName : tables.keySet()) { + if (!getTable(tableName).isSameFormat(report.getTable(tableName))) + return false; + } + return true; + } + + /** + * Checks that the reports are exactly the same. + * + * @param report another GATK report + * @return true if all field in the reports, tables, and columns are equal. + */ + public boolean equals(GATKReport report) { + if (!version.equals(report.version)) { + return false; + } + if (!tables.keySet().equals(report.tables.keySet())) { + return false; + } + for (String tableName : tables.keySet()) { + if (!getTable(tableName).equals(report.getTable(tableName))) + return false; + } + return true; + } + + /** + * The constructor for a simplified GATK Report. Simplified GATK report are designed for reports that do not need + * the advanced functionality of a full GATK Report. + *

+ * A simple GATK Report consists of: + *

+ * - A single table + * - No primary key ( it is hidden ) + *

+ * Optional: + * - Only untyped columns. As long as the data is an Object, it will be accepted. + * - Default column values being empty strings. + *

+ * Limitations: + *

+ * - A simple GATK report cannot contain multiple tables. + * - It cannot contain typed columns, which prevents arithmetic gathering. + * + * @param tableName The name of your simple GATK report table + * @param columns The names of the columns in your table + * @return a simplified GATK report + */ + public static GATKReport newSimpleReport(final String tableName, final String... columns) { + return newSimpleReportWithDescription(tableName, "A simplified GATK table report", columns); + } + + /** + * @see #newSimpleReport(String, String...) but with a customized description + * @param tableName + * @param desc + * @param columns + * @return + */ + public static GATKReport newSimpleReportWithDescription(final String tableName, final String desc, final String... columns) { + GATKReportTable table = new GATKReportTable(tableName, desc, columns.length); + + for (String column : columns) { + table.addColumn(column, ""); + } + + GATKReport output = new GATKReport(); + output.addTable(table); + + return output; + } + + /** + * The constructor for a simplified GATK Report. Simplified GATK report are designed for reports that do not need + * the advanced functionality of a full GATK Report. + *

+ * A simple GATK Report consists of: + *

+ * - A single table + * - No primary key ( it is hidden ) + *

+ * Optional: + * - Only untyped columns. As long as the data is an Object, it will be accepted. + * - Default column values being empty strings. + *

+ * Limitations: + *

+ * - A simple GATK report cannot contain multiple tables. + * - It cannot contain typed columns, which prevents arithmetic gathering. + * + * @param tableName The name of your simple GATK report table + * @param columns The names of the columns in your table + * @return a simplified GATK report + */ + public static GATKReport newSimpleReport(final String tableName, final List columns) { + GATKReportTable table = new GATKReportTable(tableName, "A simplified GATK table report", columns.size()); + + for (String column : columns) { + table.addColumn(column, ""); + } + + GATKReport output = new GATKReport(); + output.addTable(table); + + return output; + } + + /** + * This method provides an efficient way to populate a simplified GATK report. This method will only work on reports + * that qualify as simplified GATK reports. See the newSimpleReport() constructor for more information. + * + * @param values the row of data to be added to the table. + * Note: the number of arguments must match the columns in the table. + */ + public void addRow(final Object... values) { + // Must be a simple report + if ( tables.size() != 1 ) + throw new ReviewedGATKException("Cannot write a row to a complex GATK Report"); + + GATKReportTable table = tables.firstEntry().getValue(); + if ( table.getNumColumns() != values.length ) + throw new ReviewedGATKException("The number of arguments in writeRow (" + values.length + ") must match the number of columns in the table (" + table.getNumColumns() + ")" ); + + final int rowIndex = table.getNumRows(); + for ( int i = 0; i < values.length; i++ ) + table.set(rowIndex, i, values[i]); + } + + /** + * This method provides an efficient way to populate a simplified GATK report. This method will only work on reports + * that qualify as simplified GATK reports. See the newSimpleReport() constructor for more information. + * + * @param values the row of data to be added to the table. + * Note: the number of arguments must match the columns in the table. 
+ */ + public void addRowList(final List values) { + if ( tables.size() != 1 ) + throw new ReviewedGATKException("Cannot write a row to a complex GATK Report"); + + GATKReportTable table = tables.firstEntry().getValue(); + if ( table.getNumColumns() != values.size() ) + throw new ReviewedGATKException("The number of arguments in writeRow() must match the number of columns in the table"); + + final int rowIndex = table.getNumRows(); + int idx = 0; + for ( Object value : values ) { + table.set(rowIndex,idx,value); + idx++; + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportColumn.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportColumn.java new file mode 100644 index 000000000..d672c1ba8 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportColumn.java @@ -0,0 +1,147 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.report; + +import org.apache.commons.lang.math.NumberUtils; + +import java.util.Arrays; +import java.util.Collection; + +/** + * column information within a GATK report table + */ +public class GATKReportColumn { + final private String columnName; + final private String format; + final private GATKReportDataType dataType; + + private GATKReportColumnFormat columnFormat; + private GATKReportColumnFormat.Alignment alignment = GATKReportColumnFormat.Alignment.RIGHT; // default alignment is to the right unless values added ask for a left alignment + private int maxWidth = 0; + + /** + * Construct the column object, specifying the column name, default value, whether or not the column should be + * displayed, and the format string. This cannot be null. + * + * @param columnName the name of the column + * @param format format string + */ + public GATKReportColumn(final String columnName, final String format) { + this.columnName = columnName; + this.maxWidth = columnName.length(); + if ( format.equals("") ) { + this.format = "%s"; + this.dataType = GATKReportDataType.Unknown; + } + else { + this.format = format; + this.dataType = GATKReportDataType.fromFormatString(format); + } + } + + /** + * Get the display width for this column. This allows the entire column to be displayed with the appropriate, fixed + * width. 
+ * + * @return the format string for this column + */ + public GATKReportColumnFormat getColumnFormat() { + if (columnFormat != null) + return columnFormat; + + columnFormat = new GATKReportColumnFormat(maxWidth, alignment); + return columnFormat; + } + + private static final Collection RIGHT_ALIGN_STRINGS = Arrays.asList( + "null", + "NA", + String.valueOf(Double.POSITIVE_INFINITY), + String.valueOf(Double.NEGATIVE_INFINITY), + String.valueOf(Double.NaN)); + + /** + * Check if the value can be right aligned. Does not trim the values before checking if numeric since it assumes + * the spaces mean that the value is already padded. + * + * @param value to check + * @return true if the value is a right alignable + */ + protected static boolean isRightAlign(final String value) { + return value == null || RIGHT_ALIGN_STRINGS.contains(value) || NumberUtils.isNumber(value.trim()); + } + + /** + * Returns a string version of the values. + * + * @param obj The object to convert to a string + * @return The string representation of the column + */ + private String formatValue(final Object obj) { + String value; + if (obj == null) { + value = "null"; + } + else if ( dataType.equals(GATKReportDataType.Unknown) && (obj instanceof Double || obj instanceof Float) ) { + value = String.format("%.8f", obj); + } + else + value = String.format(format, obj); + + return value; + } + + public GATKReportDataType getDataType() { + return dataType; + } + + public String getColumnName() { + return columnName; + } + + public String getFormat() { + return dataType.equals(GATKReportDataType.Unknown) ? 
"%s" : format; + } + + public void updateFormatting(final Object value) { + if (value != null) { + final String formatted = formatValue(value); + if ( formatted.length() > 0 ) { + updateMaxWidth(formatted); + updateFormat(formatted); + } + } + } + + private void updateMaxWidth(final String formatted) { + maxWidth = Math.max(formatted.length(), maxWidth); + } + + private void updateFormat(final String formatted) { + if (alignment == GATKReportColumnFormat.Alignment.RIGHT) + alignment = isRightAlign(formatted) ? GATKReportColumnFormat.Alignment.RIGHT : GATKReportColumnFormat.Alignment.LEFT; + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportColumnFormat.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportColumnFormat.java new file mode 100644 index 000000000..97c012a5a --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportColumnFormat.java @@ -0,0 +1,63 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.report; + +/** + * Column width and left/right alignment. + */ +public class GATKReportColumnFormat { + public static enum Alignment { LEFT, RIGHT } + private final int width; + private final Alignment alignment; + + public GATKReportColumnFormat(int width, Alignment alignment) { + this.width = width; + this.alignment = alignment; + } + + public int getWidth() { + return width; + } + + public Alignment getAlignment() { + return alignment; + } + + public String getNameFormat() { + return "%-" + width + "s"; + } + + public String getValueFormat() { + switch (alignment) { + case LEFT: + return "%-" + width + "s"; + case RIGHT: + return "%" + width + "s"; + default: + throw new UnsupportedOperationException("Unknown alignment: " + alignment); + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportDataType.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportDataType.java new file mode 100644 index 000000000..d522dff35 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportDataType.java @@ -0,0 +1,236 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and 
this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.report; + +import java.util.EnumSet; +import java.util.HashMap; +import java.util.Map; + +/** + * The gatherable data types acceptable in a GATK report column. + */ +public enum GATKReportDataType { + /** + * The null type should not be used. + */ + Null("Null"), + + /** + * The default value when a format string is not present + */ + Unknown("Unknown"), + + /** + * Used for boolean values. Will display as true or false in the table. + */ + Boolean("%[Bb]"), + + /** + * Used for char values. Will display as a char so use printable values! + */ + Character("%[Cc]"), + + /** + * Used for float and double values. Will output a decimal with format %.8f unless otherwise specified. + */ + Decimal("%.*[EeFf]"), + + /** + * Used for int, byte, short, and long values. Will display the full number by default. + */ + Integer("%[Dd]"), + + /** + * Used for string values. Displays the string itself. 
+ */ + String("%[Ss]"); + + private final String dataTypeString; + + private GATKReportDataType(String dataTypeString) { + this.dataTypeString = dataTypeString; + } + + private static final Map lookup = new HashMap(); + + static { + for (GATKReportDataType s : EnumSet.allOf(GATKReportDataType.class)) + lookup.put(s.dataTypeString, s); + } + + + @Override + public String toString() { + return this.dataTypeString; + } + + /** + * Returns a GATK report data type from the Object specified. It looks through the list of acceptable classes and + * returns the appropriate data type. + * + * @param object the object ot derive the data type from + * @return the appropriate data type + */ + public static GATKReportDataType fromObject(Object object) { + GATKReportDataType value; + if (object instanceof Boolean) { + value = GATKReportDataType.Boolean; + + } else if (object instanceof Character) { + value = GATKReportDataType.Character; + + } else if (object instanceof Float || + object instanceof Double) { + value = GATKReportDataType.Decimal; + + } else if (object instanceof Integer || + object instanceof Long || + object instanceof Short || + object instanceof Byte ) { + value = GATKReportDataType.Integer; + + } else if (object instanceof String) { + value = GATKReportDataType.String; + + } else { + value = GATKReportDataType.Unknown; + //throw new UserException("GATKReport could not convert the data object into a GATKReportDataType. Acceptable data objects are found in the documentation."); + } + return value; + } + + /** + * Returns a GATK report data type from the format string specified. It uses regex matching from the enumerated + * Strings. 
+ * + * @param format the format string to derive the data type from + * @return the appropriate data type + */ + public static GATKReportDataType fromFormatString(String format) { + if (format.equals("")) + return Unknown; + for (GATKReportDataType type : lookup.values()) { + if (format.matches(type.toString()) ) + return type; + } + return Unknown; + } + + /** + * Returns the default value of the data type. It returns an object that matches the class of the data type. + * + * @return an object that matches the data type + */ + public Object getDefaultValue() { + switch (this) { + case Decimal: + return 0.0D; + case Boolean: + return false; + case Character: + return '0'; + case Integer: + return 0L; + case String: + return ""; + default: + return null; + } + } + + /** + * Checks if the two objects are equal using the appropriate test form the data types. + * + * @param a an object + * @param b another object to check if equal + * @return true - the objects are equal, false - the objects are nto equal + */ + public boolean isEqual(Object a, Object b) { + switch (this) { + case Null: + return true; + case Decimal: + case Boolean: + case Integer: + return a.toString().equals(b.toString()); + case Character: + case String: + default: + return a.equals(b); + } + } + + /** + * Converts an input String to the appropriate type using the data type. Used for parsing loading a GATK report from + * file. + * + * @param obj The input string + * @return an object that matches the data type. + */ + Object Parse(Object obj) { + if (obj instanceof String) { + String str = obj.toString(); + switch (this) { + case Decimal: + return Double.parseDouble(str); + case Boolean: + return java.lang.Boolean.parseBoolean(str); + case Integer: + return Long.parseLong(str); + case String: + return str; + case Character: + return str.toCharArray()[0]; + default: + return str; + } + } else + return null; + } + + /** + * Returns a format string version of the value according to the data type. 
+ * + * @return The printf string representation of the object according to data type. + */ + public String getDefaultFormatString() { + switch (this) { + case Decimal: + return "%.8f"; + case Boolean: + return "%b"; + case Integer: + return "%d"; + case String: + return "%s"; + case Character: + return "%c"; + case Null: + default: + return "%s"; + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportGatherer.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportGatherer.java new file mode 100644 index 000000000..359460bd0 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportGatherer.java @@ -0,0 +1,62 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.report; + +import org.broadinstitute.gatk.utils.commandline.Gatherer; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.PrintStream; +import java.util.List; + +public class GATKReportGatherer extends Gatherer { + @Override + public void gather(List inputs, File output) { + //Combines inputs GATKReport to one output + + PrintStream o; + try { + o = new PrintStream(output); + } catch (FileNotFoundException e) { + throw new UserException(String.format("File %s to be output by GATKReportGatherer function was not found", output)); + } + + GATKReport current = new GATKReport(); + boolean isFirst = true; + for (File input : inputs) { + if (isFirst) { + current = new GATKReport(input); + isFirst = false; + } else { + current.concat(new GATKReport(input)); + } + } + + current.print(o); + o.close(); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportTable.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportTable.java new file mode 100644 index 000000000..018d05500 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportTable.java @@ -0,0 +1,779 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.report; + +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.text.TextFormattingUtils; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.PrintStream; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class GATKReportTable { + /** + * REGEX that matches any table with an invalid name + */ + public static final String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]"; + private static final String GATKTABLE_HEADER_PREFIX = "#:GATKTable"; + private static final String SEPARATOR = ":"; + private static final String ENDLINE = ":;"; + + private final String tableName; + private final String tableDescription; + + private final TableSortingWay sortingWay; + + private List underlyingData; + private final List columnInfo; + private final Map columnNameToIndex; + private final HashMap rowIdToIndex; + + private static final String COULD_NOT_READ_HEADER = "Could not read the header of this file -- "; + private static final String COULD_NOT_READ_COLUMN_NAMES = "Could not read the column names of this file -- "; + private static final String COULD_NOT_READ_DATA_LINE = "Could not read a data line of this table -- "; + private static final String COULD_NOT_READ_EMPTY_LINE = "Could not read the last empty line of this table -- "; + private static final String OLD_GATK_TABLE_VERSION = "We no longer support older versions of the 
GATK Tables"; + + private static final int INITITAL_ARRAY_SIZE = 10000; + private static final String NUMBER_CONVERSION_EXCEPTION = "String is a number but is not a long or a double: "; + + protected enum TableDataHeaderFields { + COLS(2), + ROWS(3), + FORMAT_START(4); + + private final int index; + TableDataHeaderFields(int index) { this.index = index; } + public int index() { return index; } + } + + public enum TableSortingWay { + SORT_BY_ROW, + SORT_BY_COLUMN, + DO_NOT_SORT + } + + protected enum TableNameHeaderFields { + NAME(2), + DESCRIPTION(3); + + private final int index; + TableNameHeaderFields(int index) { this.index = index; } + public int index() { return index; } + } + + /** + * Construct a new GATK report table from the reader + * Note that the row ID mappings are just the index -> index + * + * @param reader the reader + * @param version the GATK report version + */ + public GATKReportTable(BufferedReader reader, GATKReportVersion version) { + + switch ( version ) { + case V1_1: + // read in the header lines + final String[] tableData, tableNameData; + try { + tableData = reader.readLine().split(SEPARATOR); + tableNameData = reader.readLine().split(SEPARATOR); + } catch (IOException e) { + throw new ReviewedGATKException(COULD_NOT_READ_HEADER + e.getMessage()); + } + + // parse the header fields + tableName = tableNameData[TableNameHeaderFields.NAME.index()]; + tableDescription = (tableNameData.length <= TableNameHeaderFields.DESCRIPTION.index()) ? "" : tableNameData[TableNameHeaderFields.DESCRIPTION.index()]; // table may have no description! 
(and that's okay) + + // when reading from a file, we do not re-sort the rows + sortingWay = TableSortingWay.DO_NOT_SORT; + + // initialize the data + final int nColumns = Integer.parseInt(tableData[TableDataHeaderFields.COLS.index()]); + final int nRows = Integer.parseInt(tableData[TableDataHeaderFields.ROWS.index()]); + underlyingData = new ArrayList(nRows); + columnInfo = new ArrayList(nColumns); + columnNameToIndex = new HashMap(nColumns); + + // when reading from a file, the row ID mapping is just the index + rowIdToIndex = new HashMap(); + for ( int i = 0; i < nRows; i++ ) + rowIdToIndex.put(i, i); + + // read the column names + final String columnLine; + try { + columnLine = reader.readLine(); + } catch (IOException e) { + throw new ReviewedGATKException(COULD_NOT_READ_COLUMN_NAMES); + } + + final List columnStarts = TextFormattingUtils.getWordStarts(columnLine); + final String[] columnNames = TextFormattingUtils.splitFixedWidth(columnLine, columnStarts); + + // Put in columns using the format string from the header + for ( int i = 0; i < nColumns; i++ ) { + final String format = tableData[TableDataHeaderFields.FORMAT_START.index() + i]; + addColumn(columnNames[i], format); + } + + // fill in the table + try { + for ( int i = 0; i < nRows; i++ ) { + // read a data line + final String dataLine = reader.readLine(); + final List lineSplits = Arrays.asList(TextFormattingUtils.splitFixedWidth(dataLine, columnStarts)); + + underlyingData.add(new Object[nColumns]); + for ( int columnIndex = 0; columnIndex < nColumns; columnIndex++ ) { + + final GATKReportDataType type = columnInfo.get(columnIndex).getDataType(); + final String columnName = columnNames[columnIndex]; + set(i, columnName, type.Parse(lineSplits.get(columnIndex))); + + } + } + } catch (IOException e) { + throw new ReviewedGATKException(COULD_NOT_READ_DATA_LINE + e.getMessage()); + } + + try { + reader.readLine(); + } catch (IOException e) { + throw new ReviewedGATKException(COULD_NOT_READ_EMPTY_LINE + 
e.getMessage()); + } + break; + + default: + throw new ReviewedGATKException(OLD_GATK_TABLE_VERSION); + } + } + + /** + * Construct a new GATK report table with the specified name and description + * + * @param tableName the name of the table + * @param tableDescription the description of the table + * @param numColumns the number of columns in this table + */ + public GATKReportTable(final String tableName, final String tableDescription, final int numColumns) { + this(tableName, tableDescription, numColumns, TableSortingWay.SORT_BY_ROW); + } + + /** + * Construct a new GATK report table with the specified name and description and whether to sort rows by the row ID. + * + * @param tableName the name of the table + * @param tableDescription the description of the table + * @param numColumns the number of columns in this table + * @param sortingWay in what way to sort rows (instead of the order in which they were added) + */ + public GATKReportTable(final String tableName, final String tableDescription, final int numColumns, final TableSortingWay sortingWay) { + if ( !isValidName(tableName) ) { + throw new ReviewedGATKException("Attempted to set a GATKReportTable name of '" + tableName + "'. GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed."); + } + + if ( !isValidDescription(tableDescription) ) { + throw new ReviewedGATKException("Attempted to set a GATKReportTable description of '" + tableDescription + "'. 
GATKReportTable descriptions must not contain newlines."); + } + + this.tableName = tableName; + this.tableDescription = tableDescription; + this.sortingWay = sortingWay; + + underlyingData = new ArrayList(INITITAL_ARRAY_SIZE); + columnInfo = new ArrayList(numColumns); + columnNameToIndex = new HashMap(numColumns); + rowIdToIndex = new HashMap(); + } + + /** + * Create a new GATKReportTable with the same structure + * @param tableToCopy + */ + public GATKReportTable(final GATKReportTable tableToCopy, final boolean copyData) { + this(tableToCopy.getTableName(), tableToCopy.getTableDescription(), tableToCopy.getNumColumns(), tableToCopy.sortingWay); + for ( final GATKReportColumn column : tableToCopy.getColumnInfo() ) + addColumn(column.getColumnName(), column.getFormat()); + if ( copyData ) + throw new IllegalArgumentException("sorry, copying data in GATKReportTable isn't supported"); + } + + /** + * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed + * + * @param name the name of the table or column + * @return true if the name is valid, false if otherwise + */ + private boolean isValidName(String name) { + Pattern p = Pattern.compile(INVALID_TABLE_NAME_REGEX); + Matcher m = p.matcher(name); + + return !m.find(); + } + + /** + * Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed + * + * @param description the name of the table or column + * @return true if the name is valid, false if otherwise + */ + private boolean isValidDescription(String description) { + Pattern p = Pattern.compile("\\r|\\n"); + Matcher m = p.matcher(description); + + return !m.find(); + } + + /** + * Add a mapping from ID to the index of a new row added to the table. + * + * @param ID the unique ID + */ + public void addRowID(final String ID) { + addRowID(ID, false); + } + + /** + * Add a mapping from ID to the index of a new row added to the table. 
+ * + * @param ID the unique ID + * @param populateFirstColumn should we automatically populate the first column with the row's ID? + */ + public void addRowID(final String ID, final boolean populateFirstColumn) { + addRowIDMapping(ID, underlyingData.size(), populateFirstColumn); + } + + /** + * Add a mapping from ID to row index. + * + * @param ID the unique ID + * @param index the index associated with the ID + */ + public void addRowIDMapping(final String ID, final int index) { + addRowIDMapping(ID, index, false); + } + + /** + * Add a mapping from ID to row index. + * + * @param ID the unique ID + * @param index the index associated with the ID + * @param populateFirstColumn should we automatically populate the first column with the row's ID? + */ + public void addRowIDMapping(final Object ID, final int index, final boolean populateFirstColumn) { + expandTo(index, false); + rowIdToIndex.put(ID, index); + + if ( populateFirstColumn ) + set(index, 0, ID); + } + + /** + * Remove a mapping from ID to row index. + * + * @param ID the row ID + */ + public void removeRowIDMapping(final Object ID) { + rowIdToIndex.remove(ID); + } + + /** + * Add a column to the report + * + * @param columnName the name of the column + */ + public void addColumn(String columnName) { + addColumn(columnName, ""); + } + + /** + * Add a column to the report and the format string used to display the data. 
+ * + * @param columnName the name of the column + * @param format the format string used to display data + */ + public void addColumn(String columnName, String format) { + columnNameToIndex.put(columnName, columnInfo.size()); + columnInfo.add(new GATKReportColumn(columnName, format)); + } + + /** + * Check if the requested cell is valid and expand the table if necessary + * + * @param rowIndex the row index + * @param colIndex the column index + */ + private void verifyEntry(final int rowIndex, final int colIndex) { + if ( rowIndex < 0 || colIndex < 0 || colIndex >= getNumColumns() ) + throw new ReviewedGATKException("attempted to access a cell that does not exist in table '" + tableName + "'"); + } + + /** + * expand the underlying table if needed to include the given row index + * + * @param rowIndex the row index + * @param updateRowIdMap should we update the row ID map? + */ + private void expandTo(final int rowIndex, final boolean updateRowIdMap) { + int currentSize = underlyingData.size(); + if ( rowIndex >= currentSize ) { + final int numNewRows = rowIndex - currentSize + 1; + for ( int i = 0; i < numNewRows; i++ ) { + if ( updateRowIdMap ) + rowIdToIndex.put(currentSize, currentSize); + underlyingData.add(new Object[getNumColumns()]); + currentSize++; + } + } + } + + /** + * Set the value for a given position in the table. + * If the row ID doesn't exist, it will create a new row in the table with the given ID. + * + * @param rowID the row ID + * @param columnName the name of the column + * @param value the value to set + */ + public void set(final Object rowID, final String columnName, final Object value) { + if ( !rowIdToIndex.containsKey(rowID) ) { + rowIdToIndex.put(rowID, underlyingData.size()); + expandTo(underlyingData.size(), false); + } + set(rowIdToIndex.get(rowID), columnNameToIndex.get(columnName), value); + } + + /** + * Set the value for a given position in the table. 
+ * If the row index doesn't exist, it will create new rows in the table accordingly. + * + * @param rowIndex the row index + * @param colIndex the column index + * @param value the value to set + */ + public void set(final int rowIndex, final int colIndex, Object value) { + expandTo(rowIndex, true); + verifyEntry(rowIndex, colIndex); + GATKReportColumn column = columnInfo.get(colIndex); + + // We do not accept internal null values + if (value == null) + value = "null"; + else + value = fixType(value, column); + + if ( column.getDataType().equals(GATKReportDataType.fromObject(value)) || column.getDataType().equals(GATKReportDataType.Unknown) ) { + underlyingData.get(rowIndex)[colIndex] = value; + column.updateFormatting(value); + } else { + throw new ReviewedGATKException(String.format("Tried to add an object of type: %s to a column of type: %s", GATKReportDataType.fromObject(value).name(), column.getDataType().name())); + } + } + + /** + * Returns true if the table contains a row mapping with the given ID + * + * @param rowID the row ID + */ + public boolean containsRowID(final Object rowID) { + return rowIdToIndex.containsKey(rowID); + } + + /** + * Returns the row mapping IDs + * + */ + public Collection getRowIDs() { + return rowIdToIndex.keySet(); + } + + /** + * Increment the value for a given position in the table. + * Throws an exception if the value in the cell is not an integer. 
+ * + * @param rowID the row ID + * @param columnName the name of the column + */ + public void increment(final Object rowID, final String columnName) { + int prevValue; + if ( !rowIdToIndex.containsKey(rowID) ) { + rowIdToIndex.put(rowID, underlyingData.size()); + underlyingData.add(new Object[getNumColumns()]); + prevValue = 0; + } else { + Object obj = get(rowID, columnName); + if ( !(obj instanceof Integer) ) + throw new ReviewedGATKException("Attempting to increment a value in a cell that is not an integer"); + prevValue = (Integer)obj; + } + + set(rowIdToIndex.get(rowID), columnNameToIndex.get(columnName), prevValue + 1); + } + + /** + * Returns the index of the first row matching the column values. + * Ex: "CountVariants", "dbsnp", "eval", "called", "all", "novel", "all" + * + * @param columnValues column values. + * @return The index of the first row matching the column values or -1 if no such row exists. + */ + public int findRowByData(final Object... columnValues) { + if ( columnValues == null || columnValues.length == 0 || columnValues.length > getNumColumns() ) + return -1; + + for ( int rowIndex = 0; rowIndex < underlyingData.size(); rowIndex++ ) { + + final Object[] row = underlyingData.get(rowIndex); + + boolean matches = true; + for ( int colIndex = 0; colIndex < columnValues.length; colIndex++ ) { + if ( !columnValues[colIndex].equals(row[colIndex]) ) { + matches = false; + break; + } + } + + if ( matches ) + return rowIndex; + } + + return -1; + } + + private Object fixType(final Object value, final GATKReportColumn column) { + // Below is some code to convert a string into its appropriate type. + + // todo -- Types have to be more flexible. For example, %d should accept Integers, Shorts and Bytes. 
+ + Object newValue = null; + if ( value instanceof String && !column.getDataType().equals(GATKReportDataType.String) ) { + // Integer case + if ( column.getDataType().equals(GATKReportDataType.Integer) ) { + try { + newValue = Long.parseLong((String) value); + } catch (Exception e) { + /** do nothing */ + } + } + if ( column.getDataType().equals(GATKReportDataType.Decimal) ) { + try { + newValue = Double.parseDouble((String) value); + } catch (Exception e) { + /** do nothing */ + } + } + if ( column.getDataType().equals(GATKReportDataType.Character) && ((String) value).length() == 1 ) { + newValue = ((String) value).charAt(0); + } + } + + return (newValue != null) ? newValue : value; + } + + /** + * Get a value from the given position in the table + * + * @param rowID the row ID + * @param columnName the name of the column + * @return the value stored at the specified position in the table + */ + public Object get(final Object rowID, final String columnName) { + return get(rowIdToIndex.get(rowID), columnNameToIndex.get(columnName)); + } + + /** + * Get a value from the given position in the table + * + * @param rowIndex the row ID + * @param columnName the name of the column + * @return the value stored at the specified position in the table + */ + public Object get(final int rowIndex, final String columnName) { + return get(rowIndex, columnNameToIndex.get(columnName)); + } + + /** + * Get a value from the given position in the table + * + * @param rowIndex the index of the row + * @param columnIndex the index of the column + * @return the value stored at the specified position in the table + */ + public Object get(int rowIndex, int columnIndex) { + verifyEntry(rowIndex, columnIndex); + return underlyingData.get(rowIndex)[columnIndex]; + } + + /** + * Write the table to the PrintStream, formatted nicely to be human-readable, AWK-able, and R-friendly. 
+ * + * @param out the PrintStream to which the table should be written + */ + void write(final PrintStream out) { + + /* + * Table header: + * #:GATKTable:nColumns:nRows:(DataType for each column):; + * #:GATKTable:TableName:Description :; + * key colA colB + * row1 xxxx xxxxx + */ + + // write the table definition + out.printf(GATKTABLE_HEADER_PREFIX + ":%d:%d", getNumColumns(), getNumRows()); + + // write the formats for all the columns + for ( final GATKReportColumn column : columnInfo ) + out.print(SEPARATOR + column.getFormat()); + out.println(ENDLINE); + + // write the table name & description + out.printf(GATKTABLE_HEADER_PREFIX + ":%s:%s\n", tableName, tableDescription); + + // write the column names + boolean needsPadding = false; + for ( final GATKReportColumn column : columnInfo ) { + if ( needsPadding ) + out.printf(" "); + needsPadding = true; + + out.printf(column.getColumnFormat().getNameFormat(), column.getColumnName()); + } + out.println(); + + // write the table body + switch (sortingWay) { + case SORT_BY_COLUMN: + Collections.sort(underlyingData, new Comparator() { + //INVARIANT the two arrays are of the same length and corresponding elements are of the same type + @Override + public int compare(Object[] objectArr1, Object[] objectArr2) { + final int EQUAL = 0; + + int result = EQUAL; + + int l = objectArr1.length; + for (int x = 0; x < l; x++) { + if (objectArr1[x] instanceof Integer) { + result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); + } else if (objectArr1[x] instanceof Double) { + result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); + } else { // default uses String comparison + result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); + } + if( result != EQUAL) { + return result; + } + } + return result; + } + }); + for ( final Object[] row : underlyingData ) + writeRow(out, row); + break; + case SORT_BY_ROW: + // make sure that there are exactly the correct number of ID mappings + if ( 
rowIdToIndex.size() != underlyingData.size() ) + throw new ReviewedGATKException("There isn't a 1-to-1 mapping from row ID to index; this can happen when rows are not created consistently"); + + final TreeMap sortedMap; + try { + sortedMap = new TreeMap(rowIdToIndex); + } catch (ClassCastException e) { + throw new ReviewedGATKException("Unable to sort the rows based on the row IDs because the ID Objects are of different types"); + } + for ( final Map.Entry rowKey : sortedMap.entrySet() ) + writeRow(out, underlyingData.get(rowKey.getValue())); + break; + case DO_NOT_SORT: + for ( final Object[] row : underlyingData ) + writeRow(out, row); + } + out.println(); + } + + private void writeRow(final PrintStream out, final Object[] row) { + boolean needsPadding = false; + for ( int i = 0; i < row.length; i++ ) { + if ( needsPadding ) + out.printf(" "); + needsPadding = true; + + final Object obj = row[i]; + final String value; + + final GATKReportColumn info = columnInfo.get(i); + + if ( obj == null ) + value = "null"; + else if ( info.getDataType().equals(GATKReportDataType.Unknown) && (obj instanceof Double || obj instanceof Float) ) + value = String.format("%.8f", obj); + else + value = String.format(info.getFormat(), obj); + + out.printf(info.getColumnFormat().getValueFormat(), value); + } + + out.println(); + } + + public int getNumRows() { + return underlyingData.size(); + } + + public int getNumColumns() { + return columnInfo.size(); + } + + public List getColumnInfo() { + return columnInfo; + } + + public String getTableName() { + return tableName; + } + + public String getTableDescription() { + return tableDescription; + } + + /** + * Concatenates the rows from the table to this one + * + * @param table another GATK table + */ + public void concat(final GATKReportTable table) { + if ( !isSameFormat(table) ) + throw new ReviewedGATKException("Error trying to concatenate tables with different formats"); + + // add the data + 
underlyingData.addAll(table.underlyingData); + + // update the row index map + final int currentNumRows = getNumRows(); + for ( Map.Entry entry : table.rowIdToIndex.entrySet() ) + rowIdToIndex.put(entry.getKey(), entry.getValue() + currentNumRows); + } + + /** + * Returns whether or not the two tables have the same format including columns and everything in between. This does + * not check if the data inside is the same. This is the check to see if the two tables are gatherable or + * reduceable + * + * @param table another GATK table + * @return true if the the tables are gatherable + */ + public boolean isSameFormat(final GATKReportTable table) { + if ( !tableName.equals(table.tableName) || + !tableDescription.equals(table.tableDescription) || + columnInfo.size() != table.columnInfo.size() ) + return false; + + for ( int i = 0; i < columnInfo.size(); i++ ) { + if ( !columnInfo.get(i).getFormat().equals(table.columnInfo.get(i).getFormat()) || + !columnInfo.get(i).getColumnName().equals(table.columnInfo.get(i).getColumnName()) ) + return false; + } + + return true; + } + + /** + * Checks that the tables are exactly the same. + * + * @param table another GATK report + * @return true if all field in the reports, tables, and columns are equal. + */ + public boolean equals(final GATKReportTable table) { + if ( !isSameFormat(table) || + underlyingData.size() != table.underlyingData.size() ) + return false; + + final List myOrderedRows = getOrderedRows(); + final List otherOrderedRows = table.getOrderedRows(); + + for ( int i = 0; i < underlyingData.size(); i++ ) { + final Object[] myData = myOrderedRows.get(i); + final Object[] otherData = otherOrderedRows.get(i); + for ( int j = 0; j < myData.length; j++ ) { + if ( !myData[j].toString().equals(otherData[j].toString()) ) // need to deal with different typing (e.g. Long vs. 
Integer) + return false; + } + } + + return true; + } + + private List getOrderedRows() { + + switch (sortingWay) { + case SORT_BY_COLUMN: + Collections.sort(underlyingData, new Comparator() { + //INVARIANT the two arrays are of the same length and corresponding elements are of the same type + @Override + public int compare(Object[] objectArr1, Object[] objectArr2) { + final int EQUAL = 0; + int result = EQUAL; + int l = objectArr1.length; + for (int x = 0; x < l; x++) { + if (objectArr1[x] instanceof Integer) { + result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); + } else if (objectArr1[x] instanceof Double) { + result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); + } else { // default uses String comparison + result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); + } + if( result != EQUAL) { + return result; + } + } + return result; + } + }); + return underlyingData; + case SORT_BY_ROW: + final TreeMap sortedMap; + try { + sortedMap = new TreeMap(rowIdToIndex); + } catch (ClassCastException e) { + return underlyingData; + } + + final List orderedData = new ArrayList(underlyingData.size()); + for ( final int rowKey : sortedMap.values() ) + orderedData.add(underlyingData.get(rowKey)); + + return orderedData; + default: + return underlyingData; + } + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportVersion.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportVersion.java new file mode 100644 index 000000000..e87e107c0 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/report/GATKReportVersion.java @@ -0,0 +1,100 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to 
use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.report; + +import org.broadinstitute.gatk.utils.exceptions.UserException; + +public enum GATKReportVersion { + /** + * Differences between other versions: + * - Does not allow spaces in cells. + * - Mostly fixed width but has a bug where the string width of floating point + * values was not measured correctly leading to columns that aren't aligned + */ + V0_1("v0.1"), + + /** + * Differences between other versions: + * - Spaces allowed in cells, for example in sample names with spaces in them ex: "C507/FG-CR 6". 
+ * - Fixed width fixed for floating point values + */ + V0_2("v0.2"), + + /* + * Differences between v0.x + * - Added table and report headers + * - Headers changed format, include the number of tables, rows, and metadata for gathering + * - IS GATHERABLE + */ + V1_0("v1.0"), + + /* + * Differences between v1.0 + * - column numbers in header reflect the actual count of columns + * - primary keys are never displayed + */ + V1_1("v1.1"); + + private final String versionString; + + private GATKReportVersion(String versionString) { + this.versionString = versionString; + } + + @Override + public String toString() { + return versionString; + } + + public boolean equals(GATKReportVersion that) { + return (versionString.equals(that.versionString)); + } + + /** + * Returns the GATK Report Version from the file header. + * + * @param header Header from the file starting with ##:GATKReport.v[version] + * @return The version as an enum. + */ + public static GATKReportVersion fromHeader(String header) { + if ( header == null ) + throw new UserException.BadInput("The GATK report has no version specified in the header"); + + if (header.startsWith("##:GATKReport.v0.1 ")) + return GATKReportVersion.V0_1; + + if (header.startsWith("##:GATKReport.v0.2 ")) + return GATKReportVersion.V0_2; + + if (header.startsWith("#:GATKReport.v1.0")) + return GATKReportVersion.V1_0; + + if (header.startsWith("#:GATKReport.v1.1")) + return GATKReportVersion.V1_1; + + throw new UserException.BadInput("The GATK report has an unknown/unsupported version in the header: " + header); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/CapturedStreamOutput.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/CapturedStreamOutput.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/CapturedStreamOutput.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/CapturedStreamOutput.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/InputStreamSettings.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/InputStreamSettings.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/InputStreamSettings.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/InputStreamSettings.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/OutputStreamSettings.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/OutputStreamSettings.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/OutputStreamSettings.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/OutputStreamSettings.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessController.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessController.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessController.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessController.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessOutput.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessOutput.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessOutput.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessOutput.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessSettings.java 
b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessSettings.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessSettings.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/ProcessSettings.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/RuntimeUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/RuntimeUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/RuntimeUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/RuntimeUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/StreamLocation.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/StreamLocation.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/StreamLocation.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/StreamLocation.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/StreamOutput.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/StreamOutput.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/runtime/StreamOutput.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/runtime/StreamOutput.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentStartComparator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentStartComparator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentStartComparator.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentStartComparator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentStartWithNoTiesComparator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentStartWithNoTiesComparator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentStartWithNoTiesComparator.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentStartWithNoTiesComparator.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/AlignmentUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilder.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialGATKSAMFileWriter.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialGATKSAMFileWriter.java new file mode 100644 index 000000000..99fd76213 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialGATKSAMFileWriter.java @@ -0,0 +1,129 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software 
and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.sam; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.ProgressLoggerInterface; + +import java.util.ArrayList; +import java.util.List; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + *

+ * Class ArtificialGATKSAMFileWriter + *

+ * generates a fake samwriter, that you can get the output reads + * from when you're done. + */ +public class ArtificialGATKSAMFileWriter implements GATKSAMFileWriter { + + // are we closed + private boolean closed = false; + + // the SAMRecords we've added to this writer + List records = new ArrayList(); + + public void addAlignment( SAMRecord alignment ) { + records.add(alignment); + } + + public SAMFileHeader getFileHeader() { + if (records.size() > 0) { + return records.get(0).getHeader(); + } + return null; + } + + /** not much to do when we're fake */ + public void close() { + closed = true; + } + + /** + * are we closed? + * + * @return true if we're closed + */ + public boolean isClosed() { + return closed; + } + + /** + * get the records we've seen + * @return + */ + public List getRecords() { + return records; + } + + @Override + public void writeHeader(SAMFileHeader header) { + } + + @Override + public void setPresorted(boolean presorted) { + } + + @Override + public void setMaxRecordsInRam(int maxRecordsInRam) { + } + + /** + * @throws java.lang.UnsupportedOperationException No progress logging in this implementation. 
+ */ + @Override + public void setProgressLogger(final ProgressLoggerInterface logger) { + throw new UnsupportedOperationException("Progress logging not supported"); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialMultiSampleReadStream.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialMultiSampleReadStream.java new file mode 100644 index 000000000..a468dc06b --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialMultiSampleReadStream.java @@ -0,0 +1,87 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.sam; + +import htsjdk.samtools.MergingSamRecordIterator; +import htsjdk.samtools.SamFileHeaderMerger; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFileReader; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.*; + +/** + * Simple wrapper class that multiplexes multiple ArtificialSingleSampleReadStreams into a single stream of reads + * + * @author David Roazen + */ +public class ArtificialMultiSampleReadStream implements Iterable { + + private Collection perSampleArtificialReadStreams; + private MergingSamRecordIterator mergingIterator; + + public ArtificialMultiSampleReadStream( Collection perSampleArtificialReadStreams ) { + if ( perSampleArtificialReadStreams == null || perSampleArtificialReadStreams.isEmpty() ) { + throw new ReviewedGATKException("Can't create an ArtificialMultiSampleReadStream out of 0 ArtificialSingleSampleReadStreams"); + } + + this.perSampleArtificialReadStreams = perSampleArtificialReadStreams; + } + + public Iterator iterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return mergingIterator; + } + + public GATKSAMIterator getGATKSAMIterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return GATKSAMIteratorAdapter.adapt(mergingIterator); + } + + private void initialize() { + Collection perSampleSAMReaders = new ArrayList(perSampleArtificialReadStreams.size()); + Collection headers = new ArrayList(perSampleArtificialReadStreams.size()); + + for ( ArtificialSingleSampleReadStream readStream : perSampleArtificialReadStreams ) { + Collection thisStreamReads = readStream.makeReads(); + + SAMFileReader reader = new 
ArtificialSAMFileReader(readStream.getHeader(), + thisStreamReads.toArray(new SAMRecord[thisStreamReads.size()])); + perSampleSAMReaders.add(reader); + headers.add(reader.getFileHeader()); + } + + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); + mergingIterator = new MergingSamRecordIterator(headerMerger, perSampleSAMReaders, true); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialPatternedSAMIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialPatternedSAMIterator.java new file mode 100644 index 000000000..e82355da9 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialPatternedSAMIterator.java @@ -0,0 +1,172 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.sam; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * @author aaron + * + * Class ArtificialPatternedSAMIterator + * + * This class allows you to pattern the artificial sam iterator, asking for reads + * in order or out of order. 
+ */ +public class ArtificialPatternedSAMIterator extends ArtificialSAMIterator { + + /** the pattern we're implementing */ + public enum PATTERN { + RANDOM_READS, IN_ORDER_READS; + } + + // our pattern + private final PATTERN mPattern; + + /** + * this is pretty heavy (and it could be extremely heavy, given the amount of reads they request, but it + * allows us to give them each read once, reguardless of the order specified + */ + private final int[] reads; + private final int readCount; + + /** + * create the fake iterator, given the mapping of chromosomes and read counts. If pattern + * is specified to be random, it will generate reads that are randomly placed on the current chromosome + * + * @param startingChr the starting chromosome + * @param endingChr the ending chromosome + * @param readCount the number of reads in each chromosome + * @param header the associated header + * @param pattern the pattern to implement + */ + public ArtificialPatternedSAMIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount, SAMFileHeader header, PATTERN pattern ) { + super(startingChr, endingChr, readCount, unmappedReadCount, header); + mPattern = pattern; + this.readCount = readCount; + reads = new int[readCount]; + + for (int x = 0; x < readCount; x++) { + reads[x] = x+1; + } + if (pattern == PATTERN.RANDOM_READS) { + // scramble a bunch of the reads + for (int y = 0; y < readCount; y++) { + int ranOne = (int) Math.round(Math.random() * ( readCount - 1 )); + int ranTwo = (int) Math.round(Math.random() * ( readCount - 1 )); + int temp = reads[ranOne]; + reads[ranOne] = reads[ranTwo]; + reads[ranTwo] = temp; + } + /** + * up to this point there's no garauntee that the random() has made the reads out of order (though it's + * extremely extremely unlikely it's failed). 
Let's make sure there at least out of order: + */ + if (this.reads[0] < this.reads[reads.length - 1]) { + int temp = reads[0]; + reads[0] = reads[reads.length - 1]; + reads[reads.length - 1] = temp; + } + + } + + } + + /** + * override the default ArtificialSAMIterator createNextRead method, which creates the next read + * + * @return + */ + protected boolean createNextRead() { + if (currentRead > rCount) { + currentChromo++; + currentRead = 1; + } + // check for end condition, have we finished the chromosome listing, and have no unmapped reads + if (currentChromo >= eChromosomeCount) { + if (unmappedRemaining < 1) { + this.next = null; + return false; + } else { + ++totalReadCount; + this.next = ArtificialSAMUtils.createArtificialRead(this.header, + String.valueOf(totalReadCount), + SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, + 50); + --unmappedRemaining; + return true; + } + } + ++totalReadCount; + this.next = getNextRecord(currentRead); + + ++currentRead; + return true; + } + + + /** + * get the next read, given it's index in the chromosome + * + * @param read the read index in the chromosome + * + * @return a SAMRecord + */ + private SAMRecord getNextRecord( int read ) { + if (read > this.readCount) { + return ArtificialSAMUtils.createArtificialRead(this.header, String.valueOf(reads[readCount - 1]), currentChromo, reads[readCount - 1], 50); + } + return ArtificialSAMUtils.createArtificialRead(this.header, String.valueOf(reads[read-1]), currentChromo, reads[read-1], 50); + } + +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileReader.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileReader.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileReader.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileReader.java diff --git 
a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMIterator.java new file mode 100644 index 000000000..e4bdfbc23 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMIterator.java @@ -0,0 +1,212 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.sam; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; + +import java.util.Iterator; + + +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/** this fake iterator allows us to look at how specific piles of reads are handled */ +public class ArtificialSAMIterator implements GATKSAMIterator { + + + protected int currentChromo = 0; + protected int currentRead = 1; + protected int totalReadCount = 0; + protected int unmappedRemaining = 0; + protected boolean done = false; + // the next record + protected SAMRecord next = null; + protected SAMFileHeader header = null; + + // the passed in parameters + protected final int sChr; + protected final int eChromosomeCount; + protected final int rCount; + protected final int unmappedReadCount; + + // let us know to make a read, we need this to help out the fake sam query iterator + private boolean initialized = false; + + /** + * Is this iterator currently open or closed? Closed iterators can be reused. + */ + protected boolean open = false; + + /** + * create the fake iterator, given the mapping of chromosomes and read counts + * + * @param startingChr the starting chromosome + * @param endingChr the ending chromosome + * @param readCount the number of reads in each chromosome + * @param header the associated header + */ + ArtificialSAMIterator( int startingChr, int endingChr, int readCount, SAMFileHeader header ) { + sChr = startingChr; + eChromosomeCount = (endingChr - startingChr) + 1; + rCount = readCount; + this.header = header; + unmappedReadCount = 0; + reset(); + } + + protected void reset() { + this.currentChromo = 0; + this.currentRead = 1; + this.totalReadCount = 0; + this.done = false; + this.next = null; + this.initialized = false; + this.unmappedRemaining = unmappedReadCount; + } + + /** + * create the fake iterator, given the mapping of chromosomes and read counts + * + * @param startingChr the starting chromosome + * @param endingChr the ending chromosome + * @param readCount the number of reads in each chromosome + * @param header the associated header + */ + ArtificialSAMIterator( int startingChr, int endingChr, int readCount, int 
unmappedReadCount, SAMFileHeader header ) { + sChr = startingChr; + eChromosomeCount = (endingChr - startingChr) + 1; + rCount = readCount; + this.header = header; + this.currentChromo = 0; + this.unmappedReadCount = unmappedReadCount; + reset(); + } + + public void close() { + open = false; + } + + public boolean hasNext() { + open = true; + + if (!initialized){ + initialized = true; + createNextRead(); + } + if (this.next != null) { + return true; + } + return false; + } + + protected boolean createNextRead() { + if (currentRead > rCount) { + currentChromo++; + currentRead = 1; + } + // check for end condition, have we finished the chromosome listing, and have no unmapped reads + if (currentChromo >= eChromosomeCount) { + if (unmappedRemaining < 1) { + this.next = null; + return false; + } else { + ++totalReadCount; + this.next = ArtificialSAMUtils.createArtificialRead(this.header, + String.valueOf(totalReadCount), + SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, + 50); + --unmappedRemaining; + return true; + } + } + ++totalReadCount; + this.next = ArtificialSAMUtils.createArtificialRead(this.header, String.valueOf(totalReadCount), currentChromo, currentRead, 50); + ++currentRead; + return true; + } + + + public SAMRecord next() { + open = true; + + SAMRecord ret = next; + createNextRead(); + return ret; + } + + public void remove() { + throw new UnsupportedOperationException("You've tried to remove on a GATKSAMIterator (unsupported), not to mention that this is a fake iterator."); + } + + /** + * return this iterator, for the iterable interface + * @return + */ + public Iterator iterator() { + return this; + } + + /** + * some instrumentation methods + */ + public int readsTaken() { + return totalReadCount; + } + + /** + * peek at the next sam record + * + * @return + */ + public SAMRecord peek() { + return this.next; + } +} diff --git 
a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMQueryIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMQueryIterator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMQueryIterator.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMQueryIterator.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtils.java new file mode 100644 index 000000000..bad7ef643 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtils.java @@ -0,0 +1,484 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.sam; + +import htsjdk.samtools.*; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState; +import org.broadinstitute.gatk.utils.pileup.PileupElement; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileupImpl; + +import java.io.File; +import java.util.*; + +/** + * @author aaron + * @version 1.0 + */ +public class ArtificialSAMUtils { + public static final int DEFAULT_READ_LENGTH = 50; + + /** + * create an artificial sam file + * + * @param filename the filename to write to + * @param numberOfChromosomes the number of chromosomes + * @param startingChromosome where to start counting + * @param chromosomeSize how large each chromosome is + * @param readsPerChomosome how many reads to make in each chromosome. 
They'll be aligned from position 1 to x (which is the number of reads) + */ + public static void createArtificialBamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { + SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); + File outFile = new File(filename); + + SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile); + + for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { + for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) { + out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH)); + } + } + + out.close(); + } + + /** + * create an artificial sam file + * + * @param filename the filename to write to + * @param numberOfChromosomes the number of chromosomes + * @param startingChromosome where to start counting + * @param chromosomeSize how large each chromosome is + * @param readsPerChomosome how many reads to make in each chromosome. 
They'll be aligned from position 1 to x (which is the number of reads) + */ + public static void createArtificialSamFile(String filename, int numberOfChromosomes, int startingChromosome, int chromosomeSize, int readsPerChomosome) { + SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize); + File outFile = new File(filename); + + SAMFileWriter out = new SAMFileWriterFactory().makeSAMWriter(header, false, outFile); + + for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { + for (int readNumber = 1; readNumber <= readsPerChomosome; readNumber++) { + out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, 100)); + } + } + + out.close(); + } + + /** + * Creates an artificial sam header, matching the parameters, chromosomes which will be labeled chr1, chr2, etc + * + * @param numberOfChromosomes the number of chromosomes to create + * @param startingChromosome the starting number for the chromosome (most likely set to 1) + * @param chromosomeSize the length of each chromosome + * @return + */ + public static SAMFileHeader createArtificialSamHeader(int numberOfChromosomes, int startingChromosome, int chromosomeSize) { + SAMFileHeader header = new SAMFileHeader(); + header.setSortOrder(htsjdk.samtools.SAMFileHeader.SortOrder.coordinate); + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + // make up some sequence records + for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) { + SAMSequenceRecord rec = new SAMSequenceRecord("chr" + (x), chromosomeSize /* size */); + rec.setSequenceLength(chromosomeSize); + dict.addSequence(rec); + } + header.setSequenceDictionary(dict); + return header; + } + + /** + * Creates an artificial sam header based on the sequence dictionary dict + * + * @return a new sam header + */ + public static SAMFileHeader createArtificialSamHeader(final SAMSequenceDictionary dict) { + 
SAMFileHeader header = new SAMFileHeader(); + header.setSortOrder(htsjdk.samtools.SAMFileHeader.SortOrder.coordinate); + header.setSequenceDictionary(dict); + return header; + } + + /** + * Creates an artificial sam header with standard test parameters + * + * @return the sam header + */ + public static SAMFileHeader createArtificialSamHeader() { + return createArtificialSamHeader(1, 1, 1000000); + } + + /** + * setup a default read group for a SAMFileHeader + * + * @param header the header to set + * @param readGroupID the read group ID tag + * @param sampleName the sample name + * @return the adjusted SAMFileHeader + */ + public static SAMFileHeader createDefaultReadGroup(SAMFileHeader header, String readGroupID, String sampleName) { + SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupID); + rec.setSample(sampleName); + List readGroups = new ArrayList(); + readGroups.add(rec); + header.setReadGroups(readGroups); + return header; + } + + /** + * setup read groups for the specified read groups and sample names + * + * @param header the header to set + * @param readGroupIDs the read group ID tags + * @param sampleNames the sample names + * @return the adjusted SAMFileHeader + */ + public static SAMFileHeader createEnumeratedReadGroups(SAMFileHeader header, List readGroupIDs, List sampleNames) { + if (readGroupIDs.size() != sampleNames.size()) { + throw new ReviewedGATKException("read group count and sample name count must be the same"); + } + + List readGroups = new ArrayList(); + + int x = 0; + for (; x < readGroupIDs.size(); x++) { + SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupIDs.get(x)); + rec.setSample(sampleNames.get(x)); + readGroups.add(rec); + } + header.setReadGroups(readGroups); + return header; + } + + + /** + * Create an artificial read based on the parameters. 
The cigar string will be *M, where * is the length of the read + * + * @param header the SAM header to associate the read with + * @param name the name of the read + * @param refIndex the reference index, i.e. what chromosome to associate it with + * @param alignmentStart where to start the alignment + * @param length the length of the read + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, int length) { + if ((refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart != SAMRecord.NO_ALIGNMENT_START) || + (refIndex != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX && alignmentStart == SAMRecord.NO_ALIGNMENT_START)) + throw new ReviewedGATKException("Invalid alignment start for artificial read, start = " + alignmentStart); + GATKSAMRecord record = new GATKSAMRecord(header); + record.setReadName(name); + record.setReferenceIndex(refIndex); + record.setAlignmentStart(alignmentStart); + List elements = new ArrayList(); + elements.add(new CigarElement(length, CigarOperator.characterToEnum('M'))); + record.setCigar(new Cigar(elements)); + record.setProperPairFlag(false); + + // our reads and quals are all 'A's by default + byte[] c = new byte[length]; + byte[] q = new byte[length]; + for (int x = 0; x < length; x++) + c[x] = q[x] = 'A'; + record.setReadBases(c); + record.setBaseQualities(q); + + if (refIndex == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + record.setReadUnmappedFlag(true); + } + + return record; + } + + /** + * Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read + * + * @param header the SAM header to associate the read with + * @param name the name of the read + * @param refIndex the reference index, i.e. 
what chromosome to associate it with + * @param alignmentStart where to start the alignment + * @param bases the sequence of the read + * @param qual the qualities of the read + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual) { + if (bases.length != qual.length) { + throw new ReviewedGATKException("Passed in read string is different length then the quality array"); + } + GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length); + rec.setReadBases(bases); + rec.setBaseQualities(qual); + rec.setReadGroup(new GATKSAMReadGroupRecord("x")); + if (refIndex == -1) { + rec.setReadUnmappedFlag(true); + } + + return rec; + } + + /** + * Create an artificial read based on the parameters + * + * @param header the SAM header to associate the read with + * @param name the name of the read + * @param refIndex the reference index, i.e. 
what chromosome to associate it with + * @param alignmentStart where to start the alignment + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead(SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual, String cigar) { + GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases, qual); + rec.setCigarString(cigar); + return rec; + } + + /** + * Create an artificial read with the following default parameters : + * header: + * numberOfChromosomes = 1 + * startingChromosome = 1 + * chromosomeSize = 1000000 + * read: + * name = "default_read" + * refIndex = 0 + * alignmentStart = 1 + * + * @param bases the sequence of the read + * @param qual the qualities of the read + * @param cigar the cigar string of the read + * @return the artificial read + */ + public static GATKSAMRecord createArtificialRead(byte[] bases, byte[] qual, String cigar) { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); + return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, qual, cigar); + } + + public static GATKSAMRecord createArtificialRead(Cigar cigar) { + int length = cigar.getReadLength(); + byte [] base = {'A'}; + byte [] qual = {30}; + byte [] bases = Utils.arrayFromArrayWithLength(base, length); + byte [] quals = Utils.arrayFromArrayWithLength(qual, length); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); + return ArtificialSAMUtils.createArtificialRead(header, "default_read", 0, 10000, bases, quals, cigar.toString()); + } + + + public final static List createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { + GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, 
readLen); + GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); + + left.setReadPairedFlag(true); + right.setReadPairedFlag(true); + + left.setProperPairFlag(true); + right.setProperPairFlag(true); + + left.setFirstOfPairFlag(leftIsFirst); + right.setFirstOfPairFlag(!leftIsFirst); + + left.setReadNegativeStrandFlag(leftIsNegative); + left.setMateNegativeStrandFlag(!leftIsNegative); + right.setReadNegativeStrandFlag(!leftIsNegative); + right.setMateNegativeStrandFlag(leftIsNegative); + + left.setMateAlignmentStart(right.getAlignmentStart()); + right.setMateAlignmentStart(left.getAlignmentStart()); + + left.setMateReferenceIndex(0); + right.setMateReferenceIndex(0); + + int isize = rightStart + readLen - leftStart; + left.setInferredInsertSize(isize); + right.setInferredInsertSize(-isize); + + return Arrays.asList(left, right); + } + + /** + * Create a collection of identical artificial reads based on the parameters. The cigar string for each + * read will be *M, where * is the length of the read. + * + * Useful for testing things like positional downsampling where you care only about the position and + * number of reads, and not the other attributes. + * + * @param stackSize number of identical reads to create + * @param header the SAM header to associate each read with + * @param name name associated with each read + * @param refIndex the reference index, i.e. 
what chromosome to associate them with + * @param alignmentStart where to start each alignment + * @param length the length of each read + * + * @return a collection of stackSize reads all sharing the above properties + */ + public static Collection createStackOfIdenticalArtificialReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + Collection stack = new ArrayList(stackSize); + for ( int i = 1; i <= stackSize; i++ ) { + stack.add(createArtificialRead(header, name, refIndex, alignmentStart, length)); + } + return stack; + } + + /** + * create an iterator containing the specified read piles + * + * @param startingChr the chromosome (reference ID) to start from + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * @return GATKSAMIterator representing the specified amount of fake data + */ + public static GATKSAMIterator mappedReadIterator(int startingChr, int endingChr, int readCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + + return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); + } + + /** + * create an iterator containing the specified read piles + * + * @param startingChr the chromosome (reference ID) to start from + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file + * @return GATKSAMIterator representing the specified amount of fake data + */ + public static GATKSAMIterator mappedAndUnmappedReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + + return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 
unmappedReadCount, header); + } + + /** + * create an ArtificialSAMQueryIterator containing the specified read piles + * + * @param startingChr the chromosome (reference ID) to start from + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * @return GATKSAMIterator representing the specified amount of fake data + */ + public static ArtificialSAMQueryIterator queryReadIterator(int startingChr, int endingChr, int readCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + + return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header); + } + + /** + * create an ArtificialSAMQueryIterator containing the specified read piles + * + * @param startingChr the chromosome (reference ID) to start from + * @param endingChr the id to end with + * @param readCount the number of reads per chromosome + * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file + * @return GATKSAMIterator representing the specified amount of fake data + */ + public static GATKSAMIterator queryReadIterator(int startingChr, int endingChr, int readCount, int unmappedReadCount) { + SAMFileHeader header = createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + + return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); + } + + /** + * Create an iterator containing the specified reads + * + * @param reads the reads + * @return iterator for the reads + */ + public static GATKSAMIterator createReadIterator(SAMRecord... 
reads) { + return createReadIterator(Arrays.asList(reads)); + } + + /** + * Create an iterator containing the specified reads + * + * @param reads the reads + * @return iterator for the reads + */ + public static GATKSAMIterator createReadIterator(List reads) { + final Iterator iter = reads.iterator(); + return new GATKSAMIterator() { + @Override public void close() {} + @Override public Iterator iterator() { return iter; } + @Override public boolean hasNext() { return iter.hasNext(); } + @Override public SAMRecord next() { return iter.next(); } + @Override public void remove() { iter.remove(); } + }; + } + + private final static int ranIntInclusive(Random ran, int start, int stop) { + final int range = stop - start; + return ran.nextInt(range) + start; + } + + /** + * Creates a read backed pileup containing up to pileupSize reads at refID 0 from header at loc with + * reads created that have readLen bases. Pairs are sampled from a gaussian distribution with mean insert + * size of insertSize and variation of insertSize / 10. The first read will be in the pileup, and the second + * may be, depending on where this sampled insertSize puts it. 
+ * + * @param header + * @param loc + * @param readLen + * @param insertSize + * @param pileupSize + * @return + */ + public static ReadBackedPileup createReadBackedPileup(final SAMFileHeader header, final GenomeLoc loc, final int readLen, final int insertSize, final int pileupSize) { + final Random ran = new Random(); + final boolean leftIsFirst = true; + final boolean leftIsNegative = false; + final int insertSizeVariation = insertSize / 10; + final int pos = loc.getStart(); + + final List pileupElements = new ArrayList(); + for (int i = 0; i < pileupSize / 2; i++) { + final String readName = "read" + i; + final int leftStart = ranIntInclusive(ran, 1, pos); + final int fragmentSize = (int) (ran.nextGaussian() * insertSizeVariation + insertSize); + final int rightStart = leftStart + fragmentSize - readLen; + + if (rightStart <= 0) continue; + + List pair = createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); + final GATKSAMRecord left = pair.get(0); + final GATKSAMRecord right = pair.get(1); + + pileupElements.add(LocusIteratorByState.createPileupForReadAndOffset(left, pos - leftStart)); + + if (pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) { + pileupElements.add(LocusIteratorByState.createPileupForReadAndOffset(right, pos - rightStart)); + } + } + + Collections.sort(pileupElements); + return new ReadBackedPileupImpl(loc, pileupElements); + } +} diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStream.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStream.java new file mode 100644 index 000000000..93d27f7d5 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStream.java @@ -0,0 +1,213 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and 
associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.sam; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIteratorAdapter; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; + +/** + * An artificial stream of reads from a single read group/sample with configurable characteristics + * such as: + * + * -the number of contigs that the reads should be distributed across + * -number of "stacks" of reads sharing the same alignment start position per contig + * -the min/max number of reads in each stack (exact values chosen randomly from this range) + * -the min/max distance between stack start positions (exact values chosen randomly from this range) + * -the min/max length of each read (exact values chosen randomly from 
this range) + * -the number of unmapped reads + * + * The cigar string for all reads will be *M, where * is the length of the read. + * + * @author David Roazen + */ +public class ArtificialSingleSampleReadStream implements Iterable { + private SAMFileHeader header; + private String readGroupID; + private int numContigs; + private int numStacksPerContig; + private int minReadsPerStack; + private int maxReadsPerStack; + private int minDistanceBetweenStacks; + private int maxDistanceBetweenStacks; + private int minReadLength; + private int maxReadLength; + private int numUnmappedReads; + + private static final String READ_GROUP_TAG = "RG"; + + public ArtificialSingleSampleReadStream( SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + this.header = header; + this.readGroupID = readGroupID; + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + this.minReadLength = minReadLength; + this.maxReadLength = maxReadLength; + this.numUnmappedReads = numUnmappedReads; + + validateStreamParameters(); + } + + private void validateStreamParameters() { + if ( header == null || readGroupID == null ) { + throw new ReviewedGATKException("null SAMFileHeader or read group ID") ; + } + + if ( header.getReadGroup(readGroupID) == null ) { + throw new ReviewedGATKException(String.format("Read group %s not found in SAMFileHeader", readGroupID)); + } + + if ( numContigs < 0 || numStacksPerContig < 0 || minReadsPerStack < 0 || maxReadsPerStack < 0 || + minDistanceBetweenStacks < 0 || maxDistanceBetweenStacks < 0 || minReadLength < 0 || 
maxReadLength < 0 || + numUnmappedReads < 0 ) { + throw new ReviewedGATKException("Read stream parameters must be >= 0"); + } + + if ( (numContigs == 0 && numStacksPerContig != 0) || (numContigs != 0 && numStacksPerContig == 0) ) { + throw new ReviewedGATKException("numContigs and numStacksPerContig must either both be > 0, or both be 0"); + } + + if ( minReadsPerStack > maxReadsPerStack ) { + throw new ReviewedGATKException("minReadsPerStack > maxReadsPerStack"); + } + + if ( minDistanceBetweenStacks > maxDistanceBetweenStacks ) { + throw new ReviewedGATKException("minDistanceBetweenStacks > maxDistanceBetweenStacks"); + } + + if ( minReadLength > maxReadLength ) { + throw new ReviewedGATKException("minReadLength > maxReadLength"); + } + } + + public Iterator iterator() { + return makeReads().iterator(); + } + + public GATKSAMIterator getGATKSAMIterator() { + return GATKSAMIteratorAdapter.adapt(iterator()); + } + + public Collection makeReads() { + Collection reads = new ArrayList(numContigs * numStacksPerContig * maxReadsPerStack); + + for ( int contig = 0; contig < numContigs; contig++ ) { + int alignmentStart = 1; + + for ( int stack = 0; stack < numStacksPerContig; stack++ ) { + reads.addAll(makeReadStack(contig, alignmentStart, MathUtils.randomIntegerInRange(minReadsPerStack, maxReadsPerStack))); + alignmentStart += MathUtils.randomIntegerInRange(minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + if ( numUnmappedReads > 0 ) { + reads.addAll(makeReadStack(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START, numUnmappedReads)); + } + + return reads; + } + + private Collection makeReadStack( int contig, int alignmentStart, int stackSize ) { + Collection readStack = new ArrayList(stackSize); + + for ( int i = 0; i < stackSize; i++ ) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, + "foo", + contig, + alignmentStart, + MathUtils.randomIntegerInRange(minReadLength, maxReadLength)); + 
read.setAttribute(READ_GROUP_TAG, readGroupID); + readStack.add(read); + } + + return readStack; + } + + public SAMFileHeader getHeader() { + return header; + } + + public String getReadGroupID() { + return readGroupID; + } + + public int getNumContigs() { + return numContigs; + } + + public int getNumStacksPerContig() { + return numStacksPerContig; + } + + public int getMinReadsPerStack() { + return minReadsPerStack; + } + + public int getMaxReadsPerStack() { + return maxReadsPerStack; + } + + public int getMinDistanceBetweenStacks() { + return minDistanceBetweenStacks; + } + + public int getMaxDistanceBetweenStacks() { + return maxDistanceBetweenStacks; + } + + public int getMinReadLength() { + return minReadLength; + } + + public int getMaxReadLength() { + return maxReadLength; + } + + public int getNumUnmappedReads() { + return numUnmappedReads; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/CigarUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/CigarUtils.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/CigarUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/CigarUtils.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMFileWriter.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMFileWriter.java new file mode 
100644 index 000000000..7949dd49e --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMFileWriter.java @@ -0,0 +1,56 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.sam; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMFileWriter; + +/** + * A writer that will allow unsorted BAM files to be written + * and sorted on-the-fly. + * + * @author mhanna + * @version 0.1 + */ +public interface GATKSAMFileWriter extends SAMFileWriter { + /** + * Writes the given custom header to SAM file output. + * @param header The header to write. + */ + public void writeHeader(SAMFileHeader header); + + /** + * Set Whether the BAM file to create is actually presorted. + * @param presorted True if the BAM file is presorted. False otherwise. 
+ */ + public void setPresorted(boolean presorted); + + /** + * Set how many records in RAM the BAM file stores when sorting on-the-fly. + * @param maxRecordsInRam Max number of records in RAM. + */ + public void setMaxRecordsInRam(int maxRecordsInRam); +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMReadGroupRecord.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMReadGroupRecord.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMReadGroupRecord.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMReadGroupRecord.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecord.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecord.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecord.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecord.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecordIterator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecordIterator.java new file mode 100644 index 000000000..cc2b77895 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecordIterator.java @@ -0,0 +1,58 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject 
to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.sam; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.CloseableIterator; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.Iterator; + +/** + * Temporarily hack to convert SAMRecords to GATKSAMRecords + * + * User: depristo + * Date: 1/11/13 + * Time: 1:19 PM + */ +public class GATKSAMRecordIterator implements CloseableIterator, Iterable { + final CloseableIterator it; + + public GATKSAMRecordIterator(final CloseableIterator it) { + this.it = it; + } + + public GATKSAMRecordIterator(final GATKSAMIterator it) { + this.it = it; + } + + @Override public boolean hasNext() { return it.hasNext(); } + @Override public GATKSAMRecord next() { return (GATKSAMRecord)it.next(); } + @Override public void remove() { it.remove(); } + @Override public void close() { it.close(); } + @Override public Iterator iterator() { return this; } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSamRecordFactory.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSamRecordFactory.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSamRecordFactory.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/GATKSamRecordFactory.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUnclippedStartWithNoTiesComparator.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUnclippedStartWithNoTiesComparator.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUnclippedStartWithNoTiesComparator.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUnclippedStartWithNoTiesComparator.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java new file mode 100644 index 000000000..ce56a329f --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/ReadUtils.java @@ -0,0 +1,957 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.sam; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.samtools.*; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.io.File; +import java.util.*; + +/** + * A miscellaneous collection of utilities for working with SAM files, headers, etc. + * Static methods only, please. + * + * @author mhanna + * @version 0.1 + */ +public class ReadUtils { + private final static Logger logger = Logger.getLogger(ReadUtils.class); + + private static final String OFFSET_OUT_OF_BOUNDS_EXCEPTION = "Offset cannot be greater than read length %d : %d"; + private static final String OFFSET_NOT_ZERO_EXCEPTION = "We ran past the end of the read and never found the offset, something went wrong!"; + + private ReadUtils() { + } + + private static final int DEFAULT_ADAPTOR_SIZE = 100; + public static final int CLIPPING_GOAL_NOT_REACHED = -1; + + /** + * Pull out the samples from a SAMFileHeader; + * note that we use a TreeSet so that they are sorted + * + * @param header the sam file header + * @return list of strings representing the sample names + */ + public static Set getSAMFileSamples(final SAMFileHeader header) { + // get all of the unique sample names + final Set samples = new TreeSet(); + List readGroups = header.getReadGroups(); + for ( SAMReadGroupRecord readGroup : readGroups ) + samples.add(readGroup.getSample()); + return samples; + } + + /** + * A marker to tell which end of the read has been clipped + */ + public enum ClippingTail { + LEFT_TAIL, + 
RIGHT_TAIL + } + + /** + * A HashMap of the SAM spec read flag names + * + * Note: This is not being used right now, but can be useful in the future + */ + private static final Map readFlagNames = new HashMap(); + + static { + readFlagNames.put(0x1, "Paired"); + readFlagNames.put(0x2, "Proper"); + readFlagNames.put(0x4, "Unmapped"); + readFlagNames.put(0x8, "MateUnmapped"); + readFlagNames.put(0x10, "Forward"); + //readFlagNames.put(0x20, "MateForward"); + readFlagNames.put(0x40, "FirstOfPair"); + readFlagNames.put(0x80, "SecondOfPair"); + readFlagNames.put(0x100, "NotPrimary"); + readFlagNames.put(0x200, "NON-PF"); + readFlagNames.put(0x400, "Duplicate"); + } + + /** + * This enum represents all the different ways in which a read can overlap an interval. + * + * NO_OVERLAP_CONTIG: + * read and interval are in different contigs. + * + * NO_OVERLAP_LEFT: + * the read does not overlap the interval. + * + * |----------------| (interval) + * <----------------> (read) + * + * NO_OVERLAP_RIGHT: + * the read does not overlap the interval. 
+ * + * |----------------| (interval) + * <----------------> (read) + * + * OVERLAP_LEFT: + * the read starts before the beginning of the interval but ends inside of it + * + * |----------------| (interval) + * <----------------> (read) + * + * OVERLAP_RIGHT: + * the read starts inside the interval but ends outside of it + * + * |----------------| (interval) + * <----------------> (read) + * + * OVERLAP_LEFT_AND_RIGHT: + * the read starts before the interval and ends after the interval + * + * |-----------| (interval) + * <-------------------> (read) + * + * OVERLAP_CONTAINED: + * the read starts and ends inside the interval + * + * |----------------| (interval) + * <--------> (read) + */ + public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} + + /** + * is this base inside the adaptor of the read? + * + * There are two cases to treat here: + * + * 1) Read is in the negative strand => Adaptor boundary is on the left tail + * 2) Read is in the positive strand => Adaptor boundary is on the right tail + * + * Note: We return false to all reads that are UNMAPPED or have an weird big insert size (probably due to mismapping or bigger event) + * + * @param read the read to test + * @param basePos base position in REFERENCE coordinates (not read coordinates) + * @return whether or not the base is in the adaptor + */ + public static boolean isBaseInsideAdaptor(final GATKSAMRecord read, long basePos) { + final int adaptorBoundary = read.getAdaptorBoundary(); + if (adaptorBoundary == CANNOT_COMPUTE_ADAPTOR_BOUNDARY || read.getInferredInsertSize() > DEFAULT_ADAPTOR_SIZE) + return false; + + return read.getReadNegativeStrandFlag() ? 
basePos <= adaptorBoundary : basePos >= adaptorBoundary; + } + + /** + * Finds the adaptor boundary around the read and returns the first base inside the adaptor that is closest to + * the read boundary. If the read is in the positive strand, this is the first base after the end of the + * fragment (Picard calls it 'insert'), if the read is in the negative strand, this is the first base before the + * beginning of the fragment. + * + * There are two cases we need to treat here: + * + * 1) Our read is in the reverse strand : + * + * <----------------------| * + * |---------------------> + * + * in these cases, the adaptor boundary is at the mate start (minus one) + * + * 2) Our read is in the forward strand : + * + * |----------------------> * + * <----------------------| + * + * in these cases the adaptor boundary is at the start of the read plus the inferred insert size (plus one) + * + * @param read the read being tested for the adaptor boundary + * @return the reference coordinate for the adaptor boundary (effectively the first base IN the adaptor, closest to the read. + * CANNOT_COMPUTE_ADAPTOR_BOUNDARY if the read is unmapped or the mate is mapped to another contig. + */ + public static int getAdaptorBoundary(final SAMRecord read) { + if ( ! hasWellDefinedFragmentSize(read) ) { + return CANNOT_COMPUTE_ADAPTOR_BOUNDARY; + } else if ( read.getReadNegativeStrandFlag() ) { + return read.getMateAlignmentStart() - 1; // case 1 (see header) + } else { + final int insertSize = Math.abs(read.getInferredInsertSize()); // the inferred insert size can be negative if the mate is mapped before the read (so we take the absolute value) + return read.getAlignmentStart() + insertSize + 1; // case 2 (see header) + } + } + + public static int CANNOT_COMPUTE_ADAPTOR_BOUNDARY = Integer.MIN_VALUE; + + /** + * Can the adaptor sequence of read be reliably removed from the read based on the alignment of + * read and its mate? 
+ * + * @param read the read to check + * @return true if it can, false otherwise + */ + public static boolean hasWellDefinedFragmentSize(final SAMRecord read) { + if ( read.getInferredInsertSize() == 0 ) + // no adaptors in reads with mates in another chromosome or unmapped pairs + return false; + if ( ! read.getReadPairedFlag() ) + // only reads that are paired can be adaptor trimmed + return false; + if ( read.getReadUnmappedFlag() || read.getMateUnmappedFlag() ) + // only reads when both reads are mapped can be trimmed + return false; +// if ( ! read.getProperPairFlag() ) +// // note this flag isn't always set properly in BAMs, can will stop us from eliminating some proper pairs +// // reads that aren't part of a proper pair (i.e., have strange alignments) can't be trimmed +// return false; + if ( read.getReadNegativeStrandFlag() == read.getMateNegativeStrandFlag() ) + // sanity check on getProperPairFlag to ensure that read1 and read2 aren't on the same strand + return false; + + if ( read.getReadNegativeStrandFlag() ) { + // we're on the negative strand, so our read runs right to left + return read.getAlignmentEnd() > read.getMateAlignmentStart(); + } else { + // we're on the positive strand, so our mate should be to our right (his start + insert size should be past our start) + return read.getAlignmentStart() <= read.getMateAlignmentStart() + read.getInferredInsertSize(); + } + } + + /** + * is the read a 454 read? + * + * @param read the read to test + * @return checks the read group tag PL for the default 454 tag + */ + public static boolean is454Read(GATKSAMRecord read) { + return NGSPlatform.fromRead(read) == NGSPlatform.LS454; + } + + /** + * is the read an IonTorrent read? + * + * @param read the read to test + * @return checks the read group tag PL for the default ion tag + */ + public static boolean isIonRead(GATKSAMRecord read) { + return NGSPlatform.fromRead(read) == NGSPlatform.ION_TORRENT; + } + + /** + * is the read a SOLiD read? 
+ * + * @param read the read to test + * @return checks the read group tag PL for the default SOLiD tag + */ + public static boolean isSOLiDRead(GATKSAMRecord read) { + return NGSPlatform.fromRead(read) == NGSPlatform.SOLID; + } + + /** + * is the read a SLX read? + * + * @param read the read to test + * @return checks the read group tag PL for the default SLX tag + */ + public static boolean isIlluminaRead(GATKSAMRecord read) { + return NGSPlatform.fromRead(read) == NGSPlatform.ILLUMINA; + } + + /** + * checks if the read has a platform tag in the readgroup equal to 'name'. + * Assumes that 'name' is upper-cased. + * + * @param read the read to test + * @param name the upper-cased platform name to test + * @return whether or not name == PL tag in the read group of read + */ + public static boolean isPlatformRead(GATKSAMRecord read, String name) { + + SAMReadGroupRecord readGroup = read.getReadGroup(); + if (readGroup != null) { + Object readPlatformAttr = readGroup.getAttribute("PL"); + if (readPlatformAttr != null) + return readPlatformAttr.toString().toUpperCase().contains(name); + } + return false; + } + + + /** + * Returns the collections of reads sorted in coordinate order, according to the order defined + * in the reads themselves + * + * @param reads + * @return + */ + public final static List sortReadsByCoordinate(List reads) { + final SAMRecordComparator comparer = new SAMRecordCoordinateComparator(); + Collections.sort(reads, comparer); + return reads; + } + + /** + * If a read starts in INSERTION, returns the first element length. + * + * Warning: If the read has Hard or Soft clips before the insertion this function will return 0. + * + * @param read + * @return the length of the first insertion, or 0 if there is none (see warning). 
+ */ + public final static int getFirstInsertionOffset(SAMRecord read) { + CigarElement e = read.getCigar().getCigarElement(0); + if ( e.getOperator() == CigarOperator.I ) + return e.getLength(); + else + return 0; + } + + /** + * If a read ends in INSERTION, returns the last element length. + * + * Warning: If the read has Hard or Soft clips after the insertion this function will return 0. + * + * @param read + * @return the length of the last insertion, or 0 if there is none (see warning). + */ + public final static int getLastInsertionOffset(SAMRecord read) { + CigarElement e = read.getCigar().getCigarElement(read.getCigarLength() - 1); + if ( e.getOperator() == CigarOperator.I ) + return e.getLength(); + else + return 0; + } + + /** + * Determines what is the position of the read in relation to the interval. + * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. + * @param read the read + * @param interval the interval + * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) + */ + public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(GATKSAMRecord read, GenomeLoc interval) { + + int sStart = read.getSoftStart(); + int sStop = read.getSoftEnd(); + int uStart = read.getUnclippedStart(); + int uStop = read.getUnclippedEnd(); + + if ( !read.getReferenceName().equals(interval.getContig()) ) + return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; + + else if ( uStop < interval.getStart() ) + return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; + + else if ( uStart > interval.getStop() ) + return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; + + else if ( sStop < interval.getStart() ) + return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT; + + else if ( sStart > interval.getStop() ) + return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT; + + else if ( (sStart >= interval.getStart()) && + (sStop <= interval.getStop()) ) + return ReadAndIntervalOverlap.OVERLAP_CONTAINED; + + else if ( (sStart < 
interval.getStart()) && + (sStop > interval.getStop()) ) + return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; + + else if ( (sStart < interval.getStart()) ) + return ReadAndIntervalOverlap.OVERLAP_LEFT; + + else + return ReadAndIntervalOverlap.OVERLAP_RIGHT; + } + + /** + * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) to take care of + * two corner cases: + * + * 1. If clipping the right tail (end of the read) getReadCoordinateForReferenceCoordinate and fall inside + * a deletion return the base after the deletion. If clipping the left tail (beginning of the read) it + * doesn't matter because it already returns the previous base by default. + * + * 2. If clipping the left tail (beginning of the read) getReadCoordinateForReferenceCoordinate and the + * read starts with an insertion, and you're requesting the first read based coordinate, it will skip + * the leading insertion (because it has the same reference coordinate as the following base). + * + * @param read + * @param refCoord + * @param tail + * @return the read coordinate corresponding to the requested reference coordinate for clipping. 
+ */ + @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd() || (read.getUnclippedEnd() < read.getUnclippedStart())"}) + @Ensures({"result >= 0", "result < read.getReadLength()"}) + public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) { + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, tail, false); + } + + public static int getReadCoordinateForReferenceCoordinateUpToEndOfRead(GATKSAMRecord read, int refCoord, ClippingTail tail) { + final int leftmostSafeVariantPosition = Math.max(read.getSoftStart(), refCoord); + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), leftmostSafeVariantPosition, tail, false); + } + + public static int getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final ClippingTail tail, final boolean allowGoalNotReached) { + Pair result = getReadCoordinateForReferenceCoordinate(alignmentStart, cigar, refCoord, allowGoalNotReached); + int readCoord = result.getFirst(); + + // Corner case one: clipping the right tail and falls on deletion, move to the next + // read coordinate. It is not a problem for the left tail because the default answer + // from getReadCoordinateForReferenceCoordinate is to give the previous read coordinate. + if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL) + readCoord++; + + // clipping the left tail and first base is insertion, go to the next read coordinate + // with the same reference coordinate. Advance to the next cigar element, or to the + // end of the read if there is no next element. 
+ final CigarElement firstElementIsInsertion = readStartsWithInsertion(cigar); + if (readCoord == 0 && tail == ClippingTail.LEFT_TAIL && firstElementIsInsertion != null) + readCoord = Math.min(firstElementIsInsertion.getLength(), cigar.getReadLength() - 1); + + return readCoord; + } + + /** + * Returns the read coordinate corresponding to the requested reference coordinate. + * + * WARNING: if the requested reference coordinate happens to fall inside or just before a deletion (or skipped region) in the read, this function + * will return the last read base before the deletion (or skipped region). This function returns a + * Pair(int readCoord, boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion) so you can choose which readCoordinate to use when faced with + * a deletion (or skipped region). + * + * SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a + * pre-processed result according to normal clipping needs. Or you can use this function and tailor the + * behavior to your needs. + * + * @param read + * @param refCoord the requested reference coordinate + * @return the read coordinate corresponding to the requested reference coordinate. (see warning!) + */ + @Requires({"refCoord >= read.getSoftStart()", "refCoord <= read.getSoftEnd()"}) + @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) + //TODO since we do not have contracts any more, should we check for the requirements in the method code? 
+ public static Pair getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) { + return getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refCoord, false); + } + + public static Pair getReadCoordinateForReferenceCoordinate(final int alignmentStart, final Cigar cigar, final int refCoord, final boolean allowGoalNotReached) { + int readBases = 0; + int refBases = 0; + boolean fallsInsideDeletionOrSkippedRegion = false; + boolean endJustBeforeDeletionOrSkippedRegion = false; + boolean fallsInsideOrJustBeforeDeletionOrSkippedRegion = false; + + final int goal = refCoord - alignmentStart; // The goal is to move this many reference bases + if (goal < 0) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedGATKException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + } + } + boolean goalReached = refBases == goal; + + Iterator cigarElementIterator = cigar.getCigarElements().iterator(); + while (!goalReached && cigarElementIterator.hasNext()) { + final CigarElement cigarElement = cigarElementIterator.next(); + int shift = 0; + + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { + if (refBases + cigarElement.getLength() < goal) + shift = cigarElement.getLength(); + else + shift = goal - refBases; + + refBases += shift; + } + goalReached = refBases == goal; + + if (!goalReached && cigarElement.getOperator().consumesReadBases()) + readBases += cigarElement.getLength(); + + if (goalReached) { + // Is this base's reference position within this cigar element? Or did we use it all? + final boolean endsWithinCigar = shift < cigarElement.getLength(); + + // If it isn't, we need to check the next one. There should *ALWAYS* be a next one + // since we checked if the goal coordinate is within the read length, so this is just a sanity check. 
+ if (!endsWithinCigar && !cigarElementIterator.hasNext()) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedGATKException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); + } + } + + CigarElement nextCigarElement = null; + + // if we end inside the current cigar element, we just have to check if it is a deletion (or skipped region) + if (endsWithinCigar) + fallsInsideDeletionOrSkippedRegion = (cigarElement.getOperator() == CigarOperator.DELETION || cigarElement.getOperator() == CigarOperator.SKIPPED_REGION) ; + + // if we end outside the current cigar element, we need to check if the next element is an insertion, deletion or skipped region. + else { + nextCigarElement = cigarElementIterator.next(); + + // if it's an insertion, we need to clip the whole insertion before looking at the next element + if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { + readBases += nextCigarElement.getLength(); + if (!cigarElementIterator.hasNext()) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedGATKException(String.format("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- check read with alignment start: %s and cigar: %s", alignmentStart, cigar)); + } + } + + nextCigarElement = cigarElementIterator.next(); + } + + // if it's a deletion (or skipped region), we will pass the information on to be handled downstream. 
+ endJustBeforeDeletionOrSkippedRegion = (nextCigarElement.getOperator() == CigarOperator.DELETION || nextCigarElement.getOperator() == CigarOperator.SKIPPED_REGION); + } + + fallsInsideOrJustBeforeDeletionOrSkippedRegion = endJustBeforeDeletionOrSkippedRegion || fallsInsideDeletionOrSkippedRegion; + + // If we reached our goal outside a deletion (or skipped region), add the shift + if (!fallsInsideOrJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases()) + readBases += shift; + + // If we reached our goal just before a deletion (or skipped region) we need + // to add the shift of the current cigar element but go back to it's last element to return the last + // base before the deletion (or skipped region) (see warning in function contracts) + else if (endJustBeforeDeletionOrSkippedRegion && cigarElement.getOperator().consumesReadBases()) + readBases += shift - 1; + + // If we reached our goal inside a deletion (or skipped region), or just between a deletion and a skipped region, + // then we must backtrack to the last base before the deletion (or skipped region) + else if (fallsInsideDeletionOrSkippedRegion || + (endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.N)) || + (endJustBeforeDeletionOrSkippedRegion && nextCigarElement.getOperator().equals(CigarOperator.D))) + readBases--; + } + } + + if (!goalReached) { + if (allowGoalNotReached) { + return new Pair(CLIPPING_GOAL_NOT_REACHED, false); + } else { + throw new ReviewedGATKException("Somehow the requested coordinate is not covered by the read. Alignment " + alignmentStart + " | " + cigar); + } + } + + return new Pair(readBases, fallsInsideOrJustBeforeDeletionOrSkippedRegion); + } + + /** + * Compares two SAMRecords only the basis on alignment start. Note that + * comparisons are performed ONLY on the basis of alignment start; any + * two SAM records with the same alignment start will be considered equal. 
+ * + * Unmapped alignments will all be considered equal. + */ + + @Requires({"read1 != null", "read2 != null"}) + public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) { + AlignmentStartComparator comp = new AlignmentStartComparator(); + return comp.compare(read1, read2); + } + + /** + * Is a base inside a read? + * + * @param read the read to evaluate + * @param referenceCoordinate the reference coordinate of the base to test + * @return true if it is inside the read, false otherwise. + */ + public static boolean isInsideRead(final GATKSAMRecord read, final int referenceCoordinate) { + return referenceCoordinate >= read.getAlignmentStart() && referenceCoordinate <= read.getAlignmentEnd(); + } + + /** + * Is this read all insertion? + * + * @param read + * @return whether or not the only element in the cigar string is an Insertion + */ + public static boolean readIsEntirelyInsertion(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() != CigarOperator.INSERTION) + return false; + } + return true; + } + + /** + * @see #readStartsWithInsertion(htsjdk.samtools.Cigar, boolean) with ignoreClipOps set to true + */ + public static CigarElement readStartsWithInsertion(final Cigar cigarForRead) { + return readStartsWithInsertion(cigarForRead, true); + } + + /** + * Checks if a read starts with an insertion. + * + * @param cigarForRead the CIGAR to evaluate + * @param ignoreSoftClipOps should we ignore S operators when evaluating whether an I operator is at the beginning? Note that H operators are always ignored. 
+ * @return the element if it's a leading insertion or null otherwise + */ + public static CigarElement readStartsWithInsertion(final Cigar cigarForRead, final boolean ignoreSoftClipOps) { + for ( final CigarElement cigarElement : cigarForRead.getCigarElements() ) { + if ( cigarElement.getOperator() == CigarOperator.INSERTION ) + return cigarElement; + + else if ( cigarElement.getOperator() != CigarOperator.HARD_CLIP && ( !ignoreSoftClipOps || cigarElement.getOperator() != CigarOperator.SOFT_CLIP) ) + break; + } + return null; + } + + /** + * Returns the coverage distribution of a list of reads within the desired region. + * + * See getCoverageDistributionOfRead for information on how the coverage is calculated. + * + * @param list the list of reads covering the region + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfReads(List list, int startLocation, int stopLocation) { + int [] totalCoverage = new int[stopLocation - startLocation + 1]; + + for (GATKSAMRecord read : list) { + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + totalCoverage = MathUtils.addArrays(totalCoverage, readCoverage); + } + + return totalCoverage; + } + + /** + * Returns the coverage distribution of a single read within the desired region. 
+ * + * Note: This function counts DELETIONS as coverage (since the main purpose is to downsample + * reads for variant regions, and deletions count as variants) + * + * @param read the read to get the coverage distribution of + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return an array with the coverage of each position from startLocation to stopLocation + */ + public static int [] getCoverageDistributionOfRead(GATKSAMRecord read, int startLocation, int stopLocation) { + int [] coverage = new int[stopLocation - startLocation + 1]; + int refLocation = read.getSoftStart(); + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + switch (cigarElement.getOperator()) { + case S: + case M: + case EQ: + case N: + case X: + case D: + for (int i = 0; i < cigarElement.getLength(); i++) { + if (refLocation >= startLocation && refLocation <= stopLocation) { + coverage[refLocation - startLocation]++; + } + refLocation++; + } + break; + + case P: + case I: + case H: + break; + } + + if (refLocation > stopLocation) + break; + } + return coverage; + } + + /** + * Makes association maps for the reads and loci coverage as described below : + * + * - First: locusToReadMap -- a HashMap that describes for each locus, which reads contribute to its coverage. + * Note: Locus is in reference coordinates. + * Example: Locus => {read1, read2, ..., readN} + * + * - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage. + * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage. + * Example: Read => {true, true, false, ... 
false} + * + * @param readList the list of reads to generate the association mappings + * @param startLocation the first reference coordinate of the region (inclusive) + * @param stopLocation the last reference coordinate of the region (inclusive) + * @return the two hashmaps described above + */ + public static Pair> , HashMap> getBothReadToLociMappings (List readList, int startLocation, int stopLocation) { + int arraySize = stopLocation - startLocation + 1; + + HashMap> locusToReadMap = new HashMap>(2*(stopLocation - startLocation + 1), 0.5f); + HashMap readToLocusMap = new HashMap(2*readList.size(), 0.5f); + + for (int i = startLocation; i <= stopLocation; i++) + locusToReadMap.put(i, new HashSet()); // Initialize the locusToRead map with empty lists + + for (GATKSAMRecord read : readList) { + readToLocusMap.put(read, new Boolean[arraySize]); // Initialize the readToLocus map with empty arrays + + int [] readCoverage = getCoverageDistributionOfRead(read, startLocation, stopLocation); + + for (int i = 0; i < readCoverage.length; i++) { + int refLocation = i + startLocation; + if (readCoverage[i] > 0) { + // Update the hash for this locus + HashSet readSet = locusToReadMap.get(refLocation); + readSet.add(read); + + // Add this locus to the read hash + readToLocusMap.get(read)[refLocation - startLocation] = true; + } + else + // Update the boolean array with a 'no coverage' from this read to this locus + readToLocusMap.get(read)[refLocation-startLocation] = false; + } + } + return new Pair>, HashMap>(locusToReadMap, readToLocusMap); + } + + /** + * Create random read qualities + * + * @param length the length of the read + * @return an array with randomized base qualities between 0 and 50 + */ + public static byte[] createRandomReadQuals(int length) { + Random random = Utils.getRandomGenerator(); + byte[] quals = new byte[length]; + for (int i = 0; i < length; i++) + quals[i] = (byte) random.nextInt(50); + return quals; + } + + /** + * Create random read qualities 
+ * + * @param length the length of the read + * @param allowNs whether or not to allow N's in the read + * @return an array with randomized bases (A-N) with equal probability + */ + public static byte[] createRandomReadBases(int length, boolean allowNs) { + Random random = Utils.getRandomGenerator(); + int numberOfBases = allowNs ? 5 : 4; + byte[] bases = new byte[length]; + for (int i = 0; i < length; i++) { + switch (random.nextInt(numberOfBases)) { + case 0: + bases[i] = 'A'; + break; + case 1: + bases[i] = 'C'; + break; + case 2: + bases[i] = 'G'; + break; + case 3: + bases[i] = 'T'; + break; + case 4: + bases[i] = 'N'; + break; + default: + throw new ReviewedGATKException("Something went wrong, this is just impossible"); + } + } + return bases; + } + + public static GATKSAMRecord createRandomRead(int length) { + return createRandomRead(length, true); + } + + public static GATKSAMRecord createRandomRead(int length, boolean allowNs) { + byte[] quals = ReadUtils.createRandomReadQuals(length); + byte[] bbases = ReadUtils.createRandomReadBases(length, allowNs); + return ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); + } + + + public static String prettyPrintSequenceRecords ( SAMSequenceDictionary sequenceDictionary ) { + String[] sequenceRecordNames = new String[sequenceDictionary.size()]; + int sequenceRecordIndex = 0; + for (SAMSequenceRecord sequenceRecord : sequenceDictionary.getSequences()) + sequenceRecordNames[sequenceRecordIndex++] = sequenceRecord.getSequenceName(); + return Arrays.deepToString(sequenceRecordNames); + } + + /** + * Calculates the reference coordinate for a read coordinate + * + * @param read the read + * @param offset the base in the read (coordinate in the read) + * @return the reference coordinate correspondent to this base + */ + public static long getReferenceCoordinateForReadCoordinate(GATKSAMRecord read, int offset) { + if (offset > read.getReadLength()) + throw new 
ReviewedGATKException(String.format(OFFSET_OUT_OF_BOUNDS_EXCEPTION, offset, read.getReadLength())); + + long location = read.getAlignmentStart(); + Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); + while (offset > 0 && cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + long move = 0; + if (cigarElement.getOperator().consumesReferenceBases()) + move = (long) Math.min(cigarElement.getLength(), offset); + location += move; + offset -= move; + } + if (offset > 0 && !cigarElementIterator.hasNext()) + throw new ReviewedGATKException(OFFSET_NOT_ZERO_EXCEPTION); + + return location; + } + + /** + * Creates a map with each event in the read (cigar operator) and the read coordinate where it happened. + * + * Example: + * D -> 2, 34, 75 + * I -> 55 + * S -> 0, 101 + * H -> 101 + * + * @param read the read + * @return a map with the properties described above. See example + */ + public static Map> getCigarOperatorForAllBases (GATKSAMRecord read) { + Map> events = new HashMap>(); + + int position = 0; + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + CigarOperator op = cigarElement.getOperator(); + if (op.consumesReadBases()) { + ArrayList list = events.get(op); + if (list == null) { + list = new ArrayList(); + events.put(op, list); + } + for (int i = position; i < cigarElement.getLength(); i++) + list.add(position++); + } + else { + ArrayList list = events.get(op); + if (list == null) { + list = new ArrayList(); + events.put(op, list); + } + list.add(position); + } + } + return events; + } + + /** + * Given a read, outputs the read bases in a string format + * + * @param read the read + * @return a string representation of the read bases + */ + public static String convertReadBasesToString(GATKSAMRecord read) { + String bases = ""; + for (byte b : read.getReadBases()) { + bases += (char) b; + } + return bases.toUpperCase(); + } + + /** + * Given a read, outputs the base 
qualities in a string format + * + * @param quals the read qualities + * @return a string representation of the base qualities + */ + public static String convertReadQualToString(byte[] quals) { + String result = ""; + for (byte b : quals) { + result += (char) (33 + b); + } + return result; + } + + /** + * Given a read, outputs the base qualities in a string format + * + * @param read the read + * @return a string representation of the base qualities + */ + public static String convertReadQualToString(GATKSAMRecord read) { + return convertReadQualToString(read.getBaseQualities()); + } + + /** + * Returns the reverse complement of the read bases + * + * @param bases the read bases + * @return the reverse complement of the read bases + */ + public static String getBasesReverseComplement(byte[] bases) { + String reverse = ""; + for (int i = bases.length-1; i >=0; i--) { + reverse += (char) BaseUtils.getComplement(bases[i]); + } + return reverse; + } + + /** + * Returns the reverse complement of the read bases + * + * @param read the read + * @return the reverse complement of the read bases + */ + public static String getBasesReverseComplement(GATKSAMRecord read) { + return getBasesReverseComplement(read.getReadBases()); + } + + /** + * Calculate the maximum read length from the given list of reads. 
+ * @param reads list of reads + * @return non-negative integer + */ + @Ensures({"result >= 0"}) + public static int getMaxReadLength( final List reads ) { + if( reads == null ) { throw new IllegalArgumentException("Attempting to check a null list of reads."); } + + int maxReadLength = 0; + for( final GATKSAMRecord read : reads ) { + maxReadLength = Math.max(maxReadLength, read.getReadLength()); + } + return maxReadLength; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/SAMFileReaderBuilder.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/SAMFileReaderBuilder.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/SAMFileReaderBuilder.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/SAMFileReaderBuilder.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/SAMReaderID.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/SAMReaderID.java new file mode 100644 index 000000000..4b93c3b7e --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/SAMReaderID.java @@ -0,0 +1,134 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.sam; + +import org.broadinstitute.gatk.utils.commandline.Tags; + +import java.io.File; + +/** + * Uniquely identifies a SAM file reader. + * + * @author mhanna + * @version 0.1 + */ +public class SAMReaderID implements Comparable { + /** + * The SAM file at the heart of this reader. SAMReaderID + * currently supports only file-based readers. + */ + private final File samFile; + + /** + * A list of tags associated with this BAM file. + */ + private final Tags tags; + + /** + * Creates an identifier for a SAM file based on read. + * @param samFile The source file for SAM data. + * @param tags tags to use when creating a reader ID. + */ + public SAMReaderID(File samFile, Tags tags) { + this.samFile = samFile; + this.tags = tags; + } + + /** + * Creates an identifier for a SAM file based on read. + * @param samFileName The source filename for SAM data. + * @param tags tags to use when creating a reader ID. + */ + public SAMReaderID(String samFileName, Tags tags) { + this(new File(samFileName),tags); + } + + /** + * Gets the absolute pathname of this SAM file + * @return The absolute pathname of this reader's SAM file, + * or null if this reader has no associated SAM file + */ + public String getSamFilePath() { + if ( samFile == null ) { + return null; + } + + return samFile.getAbsolutePath(); + } + + /** + * Gets the SAM file at the heart of this reader. SAMReaderID + * currently supports only file-based readers. 
+ * @return the SAM file at the heart of this reader. + */ + public File getSamFile() { + return samFile; + } + + /** + * Gets the tags associated with the given BAM file. + * @return A collection of the tags associated with this file. + */ + public Tags getTags() { + return tags; + } + + /** + * Compare two IDs to see whether they're equal. + * @param other The other identifier. + * @return True iff the two readers point to the same file. + */ + @Override + public boolean equals(Object other) { + if(other == null) return false; + if(!(other instanceof SAMReaderID)) return false; + + SAMReaderID otherID = (SAMReaderID)other; + return this.getSamFilePath().equals(otherID.getSamFilePath()); + } + + /** + * Generate a hash code for this object. + * @return A hash code, based solely on the file name at this point. + */ + @Override + public int hashCode() { + return samFile.getAbsolutePath().hashCode(); + } + + /** + * Best string representation for a SAM file reader is the path of the source file. 
+ */ + @Override + public String toString() { + return getSamFilePath(); + } + + @Override + public int compareTo(Object other) { + return this.samFile.getAbsolutePath().compareTo(((SAMReaderID)other).samFile.getAbsolutePath()); + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/SimplifyingSAMFileWriter.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/SimplifyingSAMFileWriter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/SimplifyingSAMFileWriter.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/SimplifyingSAMFileWriter.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/package-info.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/sam/package-info.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/sam/package-info.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/GlobalEdgeGreedySWPairwiseAlignment.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/Parameters.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/Parameters.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/Parameters.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/Parameters.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWPairwiseAlignment.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWPairwiseAlignment.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWPairwiseAlignment.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWPairwiseAlignment.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWPairwiseAlignmentMain.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWPairwiseAlignmentMain.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWPairwiseAlignmentMain.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWPairwiseAlignmentMain.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWParameterSet.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWParameterSet.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWParameterSet.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SWParameterSet.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SmithWaterman.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SmithWaterman.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SmithWaterman.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/smithwaterman/SmithWaterman.java diff --git 
a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java new file mode 100644 index 000000000..33a4b7d63 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/ListFileUtils.java @@ -0,0 +1,344 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.text; + +import org.broadinstitute.gatk.utils.commandline.ParsingEngine; +import org.broadinstitute.gatk.utils.commandline.RodBinding; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.broadinstitute.gatk.utils.refdata.tracks.FeatureManager; +import org.broadinstitute.gatk.utils.refdata.utils.RMDTriplet; +import org.broadinstitute.gatk.utils.exceptions.UserException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.*; +import java.util.regex.Pattern; + +/** + * A collection of convenience methods for working with list files. + */ +public class ListFileUtils { + /** + * Lines starting with this String in .list files are considered comments. + */ + public static final String LIST_FILE_COMMENT_START = "#"; + + /** + * Unpack the bam files to be processed, given a list of files. That list of files can + * itself contain entries which are lists of other files to be read (note: you cannot have lists + * of lists of lists). Lines in .list files containing only whitespace or which begin with + * LIST_FILE_COMMENT_START are ignored. + * + * @param samFiles The sam files, in string format. 
+ * @param parser Parser + * @return a flattened list of the bam files provided + */ + public static List unpackBAMFileList(final List samFiles, final ParsingEngine parser) { + List unpackedReads = new ArrayList(); + for( String inputFileName: samFiles ) { + Tags inputFileNameTags = parser.getTags(inputFileName); + inputFileName = expandFileName(inputFileName); + if (inputFileName.toLowerCase().endsWith(".list") ) { + try { + for ( String fileName : new XReadLines(new File(inputFileName), true, LIST_FILE_COMMENT_START) ) { + unpackedReads.add(new SAMReaderID(fileName,parser.getTags(inputFileName))); + } + } + catch( FileNotFoundException ex ) { + throw new UserException.CouldNotReadInputFile(new File(inputFileName), "Unable to find file while unpacking reads", ex); + } + } + else if(inputFileName.toLowerCase().endsWith(".bam")) { + unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags)); + } + else if(inputFileName.endsWith("stdin")) { + unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags)); + } + else { + throw new UserException.CommandLineException(String.format("The GATK reads argument (-I, --input_file) supports only BAM files with the .bam extension and lists of BAM files " + + "with the .list extension, but the file %s has neither extension. Please ensure that your BAM file or list " + + "of BAM files is in the correct format, update the extension, and try again.",inputFileName)); + } + } + return unpackedReads; + } + + /** + * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. + * @param RODBindings a text equivale + * @param parser Parser + * @return a list of expanded, bound RODs. + */ + @Deprecated + @SuppressWarnings("unused") // TODO: Who is still using this? External walkers? + public static Collection unpackRODBindingsOldStyle(final Collection RODBindings, final ParsingEngine parser) { + // todo -- this is a strange home for this code. 
Move into ROD system + Collection rodBindings = new ArrayList(); + + for (String fileName: RODBindings) { + final Tags tags = parser.getTags(fileName); + fileName = expandFileName(fileName); + + List positionalTags = tags.getPositionalTags(); + if(positionalTags.size() != 2) + throw new UserException("Invalid syntax for -B (reference-ordered data) input flag. " + + "Please use the following syntax when providing reference-ordered " + + "data: -B:, ."); + // Assume that if tags are present, those tags are name and type. + // Name is always first, followed by type. + String name = positionalTags.get(0); + String type = positionalTags.get(1); + + RMDTriplet.RMDStorageType storageType; + if(tags.getValue("storage") != null) + storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,tags.getValue("storage")); + else if(fileName.toLowerCase().endsWith("stdin")) + storageType = RMDTriplet.RMDStorageType.STREAM; + else + storageType = RMDTriplet.RMDStorageType.FILE; + + rodBindings.add(new RMDTriplet(name,type,fileName,storageType,tags)); + } + + return rodBindings; + } + + /** + * Convert command-line argument representation of ROD bindings to something more easily understandable by the engine. + * @param RODBindings a text equivale + * @param parser Parser + * @return a list of expanded, bound RODs. + */ + @SuppressWarnings("unchecked") + public static Collection unpackRODBindings(final Collection RODBindings, @SuppressWarnings("unused") final ParsingEngine parser) { + // todo -- this is a strange home for this code. 
Move into ROD system + Collection rodBindings = new ArrayList(); + FeatureManager builderForValidation = new FeatureManager(); + + for (RodBinding rodBinding: RODBindings) { + String argValue = rodBinding.getSource(); + String fileName = expandFileName(argValue); + String name = rodBinding.getName(); + String type = rodBinding.getTribbleType(); + + RMDTriplet.RMDStorageType storageType; + if(rodBinding.getTags().getValue("storage") != null) + storageType = Enum.valueOf(RMDTriplet.RMDStorageType.class,rodBinding.getTags().getValue("storage")); + else if(fileName.toLowerCase().endsWith("stdin")) + storageType = RMDTriplet.RMDStorageType.STREAM; + else + storageType = RMDTriplet.RMDStorageType.FILE; + + RMDTriplet triplet = new RMDTriplet(name,type,fileName,storageType,rodBinding.getTags()); + + // validate triplet type + FeatureManager.FeatureDescriptor descriptor = builderForValidation.getByTriplet(triplet); + if ( descriptor == null ) + throw new UserException.UnknownTribbleType(rodBinding.getTribbleType(), + String.format("Field %s had provided type %s but there's no such Tribble type. The compatible types are: %n%s", + rodBinding.getName(), rodBinding.getTribbleType(), builderForValidation.userFriendlyListOfAvailableFeatures(rodBinding.getType()))); + if ( ! rodBinding.getType().isAssignableFrom(descriptor.getFeatureClass()) ) + throw new UserException.BadArgumentValue(rodBinding.getName(), + String.format("Field %s expects Features of type %s, but the input file produces Features of type %s. The compatible types are: %n%s", + rodBinding.getName(), rodBinding.getType().getSimpleName(), descriptor.getSimpleFeatureName(), + builderForValidation.userFriendlyListOfAvailableFeatures(rodBinding.getType()))); + + + rodBindings.add(triplet); + } + + return rodBindings; + } + + /** + * Expand any special characters that appear in the filename. 
Right now, '-' is expanded to + * '/dev/stdin' only, but in the future, special characters like '~' and '*' that are passed + * directly to the command line in some circumstances could be expanded as well. Be careful + * when adding UNIX-isms. + * @param argument the text appearing on the command-line. + * @return An expanded string suitable for opening by Java/UNIX file handling utilities. + */ + private static String expandFileName(String argument) { + if(argument.trim().equals("-")) + return "/dev/stdin"; + return argument; + } + + /** + * Returns a new set of values, containing a final set of values expanded from values + *

+ * Each element E of values can either be a literal string or a file ending in .list. + * For each E ending in .list we try to read a file named E from disk, and if possible + * all lines from that file are expanded into unique values. + * + * @param values Original values + * @return entries from values or the files listed in values + */ + public static Set unpackSet(Collection values) { + if (values == null) + throw new NullPointerException("values cannot be null"); + Set unpackedValues = new LinkedHashSet(); + // Let's first go through the list and see if we were given any files. + // We'll add every entry in the file to our set, and treat the entries as + // if they had been specified on the command line. + for (String value : values) { + File file = new File(value); + if (value.toLowerCase().endsWith(".list") && file.exists()) { + try { + unpackedValues.addAll(new XReadLines(file, true, LIST_FILE_COMMENT_START).readLines()); + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + } else { + unpackedValues.add(value); + } + } + return unpackedValues; + } + + /** + * Returns a new set of values including only values listed by filters + *

+ * Each element E of values can either be a literal string or a file. For each E, + * we try to read a file named E from disk, and if possible all lines from that file are expanded + * into unique names. + *

+ * Filters may also be a file of filters. + * + * @param values Values or files with values + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values or the files listed in values, filtered by filters + */ + public static Set includeMatching(Collection values, Collection filters, boolean exactMatch) { + return includeMatching(values, IDENTITY_STRING_CONVERTER, filters, exactMatch); + } + + /** + * Converts a type T to a String representation. + * + * @param Type to convert to a String. + */ + public static interface StringConverter { + String convert(T value); + } + + /** + * Returns a new set of values including only values matching filters + *

+ * Filters may also be a file of filters. + *

+ * The converter should convert T to a unique String for each value in the set. + * + * @param values Values or files with values + * @param converter Converts values to strings + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values including only values matching filters + */ + public static Set includeMatching(Collection values, StringConverter converter, Collection filters, boolean exactMatch) { + if (values == null) + throw new NullPointerException("values cannot be null"); + if (converter == null) + throw new NullPointerException("converter cannot be null"); + if (filters == null) + throw new NullPointerException("filters cannot be null"); + + Set unpackedFilters = unpackSet(filters); + Set filteredValues = new LinkedHashSet(); + Collection patterns = null; + if (!exactMatch) + patterns = compilePatterns(unpackedFilters); + for (T value : values) { + String converted = converter.convert(value); + if (unpackedFilters.contains(converted)) { + filteredValues.add(value); + } else if (!exactMatch) { + for (Pattern pattern : patterns) + if (pattern.matcher(converted).find()) + filteredValues.add(value); + } + } + return filteredValues; + } + + /** + * Returns a new set of values excluding any values matching filters. + *

+ * Filters may also be a file of filters. + *

+ * The converter should convert T to a unique String for each value in the set. + * + * @param values Values or files with values + * @param converter Converts values to strings + * @param filters Filters or files with filters + * @param exactMatch If true match filters exactly, otherwise use as both exact and regular expressions + * @return entries from values exluding any values matching filters + */ + public static Set excludeMatching(Collection values, StringConverter converter, Collection filters, boolean exactMatch) { + if (values == null) + throw new NullPointerException("values cannot be null"); + if (converter == null) + throw new NullPointerException("converter cannot be null"); + if (filters == null) + throw new NullPointerException("filters cannot be null"); + + Set unpackedFilters = unpackSet(filters); + Set filteredValues = new LinkedHashSet(); + filteredValues.addAll(values); + Collection patterns = null; + if (!exactMatch) + patterns = compilePatterns(unpackedFilters); + for (T value : values) { + String converted = converter.convert(value); + if (unpackedFilters.contains(converted)) { + filteredValues.remove(value); + } else if (!exactMatch) { + for (Pattern pattern : patterns) + if (pattern.matcher(converted).find()) + filteredValues.remove(value); + } + } + return filteredValues; + } + + private static Collection compilePatterns(Collection filters) { + Collection patterns = new ArrayList(); + for (String filter: filters) { + patterns.add(Pattern.compile(filter)); + } + return patterns; + } + + protected static final StringConverter IDENTITY_STRING_CONVERTER = new StringConverter() { + @Override + public String convert(String value) { + return value; + } + }; +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/text/TextFormattingUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/TextFormattingUtils.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/text/TextFormattingUtils.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/TextFormattingUtils.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/text/XReadLines.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/XReadLines.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/text/XReadLines.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/text/XReadLines.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/EfficiencyMonitoringThreadFactory.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/EfficiencyMonitoringThreadFactory.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/EfficiencyMonitoringThreadFactory.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/EfficiencyMonitoringThreadFactory.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/NamedThreadFactory.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/NamedThreadFactory.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/NamedThreadFactory.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/NamedThreadFactory.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadEfficiencyMonitor.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadEfficiencyMonitor.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadEfficiencyMonitor.java rename to 
public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadEfficiencyMonitor.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadLocalArray.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadLocalArray.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadLocalArray.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadLocalArray.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadPoolMonitor.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadPoolMonitor.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadPoolMonitor.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/ThreadPoolMonitor.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/package-info.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/package-info.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/threading/package-info.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/threading/package-info.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/ChromosomeCountConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/ChromosomeCountConstants.java new file mode 100644 index 000000000..72ed4e4f6 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/ChromosomeCountConstants.java @@ -0,0 +1,44 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated 
documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.variant; + +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.VCFStandardHeaderLines; + + +/** + * Keys and descriptions for the common chromosome count annotations + */ +public class ChromosomeCountConstants { + + public static final String[] keyNames = { VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_FREQUENCY_KEY }; + + public static final VCFInfoHeaderLine[] descriptions = { + VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY), + VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY), + VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY) }; +} \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFIndexType.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFIndexType.java similarity index 100% rename from 
public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFIndexType.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFIndexType.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java new file mode 100644 index 000000000..88f689b2d --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtils.java @@ -0,0 +1,1960 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.variant; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.tribble.TribbleException; +import htsjdk.tribble.util.popgen.HardyWeinbergCalculation; +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.vcf.VCFConstants; +import org.apache.commons.lang.ArrayUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.collections.Pair; + +import java.io.Serializable; +import java.util.*; + +public class GATKVariantContextUtils { + + private static Logger logger = Logger.getLogger(GATKVariantContextUtils.class); + + public static final int DEFAULT_PLOIDY = HomoSapiensConstants.DEFAULT_PLOIDY; + + public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. + + /** + * Diploid NO_CALL allele list... + * + * @deprecated you should use {@link #noCallAlleles(int)} instead. It indicates the presence of a hardcoded diploid assumption which is bad. + */ + @Deprecated + public final static List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + + public final static String NON_REF_SYMBOLIC_ALLELE_NAME = "NON_REF"; + public final static Allele NON_REF_SYMBOLIC_ALLELE = Allele.create("<"+NON_REF_SYMBOLIC_ALLELE_NAME+">", false); // represents any possible non-ref allele at this site + + public final static String MERGE_FILTER_PREFIX = "filterIn"; + public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; + public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; + public final static String MERGE_INTERSECTION = "Intersection"; + + /** + * Checks whether a variant-context overlaps with a region. + * + *

+ * No event overlaps an unmapped region. + *

+ * + * @param variantContext variant-context to test the overlap with. + * @param region region to test the overlap with. + * + * @throws IllegalArgumentException if either region or event is {@code null}. + * + * @return {@code true} if there is an overlap between the event described and the active region provided. + */ + public static boolean overlapsRegion(final VariantContext variantContext, final GenomeLoc region) { + if (region == null) throw new IllegalArgumentException("the active region provided cannot be null"); + if (variantContext == null) throw new IllegalArgumentException("the variant context provided cannot be null"); + if (region.isUnmapped()) + return false; + if (variantContext.getEnd() < region.getStart()) + return false; + if (variantContext.getStart() > region.getStop()) + return false; + if (!variantContext.getChr().equals(region.getContig())) + return false; + return true; + } + + /** + * Returns a homozygous call allele list given the only allele and the ploidy. + * + * @param allele the only allele in the allele list. + * @param ploidy the ploidy of the resulting allele list. + * + * @throws IllegalArgumentException if {@code allele} is {@code null} or ploidy is negative. + * + * @return never {@code null}. + */ + public static List homozygousAlleleList(final Allele allele, final int ploidy) { + if (allele == null || ploidy < 0) + throw new IllegalArgumentException(); + + // Use a tailored inner class to implement the list: + return Collections.nCopies(ploidy,allele); + } + + private static boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { + final Iterator it1 = alleleSet1.iterator(); + final Iterator it2 = alleleSet2.iterator(); + + while ( it1.hasNext() && it2.hasNext() ) { + final Allele a1 = it1.next(); + final Allele a2 = it2.next(); + if ( ! a1.equals(a2) ) + return true; + } + + // by this point, at least one of the iterators is empty. 
All of the elements + // we've compared are equal up until this point. But it's possible that the + // sets aren't the same size, which is indicated by the test below. If they + // are of the same size, though, the sets are compatible + return it1.hasNext() || it2.hasNext(); + } + + /** + * Determines the common reference allele + * + * @param VCs the list of VariantContexts + * @param loc if not null, ignore records that do not begin at this start location + * @return possibly null Allele + */ + public static Allele determineReferenceAllele(final List VCs, final GenomeLoc loc) { + Allele ref = null; + + for ( final VariantContext vc : VCs ) { + if ( contextMatchesLoc(vc, loc) ) { + final Allele myRef = vc.getReference(); + if ( ref == null || ref.length() < myRef.length() ) + ref = myRef; + else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) + throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); + } + } + + return ref; + } + + /** + * Calculates the total ploidy of a variant context as the sum of all plodies across genotypes. + * @param vc the target variant context. + * @param defaultPloidy the default ploidy to be assume when there is no ploidy information for a genotype. + * @return never {@code null}. + */ + public static int totalPloidy(final VariantContext vc, final int defaultPloidy) { + if (vc == null) + throw new IllegalArgumentException("the vc provided cannot be null"); + if (defaultPloidy < 0) + throw new IllegalArgumentException("the default ploidy must 0 or greater"); + int result = 0; + for (final Genotype genotype : vc.getGenotypes()) { + final int declaredPloidy = genotype.getPloidy(); + result += declaredPloidy <= 0 ? defaultPloidy : declaredPloidy; + } + + return result; + } + + public enum GenotypeMergeType { + /** + * Make all sample genotypes unique by file. 
Each sample shared across RODs gets named sample.ROD. + */ + UNIQUIFY, + /** + * Take genotypes in priority order (see the priority argument). + */ + PRIORITIZE, + /** + * Take the genotypes in any order. + */ + UNSORTED, + /** + * Require that all samples/genotypes be unique between all inputs. + */ + REQUIRE_UNIQUE + } + + public enum FilteredRecordMergeType { + /** + * Union - leaves the record if any record is unfiltered. + */ + KEEP_IF_ANY_UNFILTERED, + /** + * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. + */ + KEEP_IF_ALL_UNFILTERED, + /** + * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. + */ + KEEP_UNCONDITIONAL + } + + public enum MultipleAllelesMergeType { + /** + * Combine only alleles of the same type (SNP, indel, etc.) into a single VCF record. + */ + BY_TYPE, + /** + * Merge all allele types at the same start position into the same VCF record. + */ + MIX_TYPES + } + + /** + * Refactored out of the AverageAltAlleleLength annotation class + * @param vc the variant context + * @return the average length of the alt allele (a double) + */ + public static double getMeanAltAlleleLength(VariantContext vc) { + double averageLength = 1.0; + if ( ! vc.isSNP() && ! 
vc.isSymbolic() ) { + // adjust for the event length + int averageLengthNum = 0; + int averageLengthDenom = 0; + int refLength = vc.getReference().length(); + for ( final Allele a : vc.getAlternateAlleles() ) { + int numAllele = vc.getCalledChrCount(a); + int alleleSize; + if ( a.length() == refLength ) { + // SNP or MNP + byte[] a_bases = a.getBases(); + byte[] ref_bases = vc.getReference().getBases(); + int n_mismatch = 0; + for ( int idx = 0; idx < a_bases.length; idx++ ) { + if ( a_bases[idx] != ref_bases[idx] ) + n_mismatch++; + } + alleleSize = n_mismatch; + } + else if ( a.isSymbolic() ) { + alleleSize = 1; + } else { + alleleSize = Math.abs(refLength-a.length()); + } + averageLengthNum += alleleSize*numAllele; + averageLengthDenom += numAllele; + } + averageLength = ( (double) averageLengthNum )/averageLengthDenom; + } + + return averageLength; + } + + /** + * create a genome location, given a variant context + * @param genomeLocParser parser + * @param vc the variant context + * @return the genomeLoc + */ + public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) { + return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); + } + + public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { + if (!context.isSNP() || !context.isBiallelic()) + throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); + return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); + } + + /** + * If this is a BiAllelic SNP, is it a transition? + */ + public static boolean isTransition(VariantContext context) { + return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; + } + + /** + * If this is a BiAllelic SNP, is it a transversion? 
+ */ + public static boolean isTransversion(VariantContext context) { + return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; + } + + public static boolean isTransition(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; + } + + public static boolean isTransversion(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; + } + + /** + * Returns a context identical to this with the REF and ALT alleles reverse complemented. + * + * @param vc variant context + * @return new vc + */ + public static VariantContext reverseComplement(VariantContext vc) { + // create a mapping from original allele to reverse complemented allele + HashMap alleleMap = new HashMap<>(vc.getAlleles().size()); + for ( final Allele originalAllele : vc.getAlleles() ) { + Allele newAllele; + if ( originalAllele.isNoCall() ) + newAllele = originalAllele; + else + newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); + alleleMap.put(originalAllele, newAllele); + } + + // create new Genotype objects + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + List newAlleles = new ArrayList<>(); + for ( final Allele allele : genotype.getAlleles() ) { + Allele newAllele = alleleMap.get(allele); + if ( newAllele == null ) + newAllele = Allele.NO_CALL; + newAlleles.add(newAllele); + } + newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); + } + + return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); + } + + /** + * Returns true iff VC is an non-complex indel where every allele represents an expansion or + * contraction of a series of identical bases in the reference. 
+ * + * For example, suppose the ref bases are CTCTCTGA, which includes a 3x repeat of CTCTCT + * + * If VC = -/CT, then this function returns true because the CT insertion matches exactly the + * upcoming reference. + * If VC = -/CTA then this function returns false because the CTA isn't a perfect match + * + * Now consider deletions: + * + * If VC = CT/- then again the same logic applies and this returns true + * The case of CTA/- makes no sense because it doesn't actually match the reference bases. + * + * The logic of this function is pretty simple. Take all of the non-null alleles in VC. For + * each insertion allele of n bases, check if that allele matches the next n reference bases. + * For each deletion allele of n bases, check if this matches the reference bases at n - 2 n, + * as it must necessarily match the first n bases. If this test returns true for all + * alleles you are a tandem repeat, otherwise you are not. + * + * @param vc + * @param refBasesStartingAtVCWithPad not this is assumed to include the PADDED reference + * @return + */ + @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) + public static boolean isTandemRepeat(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { + final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); + if ( ! vc.isIndel() ) // only indels are tandem repeats + return false; + + final Allele ref = vc.getReference(); + + for ( final Allele allele : vc.getAlternateAlleles() ) { + if ( ! 
isRepeatAllele(ref, allele, refBasesStartingAtVCWithoutPad) ) + return false; + } + + // we've passed all of the tests, so we are a repeat + return true; + } + + /** + * + * @param vc + * @param refBasesStartingAtVCWithPad + * @return + */ + @Requires({"vc != null", "refBasesStartingAtVCWithPad != null && refBasesStartingAtVCWithPad.length > 0"}) + public static Pair,byte[]> getNumTandemRepeatUnits(final VariantContext vc, final byte[] refBasesStartingAtVCWithPad) { + final boolean VERBOSE = false; + final String refBasesStartingAtVCWithoutPad = new String(refBasesStartingAtVCWithPad).substring(1); + if ( ! vc.isIndel() ) // only indels are tandem repeats + return null; + + final Allele refAllele = vc.getReference(); + final byte[] refAlleleBases = Arrays.copyOfRange(refAllele.getBases(), 1, refAllele.length()); + + byte[] repeatUnit = null; + final ArrayList lengths = new ArrayList<>(); + + for ( final Allele allele : vc.getAlternateAlleles() ) { + Pair result = getNumTandemRepeatUnits(refAlleleBases, Arrays.copyOfRange(allele.getBases(), 1, allele.length()), refBasesStartingAtVCWithoutPad.getBytes()); + + final int[] repetitionCount = result.first; + // repetition count = 0 means allele is not a tandem expansion of context + if (repetitionCount[0] == 0 || repetitionCount[1] == 0) + return null; + + if (lengths.size() == 0) { + lengths.add(repetitionCount[0]); // add ref allele length only once + } + lengths.add(repetitionCount[1]); // add this alt allele's length + + repeatUnit = result.second; + if (VERBOSE) { + System.out.println("RefContext:"+refBasesStartingAtVCWithoutPad); + System.out.println("Ref:"+refAllele.toString()+" Count:" + String.valueOf(repetitionCount[0])); + System.out.println("Allele:"+allele.toString()+" Count:" + String.valueOf(repetitionCount[1])); + System.out.println("RU:"+new String(repeatUnit)); + } + } + + return new Pair, byte[]>(lengths,repeatUnit); + } + + public static Pair getNumTandemRepeatUnits(final byte[] refBases, final byte[] 
altBases, final byte[] remainingRefContext) { + /* we can't exactly apply same logic as in basesAreRepeated() to compute tandem unit and number of repeated units. + Consider case where ref =ATATAT and we have an insertion of ATAT. Natural description is (AT)3 -> (AT)2. + */ + + byte[] longB; + // find first repeat unit based on either ref or alt, whichever is longer + if (altBases.length > refBases.length) + longB = altBases; + else + longB = refBases; + + // see if non-null allele (either ref or alt, whichever is longer) can be decomposed into several identical tandem units + // for example, -*,CACA needs to first be decomposed into (CA)2 + final int repeatUnitLength = findRepeatedSubstring(longB); + final byte[] repeatUnit = Arrays.copyOf(longB, repeatUnitLength); + + final int[] repetitionCount = new int[2]; + // look for repetitions forward on the ref bases (i.e. starting at beginning of ref bases) + int repetitionsInRef = findNumberOfRepetitions(repeatUnit, refBases, true); + repetitionCount[0] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(refBases, remainingRefContext), true)-repetitionsInRef; + repetitionCount[1] = findNumberOfRepetitions(repeatUnit, ArrayUtils.addAll(altBases, remainingRefContext), true)-repetitionsInRef; + + return new Pair<>(repetitionCount, repeatUnit); + + } + + /** + * Find out if a string can be represented as a tandem number of substrings. + * For example ACTACT is a 2-tandem of ACT, + * but ACTACA is not. 
+ * + * @param bases String to be tested + * @return Length of repeat unit, if string can be represented as tandem of substring (if it can't + * be represented as one, it will be just the length of the input string) + */ + public static int findRepeatedSubstring(byte[] bases) { + + int repLength; + for (repLength=1; repLength <=bases.length; repLength++) { + final byte[] candidateRepeatUnit = Arrays.copyOf(bases,repLength); + boolean allBasesMatch = true; + for (int start = repLength; start < bases.length; start += repLength ) { + // check that remaining of string is exactly equal to repeat unit + final byte[] basePiece = Arrays.copyOfRange(bases,start,start+candidateRepeatUnit.length); + if (!Arrays.equals(candidateRepeatUnit, basePiece)) { + allBasesMatch = false; + break; + } + } + if (allBasesMatch) + return repLength; + } + + return repLength; + } + + /** + * Helper routine that finds number of repetitions a string consists of. + * For example, for string ATAT and repeat unit AT, number of repetitions = 2 + * @param repeatUnit Substring + * @param testString String to test + * @oaram lookForward Look for repetitions forward (at beginning of string) or backward (at end of string) + * @return Number of repetitions (0 if testString is not a concatenation of n repeatUnit's + */ + public static int findNumberOfRepetitions(byte[] repeatUnit, byte[] testString, boolean lookForward) { + int numRepeats = 0; + if (lookForward) { + // look forward on the test string + for (int start = 0; start < testString.length; start += repeatUnit.length) { + int end = start + repeatUnit.length; + byte[] unit = Arrays.copyOfRange(testString,start, end); + if(Arrays.equals(unit,repeatUnit)) + numRepeats++; + else + break; + } + return numRepeats; + } + + // look backward. 
For example, if repeatUnit = AT and testString = GATAT, number of repeat units is still 2 + // look forward on the test string + for (int start = testString.length - repeatUnit.length; start >= 0; start -= repeatUnit.length) { + int end = start + repeatUnit.length; + byte[] unit = Arrays.copyOfRange(testString,start, end); + if(Arrays.equals(unit,repeatUnit)) + numRepeats++; + else + break; + } + return numRepeats; + } + + /** + * Helper function for isTandemRepeat that checks that allele matches somewhere on the reference + * @param ref + * @param alt + * @param refBasesStartingAtVCWithoutPad + * @return + */ + protected static boolean isRepeatAllele(final Allele ref, final Allele alt, final String refBasesStartingAtVCWithoutPad) { + if ( ! Allele.oneIsPrefixOfOther(ref, alt) ) + return false; // we require one allele be a prefix of another + + if ( ref.length() > alt.length() ) { // we are a deletion + return basesAreRepeated(ref.getBaseString(), alt.getBaseString(), refBasesStartingAtVCWithoutPad, 2); + } else { // we are an insertion + return basesAreRepeated(alt.getBaseString(), ref.getBaseString(), refBasesStartingAtVCWithoutPad, 1); + } + } + + protected static boolean basesAreRepeated(final String l, final String s, final String ref, final int minNumberOfMatches) { + final String potentialRepeat = l.substring(s.length()); // skip s bases + + for ( int i = 0; i < minNumberOfMatches; i++) { + final int start = i * potentialRepeat.length(); + final int end = (i+1) * potentialRepeat.length(); + if ( ref.length() < end ) + return false; // we ran out of bases to test + final String refSub = ref.substring(start, end); + if ( ! 
refSub.equals(potentialRepeat) ) + return false; // repeat didn't match, fail + } + + return true; // we passed all tests, we matched + } + + public enum GenotypeAssignmentMethod { + /** + * set all of the genotype GT values to NO_CALL + */ + SET_TO_NO_CALL, + + /** + * Use the subsetted PLs to greedily assigned genotypes + */ + USE_PLS_TO_ASSIGN, + + /** + * Try to match the original GT calls, if at all possible + * + * Suppose I have 3 alleles: A/B/C and the following samples: + * + * original_GT best_match to A/B best_match to A/C + * S1 => A/A A/A A/A + * S2 => A/B A/B A/A + * S3 => B/B B/B A/A + * S4 => B/C A/B A/C + * S5 => C/C A/A C/C + * + * Basically, all alleles not in the subset map to ref. It means that het-alt genotypes + * when split into 2 bi-allelic variants will be het in each, which is good in some cases, + * rather than the undetermined behavior when using the PLs to assign, which could result + * in hom-var or hom-ref for each, depending on the exact PL values. + */ + BEST_MATCH_TO_ORIGINAL, + + /** + * do not even bother changing the GTs + */ + DO_NOT_ASSIGN_GENOTYPES + } + + /** + * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) + * + * @param vc variant context with genotype likelihoods + * @param allelesToUse which alleles from the vc are okay to use; *** must be in the same relative order as those in the original VC *** + * @param assignGenotypes assignment strategy for the (subsetted) PLs + * @return a new non-null GenotypesContext + */ + public static GenotypesContext subsetDiploidAlleles(final VariantContext vc, + final List allelesToUse, + final GenotypeAssignmentMethod assignGenotypes) { + if ( allelesToUse.get(0).isNonReference() ) throw new IllegalArgumentException("First allele must be the reference allele"); + if ( allelesToUse.size() == 1 ) throw new IllegalArgumentException("Cannot subset to only 1 alt allele"); + + // optimization: if no input genotypes, just exit + if 
(vc.getGenotypes().isEmpty()) return GenotypesContext.create(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(vc, allelesToUse); + + // create the new genotypes + return createGenotypesWithSubsettedLikelihoods(vc.getGenotypes(), vc, allelesToUse, likelihoodIndexesToUse, assignGenotypes); + } + + /** + * Figure out which likelihood indexes to use for a selected down set of alleles + * + * @param originalVC the original VariantContext + * @param allelesToUse the subset of alleles to use + * @return a list of PL indexes to use or null if none + */ + private static List determineLikelihoodIndexesToUse(final VariantContext originalVC, final List allelesToUse) { + + // the bitset representing the allele indexes we want to keep + final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); + + // an optimization: if we are supposed to use all (or none in the case of a ref call) of the alleles, + // then we can keep the PLs as is; otherwise, we determine which ones to keep + if ( MathUtils.countOccurrences(true, alleleIndexesToUse) == alleleIndexesToUse.length ) + return null; + + return getLikelihoodIndexes(originalVC, alleleIndexesToUse); + } + + /** + * Get the actual likelihoods indexes to use given the corresponding allele indexes + * + * @param originalVC the original VariantContext + * @param alleleIndexesToUse the bitset representing the alleles to use (@see #getAlleleIndexBitset) + * @return a non-null List + */ + private static List getLikelihoodIndexes(final VariantContext originalVC, final boolean[] alleleIndexesToUse) { + + final List result = new ArrayList<>(30); + + // numLikelihoods takes total # of alleles. 
Use default # of chromosomes (ploidy) = 2 + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(originalVC.getNAlleles(), DEFAULT_PLOIDY); + + for ( int PLindex = 0; PLindex < numLikelihoods; PLindex++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + // consider this entry only if both of the alleles are good + if ( alleleIndexesToUse[alleles.alleleIndex1] && alleleIndexesToUse[alleles.alleleIndex2] ) + result.add(PLindex); + } + + return result; + } + + /** + * Given an original VariantContext and a list of alleles from that VC to keep, + * returns a bitset representing which allele indexes should be kept + * + * @param originalVC the original VC + * @param allelesToKeep the list of alleles to keep + * @return non-null bitset + */ + private static boolean[] getAlleleIndexBitset(final VariantContext originalVC, final List allelesToKeep) { + final int numOriginalAltAlleles = originalVC.getNAlleles() - 1; + final boolean[] alleleIndexesToKeep = new boolean[numOriginalAltAlleles + 1]; + + // the reference Allele is definitely still used + alleleIndexesToKeep[0] = true; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) { + if ( allelesToKeep.contains(originalVC.getAlternateAllele(i)) ) + alleleIndexesToKeep[i+1] = true; + } + + return alleleIndexesToKeep; + } + + /** + * Create the new GenotypesContext with the subsetted PLs and ADs + * + * @param originalGs the original GenotypesContext + * @param vc the original VariantContext + * @param allelesToUse the actual alleles to use with the new Genotypes + * @param likelihoodIndexesToUse the indexes in the PL to use given the allelesToUse (@see #determineLikelihoodIndexesToUse()) + * @param assignGenotypes assignment strategy for the (subsetted) PLs + * @return a new non-null GenotypesContext + */ + private static GenotypesContext createGenotypesWithSubsettedLikelihoods(final GenotypesContext originalGs, + final VariantContext vc, + final List 
allelesToUse, + final List likelihoodIndexesToUse, + final GenotypeAssignmentMethod assignGenotypes) { + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); + + // make sure we are seeing the expected number of likelihoods per sample + final int expectedNumLikelihoods = GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), 2); + + // the samples + final List sampleIndices = originalGs.getSampleNamesOrderedByName(); + + // create the new genotypes + for ( int k = 0; k < originalGs.size(); k++ ) { + final Genotype g = originalGs.get(sampleIndices.get(k)); + final GenotypeBuilder gb = new GenotypeBuilder(g); + + // create the new likelihoods array from the alleles we are allowed to use + double[] newLikelihoods; + if ( !g.hasLikelihoods() ) { + // we don't have any likelihoods, so we null out PLs and make G ./. + newLikelihoods = null; + gb.noPL(); + } else { + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + if ( likelihoodIndexesToUse == null ) { + newLikelihoods = originalLikelihoods; + } else if ( originalLikelihoods.length != expectedNumLikelihoods ) { + logger.debug("Wrong number of likelihoods in sample " + g.getSampleName() + " at " + vc + " got " + g.getLikelihoodsString() + " but expected " + expectedNumLikelihoods); + newLikelihoods = null; + } else { + newLikelihoods = new double[likelihoodIndexesToUse.size()]; + int newIndex = 0; + for ( final int oldIndex : likelihoodIndexesToUse ) + newLikelihoods[newIndex++] = originalLikelihoods[oldIndex]; + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) + gb.noPL(); + else + gb.PL(newLikelihoods); + } + + updateGenotypeAfterSubsetting(g.getAlleles(), gb, assignGenotypes, newLikelihoods, allelesToUse); + newGTs.add(gb.make()); + } + + return fixADFromSubsettedAlleles(newGTs, vc, allelesToUse); 
+ } + + private static boolean likelihoodsAreUninformative(final double[] likelihoods) { + return MathUtils.sum(likelihoods) > SUM_GL_THRESH_NOCALL; + } + + /** + * Add the genotype call (GT) field to GenotypeBuilder using the requested algorithm assignmentMethod + * + * @param originalGT the original genotype calls, cannot be null + * @param gb the builder where we should put our newly called alleles, cannot be null + * @param assignmentMethod the method to use to do the assignment, cannot be null + * @param newLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null + * @param allelesToUse the alleles we are using for our subsetting + */ + public static void updateGenotypeAfterSubsetting(final List originalGT, + final GenotypeBuilder gb, + final GenotypeAssignmentMethod assignmentMethod, + final double[] newLikelihoods, + final List allelesToUse) { + switch ( assignmentMethod ) { + case DO_NOT_ASSIGN_GENOTYPES: + break; + case SET_TO_NO_CALL: + gb.alleles(NO_CALL_ALLELES); + gb.noGQ(); + break; + case USE_PLS_TO_ASSIGN: + if ( newLikelihoods == null || likelihoodsAreUninformative(newLikelihoods) ) { + // if there is no mass on the (new) likelihoods, then just no-call the sample + gb.alleles(NO_CALL_ALLELES); + gb.noGQ(); + } else { + // find the genotype with maximum likelihoods + final int PLindex = MathUtils.maxElementIndex(newLikelihoods); + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + gb.alleles(Arrays.asList(allelesToUse.get(alleles.alleleIndex1), allelesToUse.get(alleles.alleleIndex2))); + gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); + } + break; + case BEST_MATCH_TO_ORIGINAL: + final List best = new LinkedList<>(); + final Allele ref = allelesToUse.get(0); // WARNING -- should be checked in input argument + for ( final Allele originalAllele : originalGT ) { + 
best.add(allelesToUse.contains(originalAllele) ? originalAllele : ref); + } + gb.noGQ(); + gb.noPL(); + gb.alleles(best); + break; + } + } + + /** + * Subset the samples in VC to reference only information with ref call alleles + * + * Preserves DP if present + * + * @param vc the variant context to subset down to + * @param ploidy ploidy to use if a genotype doesn't have any alleles + * @return a GenotypesContext + */ + public static GenotypesContext subsetToRefOnly(final VariantContext vc, final int ploidy) { + if ( vc == null ) throw new IllegalArgumentException("vc cannot be null"); + if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be >= 1 but got " + ploidy); + + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); + + // optimization: if no input genotypes, just exit + if (oldGTs.isEmpty()) return oldGTs; + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(oldGTs.size()); + + final Allele ref = vc.getReference(); + final List diploidRefAlleles = Arrays.asList(ref, ref); + + // create the new genotypes + for ( final Genotype g : vc.getGenotypes() ) { + final int gPloidy = g.getPloidy() == 0 ? ploidy : g.getPloidy(); + final List refAlleles = gPloidy == 2 ? 
diploidRefAlleles : Collections.nCopies(gPloidy, ref); + final GenotypeBuilder gb = new GenotypeBuilder(g.getSampleName(), refAlleles); + if ( g.hasDP() ) gb.DP(g.getDP()); + if ( g.hasGQ() ) gb.GQ(g.getGQ()); + newGTs.add(gb.make()); + } + + return newGTs; + } + + /** + * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * + * @param vc variant context with genotype likelihoods + * @return genotypes context + */ + public static GenotypesContext assignDiploidGenotypes(final VariantContext vc) { + return subsetDiploidAlleles(vc, vc.getAlleles(), GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); + } + + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For VC has A/B/C alleles, returns A/B and A/C contexts. + * Genotypes are all no-calls now (it's not possible to fix them easily) + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @return a list of bi-allelic (or monomorphic) variant context + */ + public static List splitVariantContextToBiallelics(final VariantContext vc) { + return splitVariantContextToBiallelics(vc, false, GenotypeAssignmentMethod.SET_TO_NO_CALL); + } + + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For VC has A/B/C alleles, returns A/B and A/C contexts. 
+ * Genotypes are all no-calls now (it's not possible to fix them easily) + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @param trimLeft if true, we will also left trim alleles, potentially moving the resulting vcs forward on the genome + * @return a list of bi-allelic (or monomorphic) variant context + */ + public static List splitVariantContextToBiallelics(final VariantContext vc, final boolean trimLeft, final GenotypeAssignmentMethod genotypeAssignmentMethod) { + if ( ! vc.isVariant() || vc.isBiallelic() ) + // non variant or biallelics already satisfy the contract + return Collections.singletonList(vc); + else { + final List biallelics = new LinkedList<>(); + + for ( final Allele alt : vc.getAlternateAlleles() ) { + VariantContextBuilder builder = new VariantContextBuilder(vc); + final List alleles = Arrays.asList(vc.getReference(), alt); + builder.alleles(alleles); + builder.genotypes(subsetDiploidAlleles(vc, alleles, genotypeAssignmentMethod)); + VariantContextUtils.calculateChromosomeCounts(builder, true); + final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true); + biallelics.add(trimmed); + } + + return biallelics; + } + } + + public static Genotype removePLsAndAD(final Genotype g) { + return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; + } + + //TODO consider refactor variant-context merging code so that we share as much as possible between + //TODO simpleMerge and referenceConfidenceMerge + //TODO likely using a separate helper class or hierarchy. + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. 
+ * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name + * + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @return new VariantContext representing the merge of unsortedVCs + */ + public static VariantContext simpleMerge(final Collection unsortedVCs, + final List priorityListOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { + int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); + return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC); + } + + /** + * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. + * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with + * the sample name. + * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use + * SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge. 
+ * + * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ + * + * @param unsortedVCs collection of unsorted VCs + * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs + * @param filteredRecordMergeType merge type for filtered records + * @param genotypeMergeOptions merge option for genotypes + * @param annotateOrigin should we annotate the set it came from? + * @param printMessages should we print messages? + * @param setKey the key name of the set + * @param filteredAreUncalled are filtered records uncalled? + * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? + * @return new VariantContext representing the merge of unsortedVCs + */ + public static VariantContext simpleMerge(final Collection unsortedVCs, + final List priorityListOfVCs, + final int originalNumOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { + if ( unsortedVCs == null || unsortedVCs.size() == 0 ) + return null; + + if (priorityListOfVCs != null && originalNumOfVCs != priorityListOfVCs.size()) + throw new IllegalArgumentException("the number of the original VariantContexts must be the same as the number of VariantContexts in the priority list"); + + if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) + throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); + + final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); + // Make sure all variant contexts are padded with reference base in case of indels if necessary + List VCs = new ArrayList<>(); + + for 
(final VariantContext vc : preFilteredVCs) { + if ( ! filteredAreUncalled || vc.isNotFiltered() ) + VCs.add(vc); + } + + if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled + return null; + + // establish the baseline info from the first VC + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + final Allele refAllele = determineReferenceAllele(VCs); + + final LinkedHashSet alleles = new LinkedHashSet<>(); + final Set filters = new HashSet<>(); + final Map attributes = new LinkedHashMap<>(); + final Set inconsistentAttributes = new HashSet<>(); + final Set variantSources = new HashSet<>(); // contains the set of sources we found in our set of VCs that are variant + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id + + VariantContext longestVC = first; + int depth = 0; + int maxAC = -1; + final Map attributesWithMaxAC = new LinkedHashMap<>(); + double log10PError = CommonInfo.NO_LOG10_PERROR; + boolean anyVCHadFiltersApplied = false; + VariantContext vcWithMaxAC = null; + GenotypesContext genotypes = GenotypesContext.create(); + + // counting the number of filtered and variant VCs + int nFiltered = 0; + + boolean remapped = false; + + // cycle through and add info from the other VCs, making sure the loc/reference matches + for ( final VariantContext vc : VCs ) { + if ( longestVC.getStart() != vc.getStart() ) + throw new IllegalStateException("BUG: attempting to merge VariantContexts with different start sites: first="+ first.toString() + " second=" + vc.toString()); + + if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) + longestVC = vc; // get the longest location + + nFiltered += vc.isFiltered() ? 
1 : 0; + if ( vc.isVariant() ) variantSources.add(vc.getSource()); + + AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); + remapped = remapped || alleleMapping.needsRemapping(); + + alleles.addAll(alleleMapping.values()); + + mergeGenotypes(genotypes, vc, alleleMapping, genotypeMergeOptions == GenotypeMergeType.UNIQUIFY); + + // We always take the QUAL of the first VC with a non-MISSING qual for the combined value + if ( log10PError == CommonInfo.NO_LOG10_PERROR ) + log10PError = vc.getLog10PError(); + + filters.addAll(vc.getFilters()); + anyVCHadFiltersApplied |= vc.filtersWereApplied(); + + // + // add attributes + // + // special case DP (add it up) and ID (just preserve it) + // + if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + if ( vc.hasID() ) rsIDs.add(vc.getID()); + if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { + String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); + // lets see if the string contains a "," separator + if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { + final List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); + for (final String alleleCount : alleleCountArray) { + final int ac = Integer.valueOf(alleleCount.trim()); + if (ac > maxAC) { + maxAC = ac; + vcWithMaxAC = vc; + } + } + } else { + final int ac = Integer.valueOf(rawAlleleCounts); + if (ac > maxAC) { + maxAC = ac; + vcWithMaxAC = vc; + } + } + } + + for (final Map.Entry p : vc.getAttributes().entrySet()) { + final String key = p.getKey(); + final Object value = p.getValue(); + // only output annotations that have the same value in every input VC + // if we don't like the key already, don't go anywhere + if ( ! 
inconsistentAttributes.contains(key) ) { + final boolean alreadyFound = attributes.containsKey(key); + final Object boundValue = attributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + + if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + inconsistentAttributes.add(key); + attributes.remove(key); + } else if ( ! alreadyFound || boundIsMissingValue ) { // no value + attributes.put(key, value); + } + } + } + } + + // if we have more alternate alleles in the merged VC than in one or more of the + // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD + for ( final VariantContext vc : VCs ) { + if (vc.getAlleles().size() == 1) + continue; + if ( hasPLIncompatibleAlleles(alleles, vc.getAlleles())) { + if ( ! genotypes.isEmpty() ) { + logger.debug(String.format("Stripping PLs at %s:%d-%d due to incompatible alleles merged=%s vs. 
single=%s", + vc.getChr(), vc.getStart(), vc.getEnd(), alleles, vc.getAlleles())); + } + genotypes = stripPLsAndAD(genotypes); + // this will remove stale AC,AF attributed from vc + VariantContextUtils.calculateChromosomeCounts(vc, attributes, true); + break; + } + } + + // take the VC with the maxAC and pull the attributes into a modifiable map + if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { + attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); + } + + // if at least one record was unfiltered and we want a union, clear all of the filters + if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) + filters.clear(); + + + if ( annotateOrigin ) { // we care about where the call came from + String setValue; + if ( nFiltered == 0 && variantSources.size() == originalNumOfVCs ) // nothing was unfiltered + setValue = MERGE_INTERSECTION; + else if ( nFiltered == VCs.size() ) // everything was filtered out + setValue = MERGE_FILTER_IN_ALL; + else if ( variantSources.isEmpty() ) // everyone was reference + setValue = MERGE_REF_IN_ALL; + else { + final LinkedHashSet s = new LinkedHashSet<>(); + for ( final VariantContext vc : VCs ) + if ( vc.isVariant() ) + s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); + setValue = Utils.join("-", s); + } + + if ( setKey != null ) { + attributes.put(setKey, setValue); + if( mergeInfoWithMaxAC && vcWithMaxAC != null ) { + attributesWithMaxAC.put(setKey, setValue); + } + } + } + + if ( depth > 0 ) + attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); + + final String ID = rsIDs.isEmpty() ? 
VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); + + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID); + builder.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()); + builder.alleles(alleles); + builder.genotypes(genotypes); + builder.log10PError(log10PError); + if ( anyVCHadFiltersApplied ) { + builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); + } + builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); + + // Trim the padded bases of all alleles if necessary + final VariantContext merged = builder.make(); + if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); + return merged; + } + + //TODO as part of a larger refactoring effort remapAlleles can be merged with createAlleleMapping. + + public static GenotypesContext stripPLsAndAD(final GenotypesContext genotypes) { + final GenotypesContext newGs = GenotypesContext.create(genotypes.size()); + + for ( final Genotype g : genotypes ) { + newGs.add(removePLsAndAD(g)); + } + + return newGs; + } + + /** + * Updates the PLs and AD of the Genotypes in the newly selected VariantContext to reflect the fact that some alleles + * from the original VariantContext are no longer present. 
+ * + * @param selectedVC the selected (new) VariantContext + * @param originalVC the original VariantContext + * @return a new non-null GenotypesContext + */ + public static GenotypesContext updatePLsAndAD(final VariantContext selectedVC, final VariantContext originalVC) { + final int numNewAlleles = selectedVC.getAlleles().size(); + final int numOriginalAlleles = originalVC.getAlleles().size(); + + // if we have more alternate alleles in the selected VC than in the original VC, then something is wrong + if ( numNewAlleles > numOriginalAlleles ) + throw new IllegalArgumentException("Attempting to fix PLs and AD from what appears to be a *combined* VCF and not a selected one"); + + final GenotypesContext oldGs = selectedVC.getGenotypes(); + + // if we have the same number of alternate alleles in the selected VC as in the original VC, then we don't need to fix anything + if ( numNewAlleles == numOriginalAlleles ) + return oldGs; + + return fixGenotypesFromSubsettedAlleles(oldGs, originalVC, selectedVC.getAlleles()); + } + + /** + * Fix the PLs and ADs for the GenotypesContext of a VariantContext that has been subset + * + * @param originalGs the original GenotypesContext + * @param originalVC the original VariantContext + * @param allelesToUse the new (sub)set of alleles to use + * @return a new non-null GenotypesContext + */ + static private GenotypesContext fixGenotypesFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final List likelihoodIndexesToUse = determineLikelihoodIndexesToUse(originalVC, allelesToUse); + + // create the new genotypes + return createGenotypesWithSubsettedLikelihoods(originalGs, originalVC, allelesToUse, likelihoodIndexesToUse, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES); + } + + /** + * Fix the AD for the GenotypesContext of a VariantContext that has been 
subset + * + * @param originalGs the original GenotypesContext + * @param originalVC the original VariantContext + * @param allelesToUse the new (sub)set of alleles to use + * @return a new non-null GenotypesContext + */ + static private GenotypesContext fixADFromSubsettedAlleles(final GenotypesContext originalGs, final VariantContext originalVC, final List allelesToUse) { + + // the bitset representing the allele indexes we want to keep + final boolean[] alleleIndexesToUse = getAlleleIndexBitset(originalVC, allelesToUse); + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(originalGs.size()); + + // the samples + final List sampleIndices = originalGs.getSampleNamesOrderedByName(); + + // create the new genotypes + for ( int k = 0; k < originalGs.size(); k++ ) { + final Genotype g = originalGs.get(sampleIndices.get(k)); + newGTs.add(fixAD(g, alleleIndexesToUse, allelesToUse.size())); + } + + return newGTs; + } + + /** + * Fix the AD for the given Genotype + * + * @param genotype the original Genotype + * @param alleleIndexesToUse a bitset describing whether or not to keep a given index + * @param nAllelesToUse how many alleles we are keeping + * @return a non-null Genotype + */ + private static Genotype fixAD(final Genotype genotype, final boolean[] alleleIndexesToUse, final int nAllelesToUse) { + // if it ain't broke don't fix it + if ( !genotype.hasAD() ) + return genotype; + + final GenotypeBuilder builder = new GenotypeBuilder(genotype); + + final int[] oldAD = genotype.getAD(); + if ( oldAD.length != alleleIndexesToUse.length ) { + builder.noAD(); + } else { + final int[] newAD = new int[nAllelesToUse]; + int currentIndex = 0; + for ( int i = 0; i < oldAD.length; i++ ) { + if ( alleleIndexesToUse[i] ) + newAD[currentIndex++] = oldAD[i]; + } + builder.AD(newAD); + } + return builder.make(); + } + + private static Allele determineReferenceAllele(final List VCs) { + return determineReferenceAllele(VCs, null); + } + + 
public static boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) { + return loc == null || loc.getStart() == vc.getStart(); + } + + static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final LinkedHashSet allAlleles) { + if ( refAllele.equals(vc.getReference()) ) + return new AlleleMapper(vc); + else { + final Map map = createAlleleMapping(refAllele, vc, allAlleles); + map.put(vc.getReference(), refAllele); + return new AlleleMapper(map); + } + } + + //TODO as part of a larger refactoring effort {@link #createAlleleMapping} can be merged with {@link ReferenceConfidenceVariantContextMerger#remapAlleles}. + /** + * Create an allele mapping for the given context where its reference allele must (potentially) be extended to the given allele + * + * The refAllele is the longest reference allele seen at this start site. + * So imagine it is: + * refAllele: ACGTGA + * myRef: ACGT + * myAlt: A + * + * We need to remap all of the alleles in vc to include the extra GA so that + * myRef => refAllele and myAlt => AGA + * + * @param refAllele the new (extended) reference allele + * @param oneVC the Variant Context to extend + * @param currentAlleles the list of alleles already created + * @return a non-null mapping of original alleles to new (extended) ones + */ + private static Map createAlleleMapping(final Allele refAllele, + final VariantContext oneVC, + final Collection currentAlleles) { + final Allele myRef = oneVC.getReference(); + if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); + + final byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); + + final Map map = new HashMap<>(); + for ( final Allele a : oneVC.getAlternateAlleles() ) { + if ( isUsableAlternateAllele(a) ) { + Allele extended = Allele.extend(a, extraBases); + for ( final Allele b : currentAlleles ) + if ( 
extended.equals(b) ) + extended = b; + map.put(a, extended); + } + } + + return map; + } + + static private boolean isUsableAlternateAllele(final Allele allele) { + return ! (allele.isReference() || allele.isSymbolic() ); + } + + public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { + if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) + throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); + + if ( priorityListOfVCs == null || mergeOption == GenotypeMergeType.UNSORTED ) + return new ArrayList<>(unsortedVCs); + else { + ArrayList sorted = new ArrayList<>(unsortedVCs); + Collections.sort(sorted, new CompareByPriority(priorityListOfVCs)); + return sorted; + } + } + + private static void mergeGenotypes(GenotypesContext mergedGenotypes, VariantContext oneVC, AlleleMapper alleleMapping, boolean uniquifySamples) { + //TODO: should we add a check for cases when the genotypeMergeOption is REQUIRE_UNIQUE + for ( final Genotype g : oneVC.getGenotypes() ) { + final String name = mergedSampleName(oneVC.getSource(), g.getSampleName(), uniquifySamples); + if ( ! mergedGenotypes.containsSample(name) ) { + // only add if the name is new + Genotype newG = g; + + if ( uniquifySamples || alleleMapping.needsRemapping() ) { + final List alleles = alleleMapping.needsRemapping() ? alleleMapping.remap(g.getAlleles()) : g.getAlleles(); + newG = new GenotypeBuilder(g).name(name).alleles(alleles).make(); + } + + mergedGenotypes.add(newG); + } + } + } + + /** + * Cached NO_CALL immutable lists where the position ith contains the list with i elements. 
+ */ + private static List[] NOCALL_LISTS = new List[] { + Collections.emptyList(), + Collections.singletonList(Allele.NO_CALL), + Collections.nCopies(2,Allele.NO_CALL) + }; + + /** + * Synchronized code to ensure that {@link #NOCALL_LISTS} has enough entries beyod the requested ploidy + * @param capacity the requested ploidy. + */ + private static synchronized void ensureNoCallListsCapacity(final int capacity) { + final int currentCapacity = NOCALL_LISTS.length - 1; + if (currentCapacity >= capacity) + return; + NOCALL_LISTS = Arrays.copyOf(NOCALL_LISTS,Math.max(capacity,currentCapacity << 1) + 1); + for (int i = currentCapacity + 1; i < NOCALL_LISTS.length; i++) + NOCALL_LISTS[i] = Collections.nCopies(i,Allele.NO_CALL); + } + + /** + * Returns a {@link Allele#NO_CALL NO_CALL} allele list provided the ploidy. + * + * @param ploidy the required ploidy. + * + * @return never {@code null}, but an empty list if {@code ploidy} is equal or less than 0. The returned list + * might or might not be mutable. + */ + public static List noCallAlleles(final int ploidy) { + if (NOCALL_LISTS.length <= ploidy) + ensureNoCallListsCapacity(ploidy); + return NOCALL_LISTS[ploidy]; + } + + + /** + * This is just a safe wrapper around GenotypeLikelihoods.calculatePLindex() + * + * @param originalIndex1 the index of the first allele + * @param originalIndex2 the index of the second allele + * @return the PL index + */ + protected static int calculatePLindexFromUnorderedIndexes(final int originalIndex1, final int originalIndex2) { + // we need to make sure they are ordered correctly + return ( originalIndex2 < originalIndex1 ) ? GenotypeLikelihoods.calculatePLindex(originalIndex2, originalIndex1) : GenotypeLikelihoods.calculatePLindex(originalIndex1, originalIndex2); + } + + public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { + return uniquify ? sampleName + "." 
+ trackName : sampleName; + } + + /** + * Trim the alleles in inputVC from the reverse direction + * + * @param inputVC a non-null input VC whose alleles might need a haircut + * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up + */ + public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { + return trimAlleles(inputVC, false, true); + } + + /** + * Trim the alleles in inputVC from the forward direction + * + * @param inputVC a non-null input VC whose alleles might need a haircut + * @return a non-null VariantContext (may be == to inputVC) with alleles trimmed up + */ + public static VariantContext forwardTrimAlleles( final VariantContext inputVC ) { + return trimAlleles(inputVC, true, false); + } + + /** + * Trim the alleles in inputVC forward and reverse, as requested + * + * @param inputVC a non-null input VC whose alleles might need a haircut + * @param trimForward should we trim up the alleles from the forward direction? + * @param trimReverse should we trim up the alleles from the reverse direction? + * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles + */ + @Ensures("result != null") + public static VariantContext trimAlleles(final VariantContext inputVC, final boolean trimForward, final boolean trimReverse) { + if ( inputVC == null ) throw new IllegalArgumentException("inputVC cannot be null"); + + if ( inputVC.getNAlleles() <= 1 || inputVC.isSNP() ) + return inputVC; + + // see whether we need to trim common reference base from all alleles + final int revTrim = trimReverse ? computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes()) : 0; + final VariantContext revTrimVC = trimAlleles(inputVC, -1, revTrim); + final int fwdTrim = trimForward ? 
computeForwardClipping(revTrimVC.getAlleles()) : -1; + final VariantContext vc= trimAlleles(revTrimVC, fwdTrim, 0); + return vc; + } + + /** + * Trim up alleles in inputVC, cutting out all bases up to fwdTrimEnd inclusive and + * the last revTrim bases from the end + * + * @param inputVC a non-null input VC + * @param fwdTrimEnd bases up to this index (can be -1) will be removed from the start of all alleles + * @param revTrim the last revTrim bases of each allele will be clipped off as well + * @return a non-null VariantContext (may be == to inputVC) with trimmed up alleles + */ + @Requires({"inputVC != null"}) + @Ensures("result != null") + protected static VariantContext trimAlleles(final VariantContext inputVC, + final int fwdTrimEnd, + final int revTrim) { + if( fwdTrimEnd == -1 && revTrim == 0 ) // nothing to do, so just return inputVC unmodified + return inputVC; + + final List alleles = new LinkedList<>(); + final Map originalToTrimmedAlleleMap = new HashMap<>(); + + for (final Allele a : inputVC.getAlleles()) { + if (a.isSymbolic()) { + alleles.add(a); + originalToTrimmedAlleleMap.put(a, a); + } else { + // get bases for current allele and create a new one with trimmed bases + final byte[] newBases = Arrays.copyOfRange(a.getBases(), fwdTrimEnd+1, a.length()-revTrim); + final Allele trimmedAllele = Allele.create(newBases, a.isReference()); + alleles.add(trimmedAllele); + originalToTrimmedAlleleMap.put(a, trimmedAllele); + } + } + + // now we can recreate new genotypes with trimmed alleles + final AlleleMapper alleleMapper = new AlleleMapper(originalToTrimmedAlleleMap); + final GenotypesContext genotypes = updateGenotypesWithMappedAlleles(inputVC.getGenotypes(), alleleMapper); + + final int start = inputVC.getStart() + (fwdTrimEnd + 1); + final VariantContextBuilder builder = new VariantContextBuilder(inputVC); + builder.start(start); + builder.stop(start + alleles.get(0).length() - 1); + builder.alleles(alleles); + builder.genotypes(genotypes); + return 
builder.make(); + } + + @Requires("originalGenotypes != null && alleleMapper != null") + protected static GenotypesContext updateGenotypesWithMappedAlleles(final GenotypesContext originalGenotypes, final AlleleMapper alleleMapper) { + final GenotypesContext updatedGenotypes = GenotypesContext.create(originalGenotypes.size()); + + for ( final Genotype genotype : originalGenotypes ) { + final List updatedAlleles = alleleMapper.remap(genotype.getAlleles()); + updatedGenotypes.add(new GenotypeBuilder(genotype).alleles(updatedAlleles).make()); + } + + return updatedGenotypes; + } + + public static int computeReverseClipping(final List unclippedAlleles, final byte[] ref) { + int clipping = 0; + boolean stillClipping = true; + + while ( stillClipping ) { + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + continue; + + // we need to ensure that we don't reverse clip out all of the bases from an allele because we then will have the wrong + // position set for the VariantContext (although it's okay to forward clip it all out, because the position will be fine). + if ( a.length() - clipping == 0 ) + return clipping - 1; + + if ( a.length() - clipping <= 0 || a.length() == 0 ) { + stillClipping = false; + } + else if ( ref.length == clipping ) { + return -1; + } + else if ( a.getBases()[a.length()-clipping-1] != ref[ref.length-clipping-1] ) { + stillClipping = false; + } + } + if ( stillClipping ) + clipping++; + } + + return clipping; + } + + /** + * Clip out any unnecessary bases off the front of the alleles + * + * The VCF spec represents alleles as block substitutions, replacing AC with A for a + * 1 bp deletion of the C. However, it's possible that we'd end up with alleles that + * contain extra bases on the left, such as GAC/GA to represent the same 1 bp deletion. + * This routine finds an offset among all alleles that can be safely trimmed + * off the left of each allele and still represent the same block substitution. 
+ * + * A/C => A/C + * AC/A => AC/A + * ACC/AC => CC/C + * AGT/CAT => AGT/CAT + * /C => /C + * + * @param unclippedAlleles a non-null list of alleles that we want to clip + * @return the offset into the alleles where we can safely clip, inclusive, or + * -1 if no clipping is tolerated. So, if the result is 0, then we can remove + * the first base of every allele. If the result is 1, we can remove the + * second base. + */ + public static int computeForwardClipping(final List unclippedAlleles) { + // cannot clip unless there's at least 1 alt allele + if ( unclippedAlleles.size() <= 1 ) + return -1; + + // we cannot forward clip any set of alleles containing a symbolic allele + int minAlleleLength = Integer.MAX_VALUE; + for ( final Allele a : unclippedAlleles ) { + if ( a.isSymbolic() ) + return -1; + minAlleleLength = Math.min(minAlleleLength, a.length()); + } + + final byte[] firstAlleleBases = unclippedAlleles.get(0).getBases(); + int indexOflastSharedBase = -1; + + // the -1 to the stop is that we can never clip off the right most base + for ( int i = 0; i < minAlleleLength - 1; i++) { + final byte base = firstAlleleBases[i]; + + for ( final Allele allele : unclippedAlleles ) { + if ( allele.getBases()[i] != base ) + return indexOflastSharedBase; + } + + indexOflastSharedBase = i; + } + + return indexOflastSharedBase; + } + + public static double computeHardyWeinbergPvalue(VariantContext vc) { + if ( vc.getCalledChrCount() == 0 ) + return 0.0; + return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); + } + + public static boolean requiresPaddingBase(final List alleles) { + + // see whether one of the alleles would be null if trimmed through + + for ( final String allele : alleles ) { + if ( allele.isEmpty() ) + return true; + } + + int clipping = 0; + Character currentBase = null; + + while ( true ) { + for ( final String allele : alleles ) { + if ( allele.length() - clipping == 0 ) + return true; + + char myBase = 
allele.charAt(clipping); + if ( currentBase == null ) + currentBase = myBase; + else if ( currentBase != myBase ) + return false; + } + + clipping++; + currentBase = null; + } + } + + private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { + Map attributes = new HashMap<>(keysToPreserve.size()); + for ( final String key : keysToPreserve ) { + if ( igc.hasAttribute(key) ) + attributes.put(key, igc.getAttribute(key)); + } + return attributes; + } + + /** + * @deprecated use variant context builder version instead + * @param vc the variant context + * @param keysToPreserve the keys to preserve + * @return a pruned version of the original variant context + */ + @Deprecated + public static VariantContext pruneVariantContext(final VariantContext vc, Collection keysToPreserve ) { + return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make(); + } + + public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection keysToPreserve ) { + final VariantContext vc = builder.make(); + if ( keysToPreserve == null ) keysToPreserve = Collections.emptyList(); + + // VC info + final Map attributes = subsetAttributes(vc.getCommonInfo(), keysToPreserve); + + // Genotypes + final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype g : vc.getGenotypes() ) { + final GenotypeBuilder gb = new GenotypeBuilder(g); + // remove AD, DP, PL, and all extended attributes, keeping just GT and GQ + gb.noAD().noDP().noPL().noAttributes(); + genotypes.add(gb.make()); + } + + return builder.genotypes(genotypes).attributes(attributes); + } + + public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { + // if all alleles of vc1 are a contained in alleles of vc2, return true + if (!vc1.getReference().equals(vc2.getReference())) + return false; + + for (final Allele a :vc1.getAlternateAlleles()) { + if (!vc2.getAlternateAlleles().contains(a)) 
+ return false; + } + + return true; + } + + public static Map> separateVariantContextsByType( final Collection VCs ) { + if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null."); } + + final HashMap> mappedVCs = new HashMap<>(); + for ( final VariantContext vc : VCs ) { + VariantContext.Type vcType = vc.getType(); + + // look at previous variant contexts of different type. If: + // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list + // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) + // c) neither: do nothing, just add vc to its own list + boolean addtoOwnList = true; + for (final VariantContext.Type type : VariantContext.Type.values()) { + if (type.equals(vcType)) + continue; + + if (!mappedVCs.containsKey(type)) + continue; + + List vcList = mappedVCs.get(type); + for (int k=0; k < vcList.size(); k++) { + VariantContext otherVC = vcList.get(k); + if (allelesAreSubset(otherVC,vc)) { + // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list + vcList.remove(k); + // avoid having empty lists + if (vcList.size() == 0) + mappedVCs.remove(type); + if ( !mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(otherVC); + break; + } + else if (allelesAreSubset(vc,otherVC)) { + // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own + mappedVCs.get(type).add(vc); + addtoOwnList = false; + break; + } + } + } + if (addtoOwnList) { + if ( !mappedVCs.containsKey(vcType) ) + mappedVCs.put(vcType, new ArrayList()); + mappedVCs.get(vcType).add(vc); + } + } + + return mappedVCs; + } + + public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { + if ( 
allowedAttributes == null ) + return vc; + + final GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + final Map attrs = new HashMap<>(); + for ( final Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { + if ( allowedAttributes.contains(attr.getKey()) ) + attrs.put(attr.getKey(), attr.getValue()); + } + newGenotypes.add(new GenotypeBuilder(genotype).attributes(attrs).make()); + } + + return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); + } + + protected static class AlleleMapper { + private VariantContext vc = null; + private Map map = null; + public AlleleMapper(VariantContext vc) { this.vc = vc; } + public AlleleMapper(Map map) { this.map = map; } + public boolean needsRemapping() { return this.map != null; } + public Collection values() { return map != null ? map.values() : vc.getAlleles(); } + public Allele remap(Allele a) { return map != null && map.containsKey(a) ? map.get(a) : a; } + + public List remap(List as) { + List newAs = new ArrayList<>(); + for ( final Allele a : as ) { + //System.out.printf(" Remapping %s => %s%n", a, remap(a)); + newAs.add(remap(a)); + } + return newAs; + } + + /** + * @return the list of unique values + */ + public List getUniqueMappedAlleles() { + if ( map == null ) + return Collections.emptyList(); + return new ArrayList<>(new HashSet<>(map.values())); + } + } + + private static class CompareByPriority implements Comparator, Serializable { + List priorityListOfVCs; + public CompareByPriority(List priorityListOfVCs) { + this.priorityListOfVCs = priorityListOfVCs; + } + + private int getIndex(VariantContext vc) { + int i = priorityListOfVCs.indexOf(vc.getSource()); + if ( i == -1 ) throw new IllegalArgumentException("Priority list " + priorityListOfVCs + " doesn't contain variant context " + vc.getSource()); + return i; + } + + public int compare(VariantContext vc1, VariantContext vc2) { + return 
Integer.valueOf(getIndex(vc1)).compareTo(getIndex(vc2)); + } + } + + /** + * For testing purposes only. Create a site-only VariantContext at contig:start containing alleles + * + * @param name the name of the VC + * @param contig the contig for the VC + * @param start the start of the VC + * @param alleleStrings a non-null, non-empty list of strings for the alleles. The first will be the ref allele, and others the + * alt. Will compute the stop of the VC from the length of the reference allele + * @return a non-null VariantContext + */ + public static VariantContext makeFromAlleles(final String name, final String contig, final int start, final List alleleStrings) { + if ( alleleStrings == null || alleleStrings.isEmpty() ) + throw new IllegalArgumentException("alleleStrings must be non-empty, non-null list"); + + final List alleles = new LinkedList<>(); + final int length = alleleStrings.get(0).length(); + + boolean first = true; + for ( final String alleleString : alleleStrings ) { + alleles.add(Allele.create(alleleString, first)); + first = false; + } + return new VariantContextBuilder(name, contig, start, start+length-1, alleles).make(); + } + + /** + * Splits the alleles for the provided variant context into its primitive parts. + * Requires that the input VC be bi-allelic, so calling methods should first call splitVariantContextToBiallelics() if needed. + * Currently works only for MNPs. 
+ * + * @param vc the non-null VC to split + * @return a non-empty list of VCs split into primitive parts or the original VC otherwise + */ + public static List splitIntoPrimitiveAlleles(final VariantContext vc) { + if ( vc == null ) + throw new IllegalArgumentException("Trying to break a null Variant Context into primitive parts"); + + if ( !vc.isBiallelic() ) + throw new IllegalArgumentException("Trying to break a multi-allelic Variant Context into primitive parts"); + + // currently only works for MNPs + if ( !vc.isMNP() ) + return Arrays.asList(vc); + + final byte[] ref = vc.getReference().getBases(); + final byte[] alt = vc.getAlternateAllele(0).getBases(); + + if ( ref.length != alt.length ) + throw new IllegalStateException("ref and alt alleles for MNP have different lengths"); + + final List result = new ArrayList<>(ref.length); + + for ( int i = 0; i < ref.length; i++ ) { + + // if the ref and alt bases are different at a given position, create a new SNP record (otherwise do nothing) + if ( ref[i] != alt[i] ) { + + // create the ref and alt SNP alleles + final Allele newRefAllele = Allele.create(ref[i], true); + final Allele newAltAllele = Allele.create(alt[i], false); + + // create a new VariantContext with the new SNP alleles + final VariantContextBuilder newVC = new VariantContextBuilder(vc).start(vc.getStart() + i).stop(vc.getStart() + i).alleles(Arrays.asList(newRefAllele, newAltAllele)); + + // create new genotypes with updated alleles + final Map alleleMap = new HashMap<>(); + alleleMap.put(vc.getReference(), newRefAllele); + alleleMap.put(vc.getAlternateAllele(0), newAltAllele); + final GenotypesContext newGenotypes = updateGenotypesWithMappedAlleles(vc.getGenotypes(), new AlleleMapper(alleleMap)); + + result.add(newVC.genotypes(newGenotypes).make()); + } + } + + if ( result.isEmpty() ) + result.add(vc); + + return result; + } + + /** + * Are vc1 and 2 equal including their position and alleles? 
+ * @param vc1 non-null VariantContext + * @param vc2 non-null VariantContext + * @return true if vc1 and vc2 are equal, false otherwise + */ + public static boolean equalSites(final VariantContext vc1, final VariantContext vc2) { + if ( vc1 == null ) throw new IllegalArgumentException("vc1 cannot be null"); + if ( vc2 == null ) throw new IllegalArgumentException("vc2 cannot be null"); + + if ( vc1.getStart() != vc2.getStart() ) return false; + if ( vc1.getEnd() != vc2.getEnd() ) return false; + if ( ! vc1.getChr().equals(vc2.getChr())) return false; + if ( ! vc1.getAlleles().equals(vc2.getAlleles()) ) return false; + return true; + } + + /** + * Returns the absolute 0-based index of an allele. + * + *

+ * If the allele is equal to the reference, the result is 0, if it equal to the first alternative the result is 1 + * and so forth. + *

+ * Therefore if you want the 0-based index within the alternative alleles you need to do the following: + * + *

+ * You can indicate whether the Java object reference comparator {@code ==} can be safelly used by setting {@code useEquals} to {@code false}. + * + * @param vc the target variant context. + * @param allele the target allele. + * @param ignoreRefState whether the reference states of the allele is important at all. Has no effect if {@code useEquals} is {@code false}. + * @param considerRefAllele whether the reference allele should be considered. You should set it to {@code false} if you are only interested in alternative alleles. + * @param useEquals whether equal method should be used in the search: {@link Allele#equals(Allele,boolean)}. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * @return {@code -1} if there is no such allele that satify those criteria, a value between 0 and {@link VariantContext#getNAlleles()} {@code -1} otherwise. + */ + public static int indexOfAllele(final VariantContext vc, final Allele allele, final boolean ignoreRefState, final boolean considerRefAllele, final boolean useEquals) { + if (allele == null) throw new IllegalArgumentException(); + return useEquals ? indexOfEqualAllele(vc,allele,ignoreRefState,considerRefAllele) : indexOfSameAllele(vc,allele,considerRefAllele); + } + + /** + * Returns the relative 0-based index of an alternative allele. + *

+ * The the query allele is the same as the first alternative allele, the result is 0, + * if it is equal to the second 1 and so forth. + * + * + *

+ * Notice that the ref-status of the query {@code allele} is ignored. + * + * @param vc the target variant context. + * @param allele the query allele. + * @param useEquals whether equal method should be used in the search: {@link Allele#equals(Allele,boolean)}. + * + * @throws IllegalArgumentException if {@code allele} is {@code null}. + * + * @return {@code -1} if there is no such allele that satify those criteria, a value between 0 and the number + * of alternative alleles - 1. + */ + public static int indexOfAltAllele(final VariantContext vc, final Allele allele, final boolean useEquals) { + final int absoluteIndex = indexOfAllele(vc,allele,true,false,useEquals); + return absoluteIndex == -1 ? -1 : absoluteIndex - 1; + } + + // Impements index search using equals. + private static int indexOfEqualAllele(final VariantContext vc, final Allele allele, final boolean ignoreRefState, + final boolean considerRefAllele) { + int i = 0; + for (final Allele a : vc.getAlleles()) + if (a.equals(allele,ignoreRefState)) + return i == 0 ? (considerRefAllele ? 0 : -1) : i; + else + i++; + return -1; + } + + // Implements index search using ==. + private static int indexOfSameAllele(final VariantContext vc, final Allele allele, final boolean considerRefAllele) { + int i = 0; + + for (final Allele a : vc.getAlleles()) + if (a == allele) + return i == 0 ? (considerRefAllele ? 
0 : -1) : i; + else + i++; + + return -1; + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/HomoSapiensConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/HomoSapiensConstants.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/variant/HomoSapiensConstants.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/HomoSapiensConstants.java diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/VCIterable.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/VCIterable.java new file mode 100644 index 000000000..3263d9bf6 --- /dev/null +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/VCIterable.java @@ -0,0 +1,92 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.variant; + +import htsjdk.tribble.FeatureCodec; +import htsjdk.tribble.FeatureCodecHeader; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFHeader; +import org.broadinstitute.gatk.utils.collections.Pair; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.Iterator; + +/* +* NOTE: Refactored out of GATKVCFUtils +*/ +public class VCIterable implements Iterable, Iterator { + final SOURCE source; + final FeatureCodec codec; + final VCFHeader header; + + VCIterable(final SOURCE source, final FeatureCodec codec, final VCFHeader header) { + this.source = source; + this.codec = codec; + this.header = header; + } + + /** + * Utility class to read all of the VC records from a file + * + * @param file + * @param codec + * @return + * @throws java.io.IOException + */ + public final static Pair> readAllVCs( final File file, final FeatureCodec codec) throws IOException { + // read in the features + SOURCE source = codec.makeSourceFromStream(new FileInputStream(file)); + FeatureCodecHeader header = codec.readHeader(source); + final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); + return new Pair<>(vcfHeader, new VCIterable<>(source, codec, vcfHeader)); + } + + @Override + public Iterator iterator() { + return this; + } + + @Override + public boolean hasNext() { + return ! codec.isDone(source); + } + + @Override + public VariantContext next() { + try { + final VariantContext vc = codec.decode(source); + return vc == null ? 
null : vc.fullyDecode(header, false); + } catch ( IOException e ) { + throw new RuntimeException(e); + } + } + + @Override + public void remove() { + } +} diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/wiggle/WiggleHeader.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/wiggle/WiggleHeader.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/wiggle/WiggleHeader.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/wiggle/WiggleHeader.java diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/wiggle/WiggleWriter.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/wiggle/WiggleWriter.java similarity index 100% rename from public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/utils/wiggle/WiggleWriter.java rename to public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/wiggle/WiggleWriter.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/AutoFormattingTimeUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/AutoFormattingTimeUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/AutoFormattingTimeUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/AutoFormattingTimeUnitTest.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java new file mode 100644 index 000000000..5689dbc4c --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseTest.java @@ -0,0 +1,565 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the 
Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils; + +import htsjdk.tribble.Tribble; +import htsjdk.tribble.readers.LineIterator; +import htsjdk.tribble.readers.PositionalBufferedStream; +import htsjdk.tribble.util.TabixUtils; +import htsjdk.variant.bcf2.BCF2Codec; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCodec; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import org.apache.log4j.AppenderSkeleton; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.PatternLayout; +import org.apache.log4j.spi.LoggingEvent; +import org.broadinstitute.gatk.utils.variant.VCIterable; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.commandline.CommandLineUtils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.io.IOUtils; +import org.testng.Assert; +import org.testng.Reporter; +import 
org.testng.SkipException; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.*; + +/** + * + * User: aaron + * Date: Apr 14, 2009 + * Time: 10:24:30 AM + * + * The Broad Institute + * SOFTWARE COPYRIGHT NOTICE AGREEMENT + * This software and its documentation are copyright 2009 by the + * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. + * + * This software is supplied without any warranty or guaranteed support whatsoever. Neither + * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. + * + */ + + +/** + * @author aaron + * @version 1.0 + * @date Apr 14, 2009 + *

+ * Class BaseTest + *

+ * This is the base test class for all of our test cases. All test cases should extend from this + * class; it sets up the logger, and resolves the location of directories that we rely on. + */ +@SuppressWarnings("unchecked") +public abstract class BaseTest { + /** our log, which we want to capture anything from org.broadinstitute.sting */ + public static final Logger logger = CommandLineUtils.getStingLogger(); + + private static final String CURRENT_DIRECTORY = System.getProperty("user.dir"); + public static final String gatkDirectory = System.getProperty("gatkdir", CURRENT_DIRECTORY) + "/"; + public static final String baseDirectory = System.getProperty("basedir", CURRENT_DIRECTORY) + "/"; + public static final String testType = System.getProperty("testType"); // May be null + public static final String testTypeSubDirectory = testType == null ? "" : ("/" + testType); // May be empty + + public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; + public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; + public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta"; + //public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"; + public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; + public static final String b37KGReferenceWithDecoy = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/human_g1k_v37_decoy.fasta"; + public static final String hg19RefereneWithChrPrefixInChromosomeNames = "/humgen/gsa-hpprojects/GATK/bundle/current/hg19/ucsc.hg19.fasta"; + public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; + public static final String validationDataLocation = GATKDataLocation + "Validation_Data/"; + public static final String evaluationDataLocation = GATKDataLocation + "Evaluation_Data/"; + public 
static final String comparisonDataLocation = GATKDataLocation + "Comparisons/"; + public static final String annotationDataLocation = GATKDataLocation + "Annotations/"; + + public static final String b37GoodBAM = validationDataLocation + "/CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; + public static final String b37GoodNA12878BAM = validationDataLocation + "/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam"; + public static final String b37_NA12878_OMNI = validationDataLocation + "/NA12878.omni.vcf"; + + public static final String dbsnpDataLocation = GATKDataLocation; + public static final String b36dbSNP129 = dbsnpDataLocation + "dbsnp_129_b36.vcf"; + public static final String b37dbSNP129 = dbsnpDataLocation + "dbsnp_129_b37.vcf"; + public static final String b37dbSNP132 = dbsnpDataLocation + "dbsnp_132_b37.vcf"; + public static final String b37dbSNP138 = "/humgen/gsa-hpprojects/GATK/bundle/current/b37/dbsnp_138.b37.vcf"; + public static final String hg18dbSNP132 = dbsnpDataLocation + "dbsnp_132.hg18.vcf"; + + public static final String hapmapDataLocation = comparisonDataLocation + "Validated/HapMap/3.3/"; + public static final String b37hapmapGenotypes = hapmapDataLocation + "genotypes_r27_nr.b37_fwd.vcf"; + + public static final String intervalsLocation = "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/"; + public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; + public static final String hg19Chr20Intervals = GATKDataLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; + + public static final boolean REQUIRE_NETWORK_CONNECTION = false; + private static final String networkTempDirRoot = "/broad/hptmp/"; + private static final boolean networkTempDirRootExists = new File(networkTempDirRoot).exists(); + private static final File networkTempDirFile; + + private static final String 
privateTestDirRelative = "private/gatk-tools-private/src/test/resources/"; + public static final String privateTestDir = new File(gatkDirectory, privateTestDirRelative).getAbsolutePath() + "/"; + protected static final String privateTestDirRoot = privateTestDir.replace(privateTestDirRelative, ""); + + private static final String publicTestDirRelative = "public/gatk-utils/src/test/resources/"; + public static final String publicTestDir = new File(gatkDirectory, publicTestDirRelative).getAbsolutePath() + "/"; + protected static final String publicTestDirRoot = publicTestDir.replace(publicTestDirRelative, ""); + + public static final String keysDataLocation = validationDataLocation + "keys/"; + + public static final String exampleFASTA = publicTestDir + "exampleFASTA.fasta"; + + public final static String NA12878_PCRFREE = privateTestDir + "PCRFree.2x250.Illumina.20_10_11.bam"; + public final static String NA12878_WEx = privateTestDir + "CEUTrio.HiSeq.WEx.b37_decoy.NA12878.20_10_11mb.bam"; + + public static final boolean queueTestRunModeIsSet = System.getProperty("queuetest.run", "").equals("true"); + + /** before the class starts up */ + static { + // setup a basic log configuration + CommandLineUtils.configureConsoleLogging(); + + // setup our log layout + PatternLayout layout = new PatternLayout(); + layout.setConversionPattern("TEST %C{1}.%M - %d{HH:mm:ss,SSS} - %m%n"); + + // now set the layout of all the loggers to our layout + CommandLineUtils.setLayout(logger, layout); + + // Set the Root logger to only output warnings. 
+ logger.setLevel(Level.WARN); + + if (networkTempDirRootExists) { + networkTempDirFile = IOUtils.tempDir("temp.", ".dir", new File(networkTempDirRoot + System.getProperty("user.name"))); + networkTempDirFile.deleteOnExit(); + } else { + networkTempDirFile = null; + } + + + if ( REQUIRE_NETWORK_CONNECTION ) { + // find our file sources + if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { + logger.fatal("We can't locate the reference directories. Aborting!"); + throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); + } + } + } + + /** + * Simple generic utility class to creating TestNG data providers: + * + * 1: inherit this class, as in + * + * private class SummarizeDifferenceTest extends TestDataProvider { + * public SummarizeDifferenceTest() { + * super(SummarizeDifferenceTest.class); + * } + * ... + * } + * + * Provide a reference to your class to the TestDataProvider constructor. + * + * 2: Create instances of your subclass. Return from it the call to getTests, providing + * the class type of your test + * + * + * {@literal @}DataProvider(name = "summaries") + * public Object[][] createSummaries() { + * new SummarizeDifferenceTest().addDiff("A", "A").addSummary("A:2"); + * new SummarizeDifferenceTest().addDiff("A", "B").addSummary("A:1", "B:1"); + * return SummarizeDifferenceTest.getTests(SummarizeDifferenceTest.class); + * } + * + * + * This class magically tracks created objects of this + */ + public static class TestDataProvider { + private static final Map> tests = new HashMap<>(); + protected String name; + + /** + * Create a new TestDataProvider instance bound to the class variable C + */ + public TestDataProvider(Class c, String name) { + if ( ! 
tests.containsKey(c) ) + tests.put(c, new ArrayList<>()); + tests.get(c).add(this); + this.name = name; + } + + public TestDataProvider(Class c) { + this(c, ""); + } + + public void setName(final String name) { + this.name = name; + } + + /** + * Return all of the data providers in the form expected by TestNG of type class C + * @param c + * @return + */ + public static Object[][] getTests(Class c) { + List params2 = new ArrayList(); + for ( Object x : tests.get(c) ) params2.add(new Object[]{x}); + return params2.toArray(new Object[][]{}); + } + + @Override + public String toString() { + return "TestDataProvider("+name+")"; + } + } + + /** + * test if the file exists + * + * @param file name as a string + * @return true if it exists + */ + public static boolean fileExist(String file) { + File temp = new File(file); + return temp.exists(); + } + + /** + * this appender looks for a specific message in the log4j stream. + * It can be used to verify that a specific message was generated to the logging system. + */ + public static class ValidationAppender extends AppenderSkeleton { + + private boolean foundString = false; + private String targetString = ""; + + public ValidationAppender(String target) { + targetString = target; + } + + @Override + protected void append(LoggingEvent loggingEvent) { + if (loggingEvent.getMessage().equals(targetString)) + foundString = true; + } + + public void close() { + // do nothing + } + + public boolean requiresLayout() { + return false; + } + + public boolean foundString() { + return foundString; + } + } + + /** + * Creates a temp file that will be deleted on exit after tests are complete. + * @param name Prefix of the file. + * @param extension Extension to concat to the end of the file. + * @return A file in the temporary directory starting with name, ending with extension, which will be deleted after the program exits. 
+ */ + public static File createTempFile(final String name, final String extension) { + try { + final File file = File.createTempFile(name, extension); + file.deleteOnExit(); + + // Mark corresponding indices for deletion on exit as well just in case an index is created for the temp file: + new File(file.getAbsolutePath() + Tribble.STANDARD_INDEX_EXTENSION).deleteOnExit(); + new File(file.getAbsolutePath() + TabixUtils.STANDARD_INDEX_EXTENSION).deleteOnExit(); + new File(file.getAbsolutePath() + ".bai").deleteOnExit(); + new File(file.getAbsolutePath().replaceAll(extension + "$", ".bai")).deleteOnExit(); + + return file; + } catch (IOException ex) { + throw new ReviewedGATKException("Cannot create temp file: " + ex.getMessage(), ex); + } + } + + /** + * Creates a temp list file that will be deleted on exit after tests are complete. + * @param tempFilePrefix Prefix of the file. + * @param lines lines to write to the file. + * @return A list file in the temporary directory starting with tempFilePrefix, which will be deleted after the program exits. + */ + public static File createTempListFile(final String tempFilePrefix, final String... lines) { + try { + final File tempListFile = createTempFile(tempFilePrefix, ".list"); + + final PrintWriter out = new PrintWriter(tempListFile); + for (final String line : lines) { + out.println(line); + } + out.close(); + + return tempListFile; + } catch (IOException ex) { + throw new ReviewedGATKException("Cannot create temp file: " + ex.getMessage(), ex); + } + } + + /** + * Creates a temp file that will be deleted on exit after tests are complete. + * @param name Name of the file. + * @return A file in the network temporary directory with name, which will be deleted after the program exits. + * @throws SkipException when the network is not available. 
+ */ + public static File tryCreateNetworkTempFile(String name) { + if (!networkTempDirRootExists) + throw new SkipException("Network temporary directory does not exist: " + networkTempDirRoot); + File file = new File(networkTempDirFile, name); + file.deleteOnExit(); + return file; + } + + /** + * Log this message so that it shows up inline during output as well as in html reports + * + * @param message + */ + public static void log(final String message) { + Reporter.log(message, true); + } + + private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1; + + public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) { + Assert.assertTrue(actual instanceof Double, "Not a double"); + assertEqualsDoubleSmart((double)(Double)actual, (double)expected); + } + + public static final void assertEqualsDoubleSmart(final Object actual, final Double expected, final double tolerance) { + Assert.assertTrue(actual instanceof Double, "Not a double"); + assertEqualsDoubleSmart((double)(Double)actual, (double)expected, tolerance); + } + + public static final void assertEqualsDoubleSmart(final double actual, final double expected) { + assertEqualsDoubleSmart(actual, expected, DEFAULT_FLOAT_TOLERANCE); + } + + public static final void assertEqualsSet(final Set actual, final Set expected, final String info) { + final Set actualSet = new HashSet(actual); + final Set expectedSet = new HashSet(expected); + Assert.assertTrue(actualSet.equals(expectedSet), info); // note this is necessary due to testng bug for set comps + } + + public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { + assertEqualsDoubleSmart(actual, expected, tolerance, null); + } + + public static void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance, final String message) { + if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately + Assert.assertTrue(Double.isNaN(actual), "expected 
is nan, actual is not"); + else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately + Assert.assertTrue(Double.isInfinite(actual), "expected is infinite, actual is not"); + else { + final double delta = Math.abs(actual - expected); + final double ratio = Math.abs(actual / expected - 1.0); + Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual + + " not within tolerance " + tolerance + + (message == null ? "" : "message: " + message)); + } + } + + public static void assertVariantContextsAreEqual( final VariantContext actual, final VariantContext expected ) { + Assert.assertNotNull(actual, "VariantContext expected not null"); + Assert.assertEquals(actual.getChr(), expected.getChr(), "chr"); + Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); + Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); + Assert.assertEquals(actual.getID(), expected.getID(), "id"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); + + assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); + Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "isFiltered"); + assertEqualsSet(actual.getFilters(), expected.getFilters(), "filters"); + assertEqualsDoubleSmart(actual.getPhredScaledQual(), expected.getPhredScaledQual()); + + Assert.assertEquals(actual.hasGenotypes(), expected.hasGenotypes(), "hasGenotypes"); + if ( expected.hasGenotypes() ) { + assertEqualsSet(actual.getSampleNames(), expected.getSampleNames(), "sample names set"); + Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "sample names"); + final Set samples = expected.getSampleNames(); + for ( final String sample : samples ) { + assertGenotypesAreEqual(actual.getGenotype(sample), 
expected.getGenotype(sample)); + } + } + } + + public static void assertVariantContextStreamsAreEqual(final Iterable actual, final Iterable expected) { + final Iterator actualIT = actual.iterator(); + final Iterator expectedIT = expected.iterator(); + + while ( expectedIT.hasNext() ) { + final VariantContext expectedVC = expectedIT.next(); + if ( expectedVC == null ) + continue; + + VariantContext actualVC; + do { + Assert.assertTrue(actualIT.hasNext(), "Too few records found in actual"); + actualVC = actualIT.next(); + } while ( actualIT.hasNext() && actualVC == null ); + + if ( actualVC == null ) + Assert.fail("Too few records in actual"); + + assertVariantContextsAreEqual(actualVC, expectedVC); + } + Assert.assertTrue(! actualIT.hasNext(), "Too many records found in actual"); + } + + + public static void assertGenotypesAreEqual(final Genotype actual, final Genotype expected) { + Assert.assertEquals(actual.getSampleName(), expected.getSampleName(), "Genotype names"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "Genotype alleles"); + Assert.assertEquals(actual.getGenotypeString(), expected.getGenotypeString(), "Genotype string"); + Assert.assertEquals(actual.getType(), expected.getType(), "Genotype type"); + + // filters are the same + Assert.assertEquals(actual.getFilters(), expected.getFilters(), "Genotype fields"); + Assert.assertEquals(actual.isFiltered(), expected.isFiltered(), "Genotype isFiltered"); + + // inline attributes + Assert.assertEquals(actual.getDP(), expected.getDP(), "Genotype dp"); + Assert.assertTrue(Arrays.equals(actual.getAD(), expected.getAD())); + Assert.assertEquals(actual.getGQ(), expected.getGQ(), "Genotype gq"); + Assert.assertEquals(actual.hasPL(), expected.hasPL(), "Genotype hasPL"); + Assert.assertEquals(actual.hasAD(), expected.hasAD(), "Genotype hasAD"); + Assert.assertEquals(actual.hasGQ(), expected.hasGQ(), "Genotype hasGQ"); + Assert.assertEquals(actual.hasDP(), expected.hasDP(), "Genotype hasDP"); + + 
Assert.assertEquals(actual.hasLikelihoods(), expected.hasLikelihoods(), "Genotype haslikelihoods"); + Assert.assertEquals(actual.getLikelihoodsString(), expected.getLikelihoodsString(), "Genotype getlikelihoodsString"); + Assert.assertEquals(actual.getLikelihoods(), expected.getLikelihoods(), "Genotype getLikelihoods"); + Assert.assertTrue(Arrays.equals(actual.getPL(), expected.getPL())); + + Assert.assertEquals(actual.getPhredScaledQual(), expected.getPhredScaledQual(), "Genotype phredScaledQual"); + assertAttributesEquals(actual.getExtendedAttributes(), expected.getExtendedAttributes()); + Assert.assertEquals(actual.isPhased(), expected.isPhased(), "Genotype isPhased"); + Assert.assertEquals(actual.getPloidy(), expected.getPloidy(), "Genotype getPloidy"); + } + + public static void assertVCFHeadersAreEqual(final VCFHeader actual, final VCFHeader expected) { + Assert.assertEquals(actual.getMetaDataInSortedOrder().size(), expected.getMetaDataInSortedOrder().size(), "No VCF header lines"); + + // for some reason set.equals() is returning false but all paired elements are .equals(). Perhaps compare to is busted? 
+ //Assert.assertEquals(actual.getMetaDataInInputOrder(), expected.getMetaDataInInputOrder()); + final List actualLines = new ArrayList(actual.getMetaDataInSortedOrder()); + final List expectedLines = new ArrayList(expected.getMetaDataInSortedOrder()); + for ( int i = 0; i < actualLines.size(); i++ ) { + Assert.assertEquals(actualLines.get(i), expectedLines.get(i), "VCF header lines"); + } + } + + public static void assertVCFandBCFFilesAreTheSame(final File vcfFile, final File bcfFile) throws IOException { + final Pair> vcfData = VCIterable.readAllVCs(vcfFile, new VCFCodec()); + final Pair> bcfData = VCIterable.readAllVCs(bcfFile, new BCF2Codec()); + assertVCFHeadersAreEqual(bcfData.getFirst(), vcfData.getFirst()); + assertVariantContextStreamsAreEqual(bcfData.getSecond(), vcfData.getSecond()); + } + + private static void assertAttributeEquals(final String key, final Object actual, final Object expected) { + if ( expected instanceof Double ) { + // must be very tolerant because doubles are being rounded to 2 sig figs + assertEqualsDoubleSmart(actual, (Double) expected, 1e-2); + } else + Assert.assertEquals(actual, expected, "Attribute " + key); + } + + private static void assertAttributesEquals(final Map actual, Map expected) { + final Set expectedKeys = new HashSet(expected.keySet()); + + for ( final Map.Entry act : actual.entrySet() ) { + final Object actualValue = act.getValue(); + if ( expected.containsKey(act.getKey()) && expected.get(act.getKey()) != null ) { + final Object expectedValue = expected.get(act.getKey()); + if ( expectedValue instanceof List ) { + final List expectedList = (List)expectedValue; + Assert.assertTrue(actualValue instanceof List, act.getKey() + " should be a list but isn't"); + final List actualList = (List)actualValue; + Assert.assertEquals(actualList.size(), expectedList.size(), act.getKey() + " size"); + for ( int i = 0; i < expectedList.size(); i++ ) + assertAttributeEquals(act.getKey(), actualList.get(i), expectedList.get(i)); + } 
else + assertAttributeEquals(act.getKey(), actualValue, expectedValue); + } else { + // it's ok to have a binding in x -> null that's absent in y + Assert.assertNull(actualValue, act.getKey() + " present in one but not in the other"); + } + expectedKeys.remove(act.getKey()); + } + + // now expectedKeys contains only the keys found in expected but not in actual, + // and they must all be null + for ( final String missingExpected : expectedKeys ) { + final Object value = expected.get(missingExpected); + Assert.assertTrue(isMissing(value), "Attribute " + missingExpected + " missing in one but not in other" ); + } + } + + private static final boolean isMissing(final Object value) { + if ( value == null ) return true; + else if ( value.equals(VCFConstants.MISSING_VALUE_v4) ) return true; + else if ( value instanceof List ) { + // handles the case where all elements are null or the list is empty + for ( final Object elt : (List)value) + if ( elt != null ) + return false; + return true; + } else + return false; + } + + /** + * Checks whether two double array contain the same values or not. + * @param actual actual produced array. + * @param expected expected array. + * @param tolerance maximum difference between double value to be consider equivalent. 
+ */ + protected static void assertEqualsDoubleArray(final double[] actual, final double[] expected, final double tolerance) { + if (expected == null) + Assert.assertNull(actual); + else { + Assert.assertNotNull(actual); + Assert.assertEquals(actual.length,expected.length,"array length"); + } + for (int i = 0; i < actual.length; i++) + Assert.assertEquals(actual[i],expected[i],tolerance,"array position " + i); + } +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseUtilsUnitTest.java new file mode 100644 index 000000000..83dfc8cd2 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BaseUtilsUnitTest.java @@ -0,0 +1,177 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils; + +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Random; + + +public class BaseUtilsUnitTest extends BaseTest { + @BeforeClass + public void init() { } + + @Test + public void testMostFrequentBaseFraction() { + logger.warn("Executing testMostFrequentBaseFraction"); + + compareFrequentBaseFractionToExpected("AAAAA", 1.0); + compareFrequentBaseFractionToExpected("ACCG", 0.5); + compareFrequentBaseFractionToExpected("ACCCCTTTTG", 4.0/10.0); + } + + private void compareFrequentBaseFractionToExpected(String sequence, double expected) { + double fraction = BaseUtils.mostFrequentBaseFraction(sequence.getBytes()); + Assert.assertTrue(MathUtils.compareDoubles(fraction, expected) == 0); + } + + @Test + public void testConvertIUPACtoN() { + + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'A'}, false, false), new byte[]{'A', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'W', 'A', 'A'}, false, false), new byte[]{'N', 'A', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'M', 'A'}, false, false), new byte[]{'A', 'N', 'A'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'A', 'A', 'K'}, false, false), new byte[]{'A', 'A', 'N'}); + checkBytesAreEqual(BaseUtils.convertIUPACtoN(new byte[]{'M', 'M', 'M'}, false, false), new byte[]{'N', 'N', 'N'}); + } + + private void checkBytesAreEqual(final byte[] b1, final byte[] b2) { + for ( int i = 0; i < b1.length; i++ ) + Assert.assertEquals(b1[i], b2[i]); + } + + @Test + public void testConvertBasesToIUPAC() { + + for ( final BaseUtils.Base b : BaseUtils.Base.values() ) { + if ( BaseUtils.isRegularBase(b.base) ) + Assert.assertEquals(BaseUtils.basesToIUPAC(b.base, b.base), b.base, "testing same base"); + } + + 
Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'X'), 'N', "testing non-standard base"); + Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'A'), 'N', "testing non-standard base"); + Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'X', (byte)'X'), 'N', "testing non-standard base"); + + Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'A', (byte)'T'), 'W', "testing A/T=W"); + Assert.assertEquals(BaseUtils.basesToIUPAC((byte)'T', (byte)'A'), 'W', "testing T/A=W"); + Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'G', (byte) 'T'), 'K', "testing G/T=K"); + Assert.assertEquals(BaseUtils.basesToIUPAC((byte) 'T', (byte) 'G'), 'K', "testing T/G=K"); + } + + @Test + public void testTransitionTransversion() { + logger.warn("Executing testTransitionTransversion"); + + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'T' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'C' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'G' ) == BaseUtils.BaseSubstitutionType.TRANSITION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'C', (byte)'A' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'C', (byte)'T' ) == BaseUtils.BaseSubstitutionType.TRANSITION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'C', (byte)'G' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'T', (byte)'A' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'T', (byte)'C' ) == BaseUtils.BaseSubstitutionType.TRANSITION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'T', (byte)'G' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'G', (byte)'A' ) == 
BaseUtils.BaseSubstitutionType.TRANSITION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'G', (byte)'T' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'G', (byte)'C' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'a', (byte)'T' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'a', (byte)'C' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'T' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'C' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'t' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'c' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'a', (byte)'t' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'a', (byte)'c' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); + } + + @Test + public void testReverseComplementString() { + logger.warn("Executing testReverseComplementString"); + + compareRCStringToExpected("ACGGT", "ACCGT"); + compareRCStringToExpected("TCGTATATCTCGCTATATATATATAGCTCTAGTATA", "TATACTAGAGCTATATATATATAGCGAGATATACGA"); + compareRCStringToExpected("AAAN", "NTTT"); + } + + private void compareRCStringToExpected(String fw, String rcExp) { + String rcObs = BaseUtils.simpleReverseComplement(fw); + + Assert.assertTrue(rcObs.equals(rcExp)); + } + + @Test(dataProvider="baseComparatorData") + public void testBaseComparator(final Collection basesToSort) { + final ArrayList sorted = new ArrayList<>(basesToSort); + Collections.sort(sorted, 
BaseUtils.BASES_COMPARATOR); + for (int i = 0; i < sorted.size(); i++) { + Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(i)),0); + final String iString = new String(sorted.get(i)); + for (int j = i; j < sorted.size(); j++) { + final String jString = new String(sorted.get(j)); + if (iString.compareTo(jString) == 0) + Assert.assertEquals(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)),0); + else + Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) * iString.compareTo(jString) > 0); + Assert.assertTrue(BaseUtils.BASES_COMPARATOR.compare(sorted.get(i),sorted.get(j)) <= 0); + } + } + } + + @DataProvider(name="baseComparatorData") + public Object[][] baseComparatorData() { + final int testCount = 10; + final int testSizeAverage = 10; + final int testSizeDeviation = 10; + final int haplotypeSizeAverage = 100; + final int haplotypeSizeDeviation = 100; + + final Object[][] result = new Object[testCount][]; + + Utils.resetRandomGenerator(); + final Random rnd = Utils.getRandomGenerator(); + + for (int i = 0; i < testCount; i++) { + final int size = (int) Math.max(0,rnd.nextDouble() * testSizeDeviation + testSizeAverage); + final ArrayList bases = new ArrayList<>(size); + for (int j = 0; j < size; j++) { + final int jSize = (int) Math.max(0,rnd.nextDouble() * haplotypeSizeDeviation + haplotypeSizeAverage); + final byte[] b = new byte[jSize]; + for (int k = 0; k < jSize; k++) + b[k] = BaseUtils.baseIndexToSimpleBase(rnd.nextInt(4)); + bases.add(b); + } + result[i] = new Object[] { bases }; + } + return result; + } +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BitSetUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BitSetUtilsUnitTest.java new file mode 100644 index 000000000..5a0bc0bbf --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/BitSetUtilsUnitTest.java @@ -0,0 +1,84 @@ +/* +* Copyright (c) 2012 
The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils; + +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.Random; + +/** + * @author Mauricio Carneiro + * @since 3/5/12 + */ + +public class BitSetUtilsUnitTest { + private static int RANDOM_NUMBERS_TO_TRY = 87380; + private static Random random; + + @BeforeClass + public void init() { + random = Utils.getRandomGenerator(); + } + + @Test(enabled = true) + public void testLongBitSet() { + long[] numbers = {0L, 1L, 428L, 65536L, 239847L, 4611686018427387903L, Long.MAX_VALUE, Long.MIN_VALUE, -1L, -2L, -7L, -128L, -65536L, -100000L}; + for (long n : numbers) + Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n); + + for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) { + long n = random.nextLong(); + Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(n)), n); // Because class Random uses a seed with only 48 bits, this algorithm will not return all possible long values. + } + } + + @Test(enabled = true) + public void testShortBitSet() { + short[] numbers = {0, 1, 428, 25934, 23847, 16168, Short.MAX_VALUE, Short.MIN_VALUE, -1, -2, -7, -128, -12312, -31432}; + for (long n : numbers) + Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n); + + for (int i = 0; i < RANDOM_NUMBERS_TO_TRY; i++) { + short n = (short) random.nextInt(); + Assert.assertEquals(BitSetUtils.shortFrom(BitSetUtils.bitSetFrom(n)), n); + } + } + + @Test(enabled = false) + public void testDNAAndBitSetConversion() { + String[] dna = {"AGGTGTTGT", "CCCCCCCCCCCCCC", "GGGGGGGGGGGGGG", "TTTTTTTTTTTTTT", "GTAGACCGATCTCAGCTAGT", "AACGTCAATGCAGTCAAGTCAGACGTGGGTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"}; + + // Test all contexts of size 1-8. 
+ //for (long n = 0; n < RANDOM_NUMBERS_TO_TRY; n++) + // Assert.assertEquals(BitSetUtils.longFrom(BitSetUtils.bitSetFrom(ContextCovariate.contextFromKey(BitSetUtils.bitSetFrom(n)))), n); + + // Test the special cases listed in the dna array + //for (String d : dna) + // Assert.assertEquals(BitSetUtils.dnaFrom(BitSetUtils.bitSetFrom(d)), d); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/ExampleToCopyUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/GATKTextReporter.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GATKTextReporter.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/GATKTextReporter.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GATKTextReporter.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserBenchmark.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java rename to 
public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocParserUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/GenomeLocSortedSetUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocSortedSetUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/GenomeLocSortedSetUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocSortedSetUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/GenomeLocUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/GenomeLocUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/GenomeLocUnitTest.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MD5DB.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MD5DB.java new file mode 100644 index 000000000..b753bc2fa --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MD5DB.java @@ -0,0 +1,312 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils; + +import org.apache.commons.io.FileUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.diffengine.DiffEngine; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; + +import java.io.*; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: 7/18/11 + * Time: 9:10 AM + * + * Utilities for manipulating the MD5 database of previous results + */ +public class MD5DB { + public static final Logger logger = Logger.getLogger(MD5DB.class); + + /** + * Subdirectory under the ant build directory where we store integration test md5 results + */ + private static final int MAX_RECORDS_TO_READ = 1000000; + private static final int MAX_RAW_DIFFS_TO_SUMMARIZE = -1; + public static final String LOCAL_MD5_DB_DIR = "integrationtests"; + public static final String GLOBAL_MD5_DB_DIR = "/humgen/gsa-hpprojects/GATK/data/integrationtests"; + + // tracking and emitting a data file of origina and new md5s + private final File MD5MismatchesFile; + private final PrintStream md5MismatchStream; + + public MD5DB() { + this(new File(MD5DB.LOCAL_MD5_DB_DIR + "/md5mismatches.txt")); + } + + public MD5DB(final File MD5MismatchesFile) { + this.MD5MismatchesFile = MD5MismatchesFile; + + ensureMd5DbDirectory(); + + logger.debug("Creating md5 mismatch db at " + MD5MismatchesFile); + try { + md5MismatchStream = new PrintStream(new FileOutputStream(MD5MismatchesFile)); + 
md5MismatchStream.printf("%s\t%s\t%s%n", "expected", "observed", "test"); + } catch ( FileNotFoundException e ) { + throw new ReviewedGATKException("Failed to open md5 mismatch file", e); + } + + } + + public void close() { + if ( md5MismatchStream != null ) { + logger.debug("Closeing md5 mismatch db at " + MD5MismatchesFile); + md5MismatchStream.close(); + } + } + + // ---------------------------------------------------------------------- + // + // MD5 DB stuff + // + // ---------------------------------------------------------------------- + + /** + * Create the MD5 file directories if necessary + */ + private void ensureMd5DbDirectory() { + File dir = new File(LOCAL_MD5_DB_DIR); + if ( ! dir.exists() ) { + System.out.printf("##### Creating MD5 db %s%n", LOCAL_MD5_DB_DIR); + if ( ! dir.mkdir() ) { + // Need to check AGAIN whether the dir exists, because we might be doing multi-process parallelism + // within the same working directory, and another GATK instance may have come along and created the + // directory between the calls to exists() and mkdir() above. + if ( ! dir.exists() ) { + throw new ReviewedGATKException("Infrastructure failure: failed to create md5 directory " + LOCAL_MD5_DB_DIR); + } + } + } + } + + /** + * Returns the path to an already existing file with the md5 contents, or valueIfNotFound + * if no such file exists in the db. + * + * @param md5 + * @param valueIfNotFound + * @return + */ + public String getMD5FilePath(final String md5, final String valueIfNotFound) { + // we prefer the global db to the local DB, so match it first + for ( String dir : Arrays.asList(GLOBAL_MD5_DB_DIR, LOCAL_MD5_DB_DIR)) { + File f = getFileForMD5(md5, dir); + if ( f.exists() && f.canRead() ) + return f.getAbsolutePath(); + } + + return valueIfNotFound; + } + + /** + * Utility function that given a file's md5 value and the path to the md5 db, + * returns the canonical name of the file. 
For example, if md5 is XXX and db is YYY, + * this will return YYY/XXX.integrationtest + * + * @param md5 + * @param dbPath + * @return + */ + private File getFileForMD5(final String md5, final String dbPath) { + final String basename = String.format("%s.integrationtest", md5); + return new File(dbPath + "/" + basename); + } + + /** + * Copies the results file with md5 value to its canonical file name and db places + * + * @param md5 + * @param resultsFile + */ + private void updateMD5Db(final String md5, final File resultsFile) { + copyFileToDB(getFileForMD5(md5, LOCAL_MD5_DB_DIR), resultsFile); + copyFileToDB(getFileForMD5(md5, GLOBAL_MD5_DB_DIR), resultsFile); + } + + /** + * Low-level utility routine that copies resultsFile to dbFile + * @param dbFile + * @param resultsFile + */ + private void copyFileToDB(File dbFile, final File resultsFile) { + if ( ! dbFile.exists() ) { + // the file isn't already in the db, copy it over + System.out.printf("##### Updating MD5 file: %s%n", dbFile.getPath()); + try { + FileUtils.copyFile(resultsFile, dbFile); + } catch ( IOException e ) { + System.out.printf("##### Skipping update, cannot write file %s%n", dbFile); + } + } else { + //System.out.printf("##### MD5 file is up to date: %s%n", dbFile.getPath()); + } + } + + /** + * Returns the byte[] of the entire contents of file, for md5 calculations + * @param file + * @return + * @throws IOException + */ + private static byte[] getBytesFromFile(File file) throws IOException { + InputStream is = new FileInputStream(file); + + // Get the size of the file + long length = file.length(); + + if (length > Integer.MAX_VALUE) { + // File is too large + } + + // Create the byte array to hold the data + byte[] bytes = new byte[(int) length]; + + // Read in the bytes + int offset = 0; + int numRead = 0; + while (offset < bytes.length + && (numRead = is.read(bytes, offset, bytes.length - offset)) >= 0) { + offset += numRead; + } + + // Ensure all the bytes have been read in + if (offset < 
bytes.length) { + throw new IOException("Could not completely read file " + file.getName()); + } + + // Close the input stream and return bytes + is.close(); + return bytes; + } + + public static class MD5Match { + public final String actualMD5, expectedMD5; + public final String failMessage; + public final String diffEngineOutput; + public final boolean failed; + + public MD5Match(final String actualMD5, final String expectedMD5, final String failMessage, final String diffEngineOutput, final boolean failed) { + this.actualMD5 = actualMD5; + this.expectedMD5 = expectedMD5; + this.failMessage = failMessage; + this.diffEngineOutput = diffEngineOutput; + this.failed = failed; + } + } + + /** + * Tests a file MD5 against an expected value, returning an MD5Match object containing a description of the + * match or mismatch. In case of a mismatch, outputs a description of the mismatch to various log files/streams. + * + * NOTE: This function WILL NOT throw an exception if the MD5s are different. + * + * @param testName Name of the test. + * @param testClassName Name of the class that contains the test. + * @param resultsFile File to MD5. + * @param expectedMD5 Expected MD5 value. + * @param parameterize If true or if expectedMD5 is an empty string, will print out the calculated MD5 instead of error text. + * @return an MD5Match object containing a description of the match/mismatch. 
Will have its "failed" field set + * to true if there was a mismatch (unless we're using the "parameterize" argument) + */ + public MD5Match testFileMD5(final String testName, final String testClassName, final File resultsFile, final String expectedMD5, final boolean parameterize) { + final String actualMD5 = calculateFileMD5(resultsFile); + String diffEngineOutput = ""; + String failMessage = ""; + boolean failed = false; + + // copy md5 to integrationtests + updateMD5Db(actualMD5, resultsFile); + + if (parameterize || expectedMD5.equals("")) { + BaseTest.log(String.format("PARAMETERIZATION: file %s has md5 = %s", resultsFile, actualMD5)); + } else if ( ! expectedMD5.equals(actualMD5) ) { + failed = true; + failMessage = String.format("%s:%s has mismatching MD5s: expected=%s observed=%s", testClassName, testName, expectedMD5, actualMD5); + diffEngineOutput = logMD5MismatchAndGetDiffEngineOutput(testName, testClassName, expectedMD5, actualMD5); + } + + return new MD5Match(actualMD5, expectedMD5, failMessage, diffEngineOutput, failed); + } + + /** + * Calculates the MD5 for the specified file and returns it as a String + * + * @param file file whose MD5 to calculate + * @return file's MD5 in String form + * @throws RuntimeException if the file could not be read + */ + public String calculateFileMD5( final File file ) { + try { + return Utils.calcMD5(getBytesFromFile(file)); + } + catch ( Exception e ) { + throw new RuntimeException("Failed to read bytes from file: " + file + " for MD5 calculation", e); + } + } + + /** + * Logs a description (including diff engine output) of the MD5 mismatch between the expectedMD5 + * and actualMD5 to a combination of BaseTest.log(), the md5MismatchStream, and stdout, then returns + * the diff engine output. 
+ * + * @param testName name of the test that generated the mismatch + * @param testClassName name of the class containing the test that generated the mismatch + * @param expectedMD5 the MD5 we were expecting from this test + * @param actualMD5 the MD5 we actually calculated from the test output + * @return the diff engine output produced while logging the description of the mismatch + */ + private String logMD5MismatchAndGetDiffEngineOutput(final String testName, final String testClassName, final String expectedMD5, final String actualMD5) { + System.out.printf("##### Test %s:%s is going to fail #####%n", testClassName, testName); + String pathToExpectedMD5File = getMD5FilePath(expectedMD5, "[No DB file found]"); + String pathToFileMD5File = getMD5FilePath(actualMD5, "[No DB file found]"); + BaseTest.log(String.format("expected %s", expectedMD5)); + BaseTest.log(String.format("calculated %s", actualMD5)); + BaseTest.log(String.format("diff %s %s", pathToExpectedMD5File, pathToFileMD5File)); + + md5MismatchStream.printf("%s\t%s\t%s%n", expectedMD5, actualMD5, testName); + md5MismatchStream.flush(); + + // inline differences + String diffEngineOutput = ""; + final ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final PrintStream ps = new PrintStream(baos); + DiffEngine.SummaryReportParams params = new DiffEngine.SummaryReportParams(ps, 20, 10, 0, MAX_RAW_DIFFS_TO_SUMMARIZE, false); + boolean success = DiffEngine.simpleDiffFiles(new File(pathToExpectedMD5File), new File(pathToFileMD5File), MAX_RECORDS_TO_READ, params); + if ( success ) { + diffEngineOutput = baos.toString(); + BaseTest.log(diffEngineOutput); + System.out.printf("Note that the above list is not comprehensive. At most 20 lines of output, and 10 specific differences will be listed. 
Please use -T DiffObjects -R " + BaseTest.publicTestDir + "exampleFASTA.fasta -m %s -t %s to explore the differences more freely%n", + pathToExpectedMD5File, pathToFileMD5File); + } + ps.close(); + + return diffEngineOutput; + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MD5Mismatch.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MD5Mismatch.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MD5Mismatch.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MD5Mismatch.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MRUCachingSAMSequencingDictionaryUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MRUCachingSAMSequencingDictionaryUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MRUCachingSAMSequencingDictionaryUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MRUCachingSAMSequencingDictionaryUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MWUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MathUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MathUtilsUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MathUtilsUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MathUtilsUnitTest.java diff --git 
a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MedianUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MedianUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/MedianUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/MedianUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/NGSPlatformUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/PathUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/PathUtilsUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/PathUtilsUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/PathUtilsUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/QualityUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/QualityUtilsUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/QualityUtilsUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/QualityUtilsUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/R/RScriptExecutorUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/R/RScriptExecutorUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/R/RScriptExecutorUnitTest.java rename 
to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/R/RScriptExecutorUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/R/RScriptLibraryUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/R/RScriptLibraryUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/R/RScriptLibraryUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/R/RScriptLibraryUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/R/RUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/R/RUtilsUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/R/RUtilsUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/R/RUtilsUnitTest.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtilsUnitTest.java new file mode 100644 index 000000000..c53c01bd5 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/SequenceDictionaryUtilsUnitTest.java @@ -0,0 +1,239 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import static org.broadinstitute.gatk.utils.SequenceDictionaryUtils.*; +import static org.broadinstitute.gatk.utils.SequenceDictionaryUtils.SequenceDictionaryCompatibility.*; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class SequenceDictionaryUtilsUnitTest extends BaseTest { + + private static Logger logger = Logger.getLogger(SequenceDictionaryUtilsUnitTest.class); + + + @DataProvider( name = "SequenceDictionaryDataProvider" ) + public Object[][] generateSequenceDictionaryTestData() { + final SAMSequenceRecord CHRM_HG19 = new SAMSequenceRecord("chrM", 16571); + final SAMSequenceRecord CHR_NONSTANDARD1 = new SAMSequenceRecord("NonStandard1", 8675309); + final SAMSequenceRecord CHR_NONSTANDARD2 = new SAMSequenceRecord("NonStandard2", 8675308); + + final Class NO_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + final Class UNEQUAL_COMMON_CONTIGS_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + final Class NON_CANONICAL_HUMAN_ORDER_EXCEPTION = UserException.LexicographicallySortedSequenceDictionary.class; + final Class OUT_OF_ORDER_EXCEPTION = 
UserException.IncompatibleSequenceDictionaries.class; + final Class DIFFERENT_INDICES_EXCEPTION = UserException.IncompatibleSequenceDictionaries.class; + + final List hg19Sequences = Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR10_HG19); + final GenomeLocParser hg19GenomeLocParser = new GenomeLocParser(new SAMSequenceDictionary(hg19Sequences)); + final List hg19AllContigsIntervals = Arrays.asList(hg19GenomeLocParser.createGenomeLoc("chrM", 0, 1), + hg19GenomeLocParser.createGenomeLoc("chr1", 0, 1), + hg19GenomeLocParser.createGenomeLoc("chr2", 0, 1), + hg19GenomeLocParser.createGenomeLoc("chr10", 0, 1)); + final List hg19PartialContigsIntervals = Arrays.asList(hg19GenomeLocParser.createGenomeLoc("chrM", 0, 1), + hg19GenomeLocParser.createGenomeLoc("chr1", 0, 1)); + final GenomeLocSortedSet hg19AllContigsIntervalSet = new GenomeLocSortedSet(hg19GenomeLocParser, hg19AllContigsIntervals); + final GenomeLocSortedSet hg19PartialContigsIntervalSet = new GenomeLocSortedSet(hg19GenomeLocParser, hg19PartialContigsIntervals); + + return new Object[][] { + // Identical dictionaries: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19), null, IDENTICAL, null, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, IDENTICAL, null, false, null }, + { Arrays.asList(CHR1_B37), Arrays.asList(CHR1_B37), null, IDENTICAL, null, false, null }, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), null, IDENTICAL, null, false, null }, + + // Dictionaries with a common subset: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), null, COMMON_SUBSET, null, false, null }, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null }, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19), null, COMMON_SUBSET, null, false, null }, + { 
Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHRM_HG19), null, COMMON_SUBSET, null, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), null, COMMON_SUBSET, null, false, null }, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null }, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG19, CHR2_HG19, CHR10_HG19, CHRM_HG19), null, COMMON_SUBSET, null, false, null }, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD1), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37, CHR_NONSTANDARD2), null, COMMON_SUBSET, null, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, COMMON_SUBSET, null, false, null }, + + // Dictionaries with no common contigs: + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), null, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, + + // Dictionaries with unequal common contigs: + { 
Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, + { Arrays.asList(CHR1_B36), Arrays.asList(CHR1_B37), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, + { Arrays.asList(CHR1_B37, CHR2_B37, CHR10_B37), Arrays.asList(CHR1_B36, CHR2_B36, CHR10_B36), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19, CHR_NONSTANDARD1), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18, CHR_NONSTANDARD2), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, + { Arrays.asList(CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR_NONSTANDARD2, CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR1_HG18, CHR2_HG18, CHR10_HG18), null, UNEQUAL_COMMON_CONTIGS, UNEQUAL_COMMON_CONTIGS_EXCEPTION, false, null }, + + // One or both dictionaries in non-canonical human order: + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, + { Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), Arrays.asList(CHR1_HG18, CHR10_HG18, CHR2_HG18), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), null, NON_CANONICAL_HUMAN_ORDER, 
NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, + { Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), Arrays.asList(CHR1_B37, CHR10_B37, CHR2_B37), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, + { Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), Arrays.asList(CHR1_B36, CHR10_B36, CHR2_B36), null, NON_CANONICAL_HUMAN_ORDER, NON_CANONICAL_HUMAN_ORDER_EXCEPTION, false, null }, + + // Dictionaries with a common subset, but different relative ordering within that subset: + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null }, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19, CHRM_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null }, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR2_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null }, + { Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHRM_HG19, CHR1_HG19), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null }, + { Arrays.asList(CHR1_B37, CHR2_B37), Arrays.asList(CHR2_B37, CHR1_B37), null, OUT_OF_ORDER, OUT_OF_ORDER_EXCEPTION, false, null }, + + + // Dictionaries with a common subset in the same relative order, but with different indices. 
+ // This will only throw an exception during validation if isReadsToReferenceComparison is true, + // and there are intervals overlapping the misindexed contigs: + + // These have isReadsToReferenceComparison == true and overlapping intervals, so we expect an exception: + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHRM_HG19, CHR_NONSTANDARD1, CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, + { Arrays.asList(CHR1_HG19, CHR2_HG19, CHR_NONSTANDARD1, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19, CHR_NONSTANDARD2 ), null, DIFFERENT_INDICES, DIFFERENT_INDICES_EXCEPTION, true, hg19AllContigsIntervalSet }, + + // These have isReadsToReferenceComparison == true but no overlapping intervals, so we don't expect an exception: + { Arrays.asList(CHR2_HG19, CHR10_HG19), Arrays.asList(CHR10_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet }, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, 
CHR2_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet }, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHR10_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHR10_HG19), null, DIFFERENT_INDICES, null, true, hg19PartialContigsIntervalSet }, + + // These have isReadsToReferenceComparison == false, so we don't expect an exception: + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), null, DIFFERENT_INDICES, null, false, hg19AllContigsIntervalSet }, + { Arrays.asList(CHR1_HG19, CHR_NONSTANDARD1, CHR2_HG19, CHRM_HG19), Arrays.asList(CHR1_HG19, CHR2_HG19, CHRM_HG19), null, DIFFERENT_INDICES, null, false, hg19AllContigsIntervalSet }, + + + // Tests for validation exclusions. Note that errors resulting from NO_COMMON_CONTIGs cannot be suppressed + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR2_HG19), ValidationExclusion.TYPE.ALL, NO_COMMON_CONTIGS, NO_COMMON_CONTIGS_EXCEPTION, false, null }, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, UNEQUAL_COMMON_CONTIGS, null, false, null }, + { Arrays.asList(CHR1_HG19), Arrays.asList(CHR1_HG18), ValidationExclusion.TYPE.ALL, UNEQUAL_COMMON_CONTIGS, null, false, null }, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, NON_CANONICAL_HUMAN_ORDER, null, false, null }, + { Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), Arrays.asList(CHR1_HG19, CHR10_HG19, CHR2_HG19), ValidationExclusion.TYPE.ALL, NON_CANONICAL_HUMAN_ORDER, null, false, null }, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, OUT_OF_ORDER, null, false, null 
}, + { Arrays.asList(CHR1_HG19, CHR2_HG19), Arrays.asList(CHR2_HG19, CHR1_HG19), ValidationExclusion.TYPE.ALL, OUT_OF_ORDER, null, false, null }, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), ValidationExclusion.TYPE.ALLOW_SEQ_DICT_INCOMPATIBILITY, DIFFERENT_INDICES, null, true, hg19AllContigsIntervalSet }, + { Arrays.asList(CHRM_HG19, CHR1_HG19), Arrays.asList(CHR1_HG19), ValidationExclusion.TYPE.ALL, DIFFERENT_INDICES, null, true, hg19AllContigsIntervalSet } + }; + } + + @Test( dataProvider = "SequenceDictionaryDataProvider" ) + public void testSequenceDictionaryValidation( final List firstDictionaryContigs, + final List secondDictionaryContigs, + final ValidationExclusion.TYPE validationExclusions, + final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, + final Class expectedExceptionUponValidation, + final boolean isReadsToReferenceComparison, + final GenomeLocSortedSet intervals ) { + + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + final String testDescription = String.format("First dictionary: %s Second dictionary: %s Validation exclusions: %s", + SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), + SequenceDictionaryUtils.getDictionaryAsString(secondDictionary), + validationExclusions); + + Exception exceptionThrown = null; + try { + SequenceDictionaryUtils.validateDictionaries(logger, + validationExclusions, + "firstDictionary", + firstDictionary, + "secondDictionary", + secondDictionary, + isReadsToReferenceComparison, + intervals); + } + catch ( Exception e ) { + exceptionThrown = e; + } + + if ( expectedExceptionUponValidation != null ) { + Assert.assertTrue(exceptionThrown != null && expectedExceptionUponValidation.isInstance(exceptionThrown), + String.format("Expected exception %s but saw %s instead. 
%s", + expectedExceptionUponValidation.getSimpleName(), + exceptionThrown == null ? "no exception" : exceptionThrown.getClass().getSimpleName(), + testDescription)); + } + else { + Assert.assertTrue(exceptionThrown == null, + String.format("Expected no exception but saw exception %s instead. %s", + exceptionThrown != null ? exceptionThrown.getClass().getSimpleName() : "none", + testDescription)); + } + } + + @Test( dataProvider = "SequenceDictionaryDataProvider" ) + public void testSequenceDictionaryComparison( final List firstDictionaryContigs, + final List secondDictionaryContigs, + final ValidationExclusion.TYPE validationExclusions, + final SequenceDictionaryUtils.SequenceDictionaryCompatibility dictionaryCompatibility, + final Class expectedExceptionUponValidation, + final boolean isReadsToReferenceComparison, + final GenomeLocSortedSet intervals ) { + + final SAMSequenceDictionary firstDictionary = createSequenceDictionary(firstDictionaryContigs); + final SAMSequenceDictionary secondDictionary = createSequenceDictionary(secondDictionaryContigs); + final String testDescription = String.format("First dictionary: %s Second dictionary: %s", + SequenceDictionaryUtils.getDictionaryAsString(firstDictionary), + SequenceDictionaryUtils.getDictionaryAsString(secondDictionary)); + + final SequenceDictionaryUtils.SequenceDictionaryCompatibility reportedCompatibility = + SequenceDictionaryUtils.compareDictionaries(firstDictionary, secondDictionary); + + Assert.assertTrue(reportedCompatibility == dictionaryCompatibility, + String.format("Dictionary comparison should have returned %s but instead returned %s. 
%s", + dictionaryCompatibility, reportedCompatibility, testDescription)); + } + + private SAMSequenceDictionary createSequenceDictionary( final List contigs ) { + final List clonedContigs = new ArrayList(contigs.size()); + + // Clone the individual SAMSequenceRecords to avoid contig-index issues with shared objects + // across multiple dictionaries in tests + for ( SAMSequenceRecord contig : contigs ) { + clonedContigs.add(contig.clone()); + } + + return new SAMSequenceDictionary(clonedContigs); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/SimpleTimerUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/SimpleTimerUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/SimpleTimerUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/SimpleTimerUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/TestNGTestTransformer.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/TestNGTestTransformer.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/TestNGTestTransformer.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/TestNGTestTransformer.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/UtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/UtilsUnitTest.java new file mode 100644 index 000000000..dc3909e65 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/UtilsUnitTest.java @@ -0,0 +1,362 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to 
use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils; + +import org.apache.commons.io.FileUtils; +import org.broadinstitute.gatk.utils.io.IOUtils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +/** + * Testing framework for general purpose utilities class. 
+ * + * @author hanna + * @version 0.1 + */ + +public class UtilsUnitTest extends BaseTest { + @Test + public void testAppend() { + for ( int leftSize : Arrays.asList(0, 1, 2, 3) ) { + for ( final int rightSize : Arrays.asList(0, 1, 2) ) { + final List left = new LinkedList(); + for ( int i = 0; i < leftSize; i++ ) left.add(i); + final List total = new LinkedList(); + for ( int i = 0; i < leftSize + rightSize; i++ ) total.add(i); + + if ( rightSize == 0 ) + Assert.assertEquals(Utils.append(left), total); + if ( rightSize == 1 ) + Assert.assertEquals(Utils.append(left, leftSize), total); + if ( rightSize == 2 ) + Assert.assertEquals(Utils.append(left, leftSize, leftSize + 1), total); + } + } + + } + + @Test + public void testDupStringNoChars() { + String duped = Utils.dupString('a',0); + Assert.assertEquals(duped.length(), 0, "dupString did not produce zero-length string"); + } + + @Test + public void testDupStringOneChar() { + String duped = Utils.dupString('b',1); + Assert.assertEquals(duped.length(), 1, "dupString did not produce single character string"); + Assert.assertEquals(duped.charAt(0), 'b', "dupString character was incorrect"); + } + + @Test + public void testXor() { + Assert.assertEquals(Utils.xor(false, false), false, "xor F F failed"); + Assert.assertEquals(Utils.xor(false, true), true, "xor F T failed"); + Assert.assertEquals(Utils.xor(true, false), true, "xor T F failed"); + Assert.assertEquals(Utils.xor(true, true), false, "xor T T failed"); + } + + @Test + public void testDupStringMultiChar() { + String duped = Utils.dupString('c',5); + Assert.assertEquals(duped.length(), 5, "dupString did not produce five character string"); + Assert.assertEquals(duped,"ccccc","dupString string was incorrect"); + } + + @Test + public void testJoinMap() { + Map map = new LinkedHashMap(); + map.put("one",1); + map.put("two",2); + String joined = Utils.joinMap("-",";",map); + Assert.assertTrue("one-1;two-2".equals(joined)); + } + + @Test + public void 
testJoinMapLargerSet() { + Map map = new LinkedHashMap(); + map.put("one",1); + map.put("two",2); + map.put("three",1); + map.put("four",2); + map.put("five",1); + map.put("six",2); + String joined = Utils.joinMap("-",";",map); + Assert.assertTrue("one-1;two-2;three-1;four-2;five-1;six-2".equals(joined)); + } + + @Test + public void testConcat() { + final String s1 = "A"; + final String s2 = "CC"; + final String s3 = "TTT"; + final String s4 = "GGGG"; + Assert.assertEquals(new String(Utils.concat()), ""); + Assert.assertEquals(new String(Utils.concat(s1.getBytes())), s1); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes())), s1 + s2); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes())), s1 + s2 + s3); + Assert.assertEquals(new String(Utils.concat(s1.getBytes(), s2.getBytes(), s3.getBytes(), s4.getBytes())), s1 + s2 + s3 + s4); + } + + @Test + public void testEscapeExpressions() { + String[] expected, actual; + + expected = new String[] {"one", "two", "three"}; + actual = Utils.escapeExpressions("one two three"); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" one two three"); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions("one two three "); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" one two three "); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" one two three "); + Assert.assertEquals(actual, expected); + + expected = new String[] {"one", "two", "three four", "five", "six"}; + actual = Utils.escapeExpressions("one two 'three four' five six"); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" one two 'three four' five six"); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions("one two 'three four' five six "); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" one two 'three four' five six "); + 
Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" one two 'three four' five six "); + Assert.assertEquals(actual, expected); + + expected = new String[] {"one two", "three", "four"}; + actual = Utils.escapeExpressions("'one two' three four"); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" 'one two' three four"); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions("'one two' three four "); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" 'one two' three four "); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" 'one two' three four "); + Assert.assertEquals(actual, expected); + + expected = new String[] {"one", "two", "three four"}; + actual = Utils.escapeExpressions("one two 'three four'"); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" one two 'three four'"); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions("one two 'three four' "); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" one two 'three four' "); + Assert.assertEquals(actual, expected); + actual = Utils.escapeExpressions(" one two 'three four' "); + Assert.assertEquals(actual, expected); + } + + @Test(dataProvider = "asIntegerListData") + public void testAsIntegerList(final int[] values) { + if (values == null) { + try { + Utils.asList((int[]) null); + Assert.fail("Should have thrown an exception"); + } catch (final IllegalArgumentException ex) { + // good. 
+ } + } else { + final Random rdn = Utils.getRandomGenerator(); + final int[] valuesClone = values.clone(); + final List list = Utils.asList(valuesClone); + Assert.assertNotNull(list); + Assert.assertEquals(list.size(),values.length); + for (int i = 0; i < values.length; i++) + Assert.assertEquals((int) list.get(i),values[i]); + for (int i = 0; i < values.length; i++) + valuesClone[rdn.nextInt(values.length)] = rdn.nextInt(1000); + for (int i = 0; i < values.length; i++) + Assert.assertEquals((int) list.get(i),valuesClone[i]); + } + } + + @Test(dataProvider = "asDoubleListData") + public void testAsDoubleList(final double[] values) { + if (values == null) { + try { + Utils.asList((int[]) null); + Assert.fail("Should have thrown an exception"); + } catch (final IllegalArgumentException ex) { + // good. + } + } else { + final Random rdn = Utils.getRandomGenerator(); + final double[] valuesClone = values.clone(); + final List list = Utils.asList(valuesClone); + Assert.assertNotNull(list); + Assert.assertEquals(list.size(),values.length); + for (int i = 0; i < values.length; i++) + Assert.assertEquals((double) list.get(i),values[i]); + for (int i = 0; i < values.length; i++) + valuesClone[rdn.nextInt(values.length)] = rdn.nextDouble() * 1000; + for (int i = 0; i < values.length; i++) + Assert.assertEquals((double) list.get(i),valuesClone[i]); + } + } + + @Test + public void testCalcMD5() throws Exception { + final File source = new File(publicTestDir + "exampleFASTA.fasta"); + final String sourceMD5 = "36880691cf9e4178216f7b52e8d85fbe"; + + final byte[] sourceBytes = IOUtils.readFileIntoByteArray(source); + Assert.assertEquals(Utils.calcMD5(sourceBytes), sourceMD5); + + final String sourceString = FileUtils.readFileToString(source); + Assert.assertEquals(Utils.calcMD5(sourceString), sourceMD5); + } + + @Test + public void testLongestCommonOps() { + for ( int prefixLen = 0; prefixLen < 20; prefixLen++ ) { + for ( int extraSeq1Len = 0; extraSeq1Len < 10; extraSeq1Len++ ) 
{ + for ( int extraSeq2Len = 0; extraSeq2Len < 10; extraSeq2Len++ ) { + for ( int max = 0; max < 50; max++ ) { + final String prefix = Utils.dupString("A", prefixLen); + final int expected = Math.min(prefixLen, max); + + { + final String seq1 = prefix + Utils.dupString("C", extraSeq1Len); + final String seq2 = prefix + Utils.dupString("G", extraSeq1Len); + Assert.assertEquals(Utils.longestCommonPrefix(seq1.getBytes(), seq2.getBytes(), max), expected, "LongestCommonPrefix failed: seq1 " + seq1 + " seq2 " + seq2 + " max " + max); + } + + { + final String seq1 = Utils.dupString("C", extraSeq1Len) + prefix; + final String seq2 = Utils.dupString("G", extraSeq1Len) + prefix; + Assert.assertEquals(Utils.longestCommonSuffix(seq1.getBytes(), seq2.getBytes(), max), expected, "longestCommonSuffix failed: seq1 " + seq1 + " seq2 " + seq2 + " max " + max); + } + } + } + } + } + } + + @DataProvider(name = "trim") + public Object[][] createTrimTestData() { + List tests = new ArrayList(); + + final String s = "AAAA"; + for ( int front = 0; front < s.length(); front++ ) { + for ( int back = 0; back < s.length(); back++ ) { + if ( front + back <= s.length() ) + tests.add(new Object[]{s, front, back}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "trim", enabled = true) + public void testTrim(final String s, final int frontTrim, final int backTrim) { + Assert.assertEquals(s.length() - frontTrim - backTrim, Utils.trimArray(s.getBytes(), frontTrim, backTrim).length); + } + + @Test(dataProvider = "equalRangeData", enabled = true) + public void testEqualRange(final byte[] array1, final byte[] array2, final int offset1, final int offset2, final int length, final boolean expected) { + Assert.assertEquals(Utils.equalRange(array1,offset1,array2,offset2,length),expected); + Assert.assertTrue(Utils.equalRange(array1,offset1,array1,offset1,length)); + Assert.assertTrue(Utils.equalRange(array2,offset2,array2,offset2,length)); + + } + + @DataProvider(name = 
"equalRangeData") + public Object[][] equalRangeData() { + return new Object[][] { + new Object[] { new byte[0] , new byte[0], 0, 0, 0, true}, + new Object[] { "ABCF".getBytes(), "BC".getBytes(), 1,0,2, true }, + new Object[] { "ABCF".getBytes(), "".getBytes(), 1,0,0, true }, + new Object[] { "ABCF".getBytes(), "ACBF".getBytes(), 0,0, 4, false} + }; + + } + + @Test(dataProvider = "skimArrayData") + public void testSkimArray(final String original, final String remove) { + final StringBuilder resultBuilder = new StringBuilder(); + final boolean[] removeBoolean = new boolean[remove.length()]; + for (int i = 0; i < original.length(); i++) + if (remove.charAt(i) == '1') { + resultBuilder.append(original.charAt(i)); + removeBoolean[i] = false; + } else + removeBoolean[i] = true; + + final String expected = resultBuilder.toString(); + final byte[] resultBytes = Utils.skimArray(original.getBytes(),removeBoolean); + final String resultString = new String(resultBytes); + Assert.assertEquals(resultString,expected); + } + + @DataProvider(name = "skimArrayData") + public Object[][] skimArrayData() { + return new Object[][] { + {"romeo+juliette" , "11111111111111" }, + {"romeo+juliette" , "11111011111111" }, + {"romeo+juliette" , "00000011111111" }, + {"romeo+juliette" , "11111100000000" }, + {"romeo+juliette" , "11111011111111" }, + {"romeo+juliette" , "01111010000001" }, + {"romeo+juliette" , "01100110000110" }, + {"romeo+juliette" , "10101010101010" }, + {"romeo+juliette" , "01010101010101" }, + {"romeo+juliette" , "01111010111001" }, + }; + } + + + @DataProvider(name = "asIntegerListData") + public Object[][] asIntegerListData() { + return new Object[][] { + { null }, + {new int[0]}, + {new int[]{1, 2, 3, 4, 5}}, + {new int[]{2}}, + {new int[]{3,4}} + }; + } + + @DataProvider(name = "asDoubleListData") + public Object[][] asDoubleListData() { + return new Object[][] { + { null }, + {new double[0]}, + {new double[]{1, 2, 3, 4, 5}}, + {new double[]{2}}, + {new double[]{3,4}}, 
+ {new double[]{Double.NaN, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY}} + }; + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActiveRegionUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileStateUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileStateUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileStateUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileStateUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/ActivityProfileUnitTest.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfileUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfileUnitTest.java new file mode 100644 index 000000000..f0666aca2 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/activeregion/BandPassActivityProfileUnitTest.java @@ -0,0 
+1,339 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.activeregion; + + +// the imports for unit testing. 
+ + +import htsjdk.samtools.reference.ReferenceSequenceFile; +import org.apache.commons.lang.ArrayUtils; +import htsjdk.tribble.readers.LineIterator; +import org.broadinstitute.gatk.utils.variant.VCIterable; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFCodec; +import htsjdk.variant.vcf.VCFHeader; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + + +public class BandPassActivityProfileUnitTest extends BaseTest { + private final static boolean DEBUG = false; + private GenomeLocParser genomeLocParser; + + private final static int MAX_PROB_PROPAGATION_DISTANCE = 50; + private final static double ACTIVE_PROB_THRESHOLD= 0.002; + + @BeforeClass + public void init() throws FileNotFoundException { + // sequence + ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + genomeLocParser = new GenomeLocParser(seq); + } + + @DataProvider(name = "BandPassBasicTest") + public Object[][] makeBandPassTest() { + final List tests = new LinkedList(); + + for ( int start : Arrays.asList(1, 10, 100, 1000) ) { + for ( boolean precedingIsActive : Arrays.asList(true, false) ) { + for ( int precedingSites: Arrays.asList(0, 1, 10, 100) ) { + for ( int bandPassSize : Arrays.asList(0, 1, 10, 100) ) { + for ( double sigma : Arrays.asList(1.0, 2.0, BandPassActivityProfile.DEFAULT_SIGMA) ) { +// for ( int start : 
Arrays.asList(10) ) { +// for ( boolean precedingIsActive : Arrays.asList(false) ) { +// for ( int precedingSites: Arrays.asList(0) ) { +// for ( int bandPassSize : Arrays.asList(1) ) { + tests.add(new Object[]{ start, precedingIsActive, precedingSites, bandPassSize, sigma }); + } + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = ! DEBUG, dataProvider = "BandPassBasicTest") + public void testBandPass(final int start, final boolean precedingIsActive, final int nPrecedingSites, final int bandPassSize, final double sigma) { + final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, bandPassSize, sigma, false); + + final int expectedBandSize = bandPassSize * 2 + 1; + Assert.assertEquals(profile.getFilteredSize(), bandPassSize, "Wrong filter size"); + Assert.assertEquals(profile.getSigma(), sigma, "Wrong sigma"); + Assert.assertEquals(profile.getBandSize(), expectedBandSize, "Wrong expected band size"); + + final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); + final double precedingProb = precedingIsActive ? 
1.0 : 0.0; + for ( int i = 0; i < nPrecedingSites; i++ ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, i + start); + final ActivityProfileState state = new ActivityProfileState(loc, precedingProb); + profile.add(state); + } + + final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, nPrecedingSites + start); + profile.add(new ActivityProfileState(nextLoc, 1.0)); + + if ( precedingIsActive == false && nPrecedingSites >= bandPassSize && bandPassSize < start ) { + // we have enough space that all probs fall on the genome + final double[] probs = profile.getProbabilitiesAsArray(); + Assert.assertEquals(MathUtils.sum(probs), 1.0 * (nPrecedingSites * precedingProb + 1), 1e-3, "Activity profile doesn't sum to number of non-zero prob states"); + } + } + + private double[] bandPassInOnePass(final BandPassActivityProfile profile, final double[] activeProbArray) { + final double[] bandPassProbArray = new double[activeProbArray.length]; + + // apply the band pass filter for activeProbArray into filteredProbArray + final double[] GaussianKernel = profile.getKernel(); + for( int iii = 0; iii < activeProbArray.length; iii++ ) { + final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(profile.getFilteredSize() - iii, 0), Math.min(GaussianKernel.length, profile.getFilteredSize() + activeProbArray.length - iii)); + final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - profile.getFilteredSize()), Math.min(activeProbArray.length,iii + profile.getFilteredSize() + 1)); + bandPassProbArray[iii] = dotProduct(activeProbSubArray, kernel); + } + + return bandPassProbArray; + } + + public static double dotProduct(double[] v1, double[] v2) { + Assert.assertEquals(v1.length,v2.length,"Array lengths do not mach in dotProduct"); + double result = 0.0; + for (int k = 0; k < v1.length; k++) + result += v1[k] * v2[k]; + + return result; + } + + @DataProvider(name = "BandPassComposition") + public Object[][] 
makeBandPassComposition() { + final List tests = new LinkedList(); + + for ( int bandPassSize : Arrays.asList(0, 1, 10, 100, BandPassActivityProfile.MAX_FILTER_SIZE) ) { + for ( int integrationLength : Arrays.asList(1, 10, 100, 1000) ) { + tests.add(new Object[]{ bandPassSize, integrationLength }); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test( enabled = ! DEBUG, dataProvider = "BandPassComposition") + public void testBandPassComposition(final int bandPassSize, final int integrationLength) { + final int start = 1; + final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, + ACTIVE_PROB_THRESHOLD, bandPassSize, BandPassActivityProfile.DEFAULT_SIGMA); + final double[] rawActiveProbs = new double[integrationLength + bandPassSize * 2]; + + // add a buffer so that we can get all of the band pass values + final String contig = genomeLocParser.getContigs().getSequences().get(0).getSequenceName(); + int pos = start; + int rawProbsOffset = 0; + for ( int i = 0; i < bandPassSize; i++ ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos++); + final ActivityProfileState state = new ActivityProfileState(loc, 0.0); + profile.add(state); + rawActiveProbs[rawProbsOffset++] = 0.0; + rawActiveProbs[rawActiveProbs.length - rawProbsOffset] = 0.0; + } + + for ( int i = 0; i < integrationLength; i++ ) { + final GenomeLoc nextLoc = genomeLocParser.createGenomeLoc(contig, pos++); + profile.add(new ActivityProfileState(nextLoc, 1.0)); + rawActiveProbs[rawProbsOffset++] = 1.0; + + for ( int j = 0; j < profile.size(); j++ ) { + Assert.assertTrue(profile.getStateList().get(j).isActiveProb >= 0.0, "State probability < 0 at " + j); + Assert.assertTrue(profile.getStateList().get(j).isActiveProb <= 1.0 + 1e-3, "State probability > 1 at " + j); + } + } + + final double[] expectedProbs = bandPassInOnePass(profile, rawActiveProbs); + for ( int j = 0; j < profile.size(); j++ ) { + 
Assert.assertEquals(profile.getStateList().get(j).isActiveProb, expectedProbs[j], "State probability not expected at " + j); + } + } + + // ------------------------------------------------------------------------------------ + // + // Code to test the creation of the kernels + // + // ------------------------------------------------------------------------------------ + + /** + + kernel <- function(sd, pThres) { + raw = dnorm(-80:81, mean=0, sd=sd) + norm = raw / sum(raw) + bad = norm < pThres + paste(norm[! bad], collapse=", ") + } + + print(kernel(0.01, 1e-5)) + print(kernel(1, 1e-5)) + print(kernel(5, 1e-5)) + print(kernel(17, 1e-5)) + + * @return + */ + + @DataProvider(name = "KernelCreation") + public Object[][] makeKernelCreation() { + final List tests = new LinkedList(); + + tests.add(new Object[]{ 0.01, 1000, new double[]{1.0}}); + tests.add(new Object[]{ 1.0, 1000, new double[]{0.0001338302, 0.004431848, 0.053990966, 0.241970723, 0.398942278, 0.241970723, 0.053990966, 0.004431848, 0.0001338302}}); + tests.add(new Object[]{ 1.0, 0, new double[]{1.0}}); + tests.add(new Object[]{ 1.0, 1, new double[]{0.2740686, 0.4518628, 0.2740686}}); + tests.add(new Object[]{ 1.0, 2, new double[]{0.05448868, 0.24420134, 0.40261995, 0.24420134, 0.05448868}}); + tests.add(new Object[]{ 1.0, 1000, new double[]{0.0001338302, 0.004431848, 0.053990966, 0.241970723, 0.398942278, 0.241970723, 0.053990966, 0.004431848, 0.0001338302}}); + tests.add(new Object[]{ 5.0, 1000, new double[]{1.1788613551308e-05, 2.67660451529771e-05, 5.83893851582921e-05, 0.000122380386022754, 0.000246443833694604, 0.000476817640292968, 0.000886369682387602, 0.00158309031659599, 0.00271659384673712, 0.00447890605896858, 0.00709491856924629, 0.0107981933026376, 0.0157900316601788, 0.0221841669358911, 0.029945493127149, 0.0388372109966426, 0.0483941449038287, 0.0579383105522965, 0.0666449205783599, 0.0736540280606647, 0.0782085387950912, 0.0797884560802865, 0.0782085387950912, 0.0736540280606647, 
0.0666449205783599, 0.0579383105522965, 0.0483941449038287, 0.0388372109966426, 0.029945493127149, 0.0221841669358911, 0.0157900316601788, 0.0107981933026376, 0.00709491856924629, 0.00447890605896858, 0.00271659384673712, 0.00158309031659599, 0.000886369682387602, 0.000476817640292968, 0.000246443833694604, 0.000122380386022754, 5.83893851582921e-05, 2.67660451529771e-05, 1.1788613551308e-05}}); + tests.add(new Object[]{17.0, 1000, new double[]{1.25162575710745e-05, 1.57001772728555e-05, 1.96260034693739e-05, 2.44487374842009e-05, 3.03513668801384e-05, 3.75489089511911e-05, 4.62928204154855e-05, 5.68757597480354e-05, 6.96366758708924e-05, 8.49661819944029e-05, 0.000103312156275406, 0.000125185491708561, 0.000151165896477646, 0.000181907623161359, 0.000218144981137171, 0.000260697461819069, 0.000310474281706066, 0.000368478124457557, 0.000435807841336874, 0.00051365985048857, 0.000603327960854364, 0.000706201337376934, 0.000823760321812988, 0.000957569829285965, 0.00110927005589186, 0.00128056425833231, 0.00147320340358764, 0.00168896753568649, 0.00192964376796036, 0.00219700088266432, 0.00249276060490197, 0.00281856571330067, 0.00317594525418154, 0.00356627723683793, 0.00399074930220799, 0.00445031797242299, 0.00494566720070898, 0.00547716704583487, 0.00604483338842317, 0.00664828968356621, 0.00728673180099395, 0.00795889703644795, 0.00866303838230695, 0.00939690511889675, 0.0101577307281371, 0.010942229037054, 0.0117465993701676, 0.0125665413280325, 0.0133972796167302, 0.0142335991336574, 0.0150698902735454, 0.0159002041614507, 0.0167183172536454, 0.0175178044808441, 0.0182921198494897, 0.0190346831745763, 0.0197389714002676, 0.020398612780527, 0.0210074820484496, 0.0215597946062309, 0.0220501977225941, 0.022473856734247, 0.0228265343139947, 0.0231046609899767, 0.0233053952756892, 0.0234266719946158, 0.0234672376502799, 0.0234266719946158, 0.0233053952756892, 0.0231046609899767, 0.0228265343139947, 0.022473856734247, 0.0220501977225941, 0.0215597946062309, 
0.0210074820484496, 0.020398612780527, 0.0197389714002676, 0.0190346831745763, 0.0182921198494897, 0.0175178044808441, 0.0167183172536454, 0.0159002041614507, 0.0150698902735454, 0.0142335991336574, 0.0133972796167302, 0.0125665413280325, 0.0117465993701676, 0.010942229037054, 0.0101577307281371, 0.00939690511889675, 0.00866303838230695, 0.00795889703644795, 0.00728673180099395, 0.00664828968356621, 0.00604483338842317, 0.00547716704583487, 0.00494566720070898, 0.00445031797242299, 0.00399074930220799, 0.00356627723683793, 0.00317594525418154, 0.00281856571330067, 0.00249276060490197, 0.00219700088266432, 0.00192964376796036, 0.00168896753568649, 0.00147320340358764, 0.00128056425833231, 0.00110927005589186, 0.000957569829285965, 0.000823760321812988, 0.000706201337376934, 0.000603327960854364, 0.00051365985048857, 0.000435807841336874, 0.000368478124457557, 0.000310474281706066, 0.000260697461819069, 0.000218144981137171, 0.000181907623161359, 0.000151165896477646, 0.000125185491708561, 0.000103312156275406, 8.49661819944029e-05, 6.96366758708924e-05, 5.68757597480354e-05, 4.62928204154855e-05, 3.75489089511911e-05, 3.03513668801384e-05, 2.44487374842009e-05, 1.96260034693739e-05, 1.57001772728555e-05, 1.25162575710745e-05}}); + + return tests.toArray(new Object[][]{}); + } + + @Test( enabled = ! 
DEBUG, dataProvider = "KernelCreation") + public void testKernelCreation(final double sigma, final int maxSize, final double[] expectedKernel) { + final BandPassActivityProfile profile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD, + maxSize, sigma, true); + + final double[] kernel = profile.getKernel(); + Assert.assertEquals(kernel.length, expectedKernel.length); + for ( int i = 0; i < kernel.length; i++ ) + Assert.assertEquals(kernel[i], expectedKernel[i], 1e-3, "Kernels not equal at " + i); + } + + // ------------------------------------------------------------------------------------ + // + // Large-scale test, reading in 1000G Phase I chr20 calls and making sure that + // the regions returned are the same if you run on the entire profile vs. doing it + // incremental + // + // ------------------------------------------------------------------------------------ + + @DataProvider(name = "VCFProfile") + public Object[][] makeVCFProfile() { + final List tests = new LinkedList(); + + //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 61000}); + //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 100000}); + //tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000}); + tests.add(new Object[]{ privateTestDir + "ALL.chr20.phase1_release_v3.20101123.snps_indels_svs.sites.vcf", "20", 60470, 1000000}); + tests.add(new Object[]{ privateTestDir + "NA12878.WGS.b37.chr20.firstMB.vcf", "20", 1, 1000000}); + + return tests.toArray(new Object[][]{}); + } + + @Test( dataProvider = "VCFProfile") + public void testVCFProfile(final String path, final String contig, final int start, final int end) throws Exception { + final int extension = 50; + final int minRegionSize = 50; + final int maxRegionSize = 300; + + 
final File file = new File(path); + final VCFCodec codec = new VCFCodec(); + final Pair> reader = VCIterable.readAllVCs(file, codec); + + final List incRegions = new ArrayList(); + final BandPassActivityProfile incProfile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD); + final BandPassActivityProfile fullProfile = new BandPassActivityProfile(genomeLocParser, null, MAX_PROB_PROPAGATION_DISTANCE, ACTIVE_PROB_THRESHOLD); + int pos = start; + for ( final VariantContext vc : reader.getSecond() ) { + if ( vc == null ) continue; + while ( pos < vc.getStart() ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos); + //logger.warn("Adding 0.0 at " + loc + " because vc.getStart is " + vc.getStart()); + incProfile.add(new ActivityProfileState(loc, 0.0)); + fullProfile.add(new ActivityProfileState(loc, 0.0)); + pos++; + } + if ( vc.getStart() >= start && vc.getEnd() <= end ) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, pos); + //logger.warn("Adding 1.0 at " + loc); + ActivityProfileState.Type type = ActivityProfileState.Type.NONE; + Number value = null; + if ( vc.isBiallelic() && vc.isIndel() ) { + type = ActivityProfileState.Type.HIGH_QUALITY_SOFT_CLIPS; + value = Math.abs(vc.getIndelLengths().get(0)); + } + final ActivityProfileState state = new ActivityProfileState(loc, 1.0, type, value); + incProfile.add(state); + fullProfile.add(state); + pos++; + } + + incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, false)); + + if ( vc.getStart() > end ) + break; + } + + incRegions.addAll(incProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true)); + + final List fullRegions = fullProfile.popReadyActiveRegions(extension, minRegionSize, maxRegionSize, true); + assertGoodRegions(fullRegions, start, end, maxRegionSize); + assertGoodRegions(incRegions, start, end, maxRegionSize); + + Assert.assertEquals(incRegions.size(), 
fullRegions.size(), "incremental and full region sizes aren't the same"); + for ( int i = 0; i < fullRegions.size(); i++ ) { + final ActiveRegion incRegion = incRegions.get(i); + final ActiveRegion fullRegion = fullRegions.get(i); + Assert.assertTrue(incRegion.equalExceptReads(fullRegion), "Full and incremental regions are not equal: full = " + fullRegion + " inc = " + incRegion); + } + } + + private void assertGoodRegions(final List regions, final int start, final int end, final int maxRegionSize) { + int lastPosSeen = start - 1; + for ( int regionI = 0; regionI < regions.size(); regionI++ ) { + final ActiveRegion region = regions.get(regionI); + Assert.assertEquals(region.getLocation().getStart(), lastPosSeen + 1, "discontinuous with previous region. lastPosSeen " + lastPosSeen + " but region is " + region); + Assert.assertTrue(region.getLocation().size() <= maxRegionSize, "Region is too big: " + region); + lastPosSeen = region.getLocation().getStop(); + + for ( final ActivityProfileState state : region.getSupportingStates() ) { + Assert.assertEquals(state.isActiveProb > ACTIVE_PROB_THRESHOLD, region.isActive(), + "Region is active=" + region.isActive() + " but contains a state " + state + " with prob " + + state.isActiveProb + " not within expected values given threshold for activity of " + + ACTIVE_PROB_THRESHOLD); + } + } + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/baq/BAQUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/classloader/JVMUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/classloader/JVMUtilsUnitTest.java similarity index 100% 
rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/classloader/JVMUtilsUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/classloader/JVMUtilsUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/clipping/ReadClipperTestUtils.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/clipping/ReadClipperTestUtils.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/clipping/ReadClipperTestUtils.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/clipping/ReadClipperTestUtils.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/clipping/ReadClipperUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/clipping/ReadClipperUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/clipping/ReadClipperUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/clipping/ReadClipperUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/codecs/hapmap/HapMapUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/codecs/hapmap/HapMapUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/codecs/hapmap/HapMapUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/codecs/hapmap/HapMapUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/collections/DefaultHashMapUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/collections/DefaultHashMapUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/collections/DefaultHashMapUnitTest.java rename to 
public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/collections/DefaultHashMapUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/collections/ExpandingArrayListUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/collections/ExpandingArrayListUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/collections/ExpandingArrayListUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/collections/ExpandingArrayListUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSiteUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSiteUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSiteUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSiteUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSourceUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSourceUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSourceUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/commandline/ArgumentMatchSourceUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineUnitTest.java rename to 
public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/commandline/ParsingEngineUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/RodBindingCollectionUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/commandline/RodBindingCollectionUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/RodBindingCollectionUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/commandline/RodBindingCollectionUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/RodBindingUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/commandline/RodBindingUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/commandline/RodBindingUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/commandline/RodBindingUnitTest.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java new file mode 100644 index 000000000..61d346d5a --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -0,0 +1,219 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject 
to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.downsampling; + +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMFileHeader; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.pileup.PileupElement; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + + +/** + * Basic unit test for AlleleBiasedDownsamplingUtils + */ +public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { + + + @Test + public void testSmartDownsampling() { + + final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50}; + final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0}; + + // no contamination, no removal + testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, 
idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // het sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // hom sample, het contaminant, overlapping alleles + final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0}; + testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + + // hom sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0}); + + // het sample, het contaminant, overlapping alleles + testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55}); + testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 
10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + } + + private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction, + final int pileupSize, final int[] initialCounts, final int[] targetCounts) { + + final int[] actualCounts = initialCounts.clone(); + actualCounts[0] += addA; + actualCounts[1] += addC; + actualCounts[2] += addG; + actualCounts[3] += addT; + + final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int) (pileupSize * contaminationFraction)); + Assert.assertTrue(countsAreEqual(results, targetCounts)); + } + + private static boolean countsAreEqual(final int[] counts1, final int[] counts2) { + for ( int i = 0; i < 4; i++ ) { + if ( counts1[i] != counts2[i] ) + return false; + } + return true; + } + + @DataProvider(name = "BiasedDownsamplingTest") + public Object[][] makeBiasedDownsamplingTest() { + final List tests = new LinkedList(); + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + + for ( final int originalCount : Arrays.asList(1, 2, 10, 1000) ) { + for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) { + if ( toRemove <= originalCount ) + tests.add(new Object[]{header, originalCount, toRemove}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "BiasedDownsamplingTest") + public void testBiasedDownsampling(final SAMFileHeader header, final int originalCount, final int toRemove) { + + final LinkedList elements = new LinkedList<>(); + for ( int i = 0; i < originalCount; i++ ) { + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); + elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); + } + + final List result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalCount, toRemove); + + Assert.assertEquals(result.size(), toRemove); + } + + @Test + public void 
testLoadContaminationFileDetails(){ + Logger logger=org.apache.log4j.Logger.getRootLogger(); + + final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; + final File ContamFile1=new File(ArtificalBAMLocation+"contamination.case.1.txt"); + + Map Contam1=new HashMap(); + Set Samples1=new HashSet(); + + Contam1.put("NA11918",0.15); + Samples1.addAll(Contam1.keySet()); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + + Contam1.put("NA12842",0.13); + Samples1.addAll(Contam1.keySet()); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + + Samples1.add("DUMMY"); + testLoadFile(ContamFile1,Samples1,Contam1,logger); + } + + private static void testLoadFile(final File file, final Set Samples, final Map map, Logger logger){ + Map loadedMap = AlleleBiasedDownsamplingUtils.loadContaminationFile(file,0.0,Samples,logger); + Assert.assertTrue(loadedMap.equals(map)); + } + + @DataProvider(name = "goodContaminationFiles") + public Integer[][] goodContaminationFiles() { + return new Integer[][]{ + {1, 2}, + {2, 3}, + {3, 2}, + {4, 2}, + {5, 3}, + {6, 2}, + {7, 2}, + {8, 2} + }; + } + + @Test(dataProvider = "goodContaminationFiles") + public void testLoadContaminationFile(final Integer ArtificalBAMnumber, final Integer numberOfSamples) { + final String ArtificialBAM = String.format("ArtificallyContaminatedBams/contamination.case.%d.txt", ArtificalBAMnumber); + Logger logger = org.apache.log4j.Logger.getRootLogger(); + + File ContamFile = new File(privateTestDir, ArtificialBAM); + Assert.assertTrue(AlleleBiasedDownsamplingUtils.loadContaminationFile(ContamFile, 0.0, null, logger).size() == numberOfSamples); + + } + + + @DataProvider(name = "badContaminationFiles") + public Integer[][] badContaminationFiles() { + return new Integer[][]{{1}, {2}, {3}, {4}, {5}}; + } + + @Test(dataProvider = "badContaminationFiles", expectedExceptions = UserException.MalformedFile.class) + public void testLoadBrokenContaminationFile(final int i) { + Logger logger = 
org.apache.log4j.Logger.getRootLogger(); + final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; + + File ContaminationFile = new File(ArtificalBAMLocation + String.format("contamination.case.broken.%d.txt", i)); + AlleleBiasedDownsamplingUtils.loadContaminationFile(ContaminationFile, 0.0, null, logger); + + } + + +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/file/FSLockWithSharedUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/file/FSLockWithSharedUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/file/FSLockWithSharedUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/file/FSLockWithSharedUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsBenchmark.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsBenchmark.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsBenchmark.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsBenchmark.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsUnitTest.java 
b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/fragments/FragmentUtilsUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/haplotype/EventMapUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/haplotype/HaplotypeUnitTest.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java new file mode 100644 index 000000000..88b17c766 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/interval/IntervalUtilsUnitTest.java @@ -0,0 +1,1103 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, 
sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.interval; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.util.Interval; +import htsjdk.samtools.util.IntervalList; +import htsjdk.samtools.SAMFileHeader; +import org.apache.commons.io.FileUtils; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.IntervalBinding; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.GenomeLocSortedSet; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * test out the interval utility methods + */ +public class IntervalUtilsUnitTest extends BaseTest { + // used to seed the genome loc parser with a sequence dictionary + private 
SAMFileHeader hg18Header; + private GenomeLocParser hg18GenomeLocParser; + private List hg18ReferenceLocs; + private SAMFileHeader hg19Header; + private GenomeLocParser hg19GenomeLocParser; + private List hg19ReferenceLocs; + private List hg19exomeIntervals; + + private List getLocs(String... intervals) { + return getLocs(Arrays.asList(intervals)); + } + + private List getLocs(List intervals) { + if (intervals.size() == 0) + return hg18ReferenceLocs; + List locs = new ArrayList(); + for (String interval: intervals) + locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); + return Collections.unmodifiableList(locs); + } + + @BeforeClass + public void init() { + File hg18Ref = new File(BaseTest.hg18Reference); + try { + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg18Ref); + hg18Header = new SAMFileHeader(); + hg18Header.setSequenceDictionary(seq.getSequenceDictionary()); + hg18GenomeLocParser = new GenomeLocParser(seq); + hg18ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(seq.getSequenceDictionary()).toList()) ; + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(hg18Ref,ex); + } + + File hg19Ref = new File(BaseTest.hg19Reference); + try { + final ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref); + hg19Header = new SAMFileHeader(); + hg19Header.setSequenceDictionary(seq.getSequenceDictionary()); + hg19GenomeLocParser = new GenomeLocParser(seq); + hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(seq.getSequenceDictionary()).toList()) ; + + hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals))); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(hg19Ref,ex); + } + } + + // ------------------------------------------------------------------------------------- + // + 
// tests to ensure the quality of the interval cuts of the interval cutting functions + // + // ------------------------------------------------------------------------------------- + + private class IntervalSlicingTest extends TestDataProvider { + public int parts; + public double maxAllowableVariance; + + private IntervalSlicingTest(final int parts, final double maxAllowableVariance) { + super(IntervalSlicingTest.class); + this.parts = parts; + this.maxAllowableVariance = maxAllowableVariance; + } + + public String toString() { + return String.format("IntervalSlicingTest parts=%d maxVar=%.2f", parts, maxAllowableVariance); + } + } + + @DataProvider(name = "intervalslicingdata") + public Object[][] createTrees() { + new IntervalSlicingTest(1, 0); + new IntervalSlicingTest(2, 1); + new IntervalSlicingTest(5, 1); + new IntervalSlicingTest(10, 1); + new IntervalSlicingTest(67, 1); + new IntervalSlicingTest(100, 1); + new IntervalSlicingTest(500, 1); + new IntervalSlicingTest(1000, 1); + return IntervalSlicingTest.getTests(IntervalSlicingTest.class); + } + + @Test(enabled = true, dataProvider = "intervalslicingdata") + public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { + List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); + + long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); + long idealSplitSize = totalSize / test.parts; + + long sumOfSplitSizes = 0; + int counter = 0; + for ( final List split : splits ) { + long splitSize = IntervalUtils.intervalSize(split); + double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize); + //logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); + counter++; + sumOfSplitSizes += splitSize; + Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, 
test.maxAllowableVariance)); + } + + Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); + } + + // ------------------------------------------------------------------------------------- + // + // splitLocusIntervals tests + // + // ------------------------------------------------------------------------------------- + + /** large scale tests for many intervals */ + private class SplitLocusIntervalsTest extends TestDataProvider { + final List originalIntervals; + final public int parts; + + private SplitLocusIntervalsTest(final String name, List originalIntervals, final int parts) { + super(SplitLocusIntervalsTest.class, name); + this.parts = parts; + this.originalIntervals = originalIntervals; + } + + public String toString() { + return String.format("%s parts=%d", super.toString(), parts); + } + } + + @DataProvider(name = "IntervalRepartitionTest") + public Object[][] createIntervalRepartitionTest() { + for ( int parts : Arrays.asList(1, 2, 3, 10, 13, 100, 151, 1000, 10000) ) { + //for ( int parts : Arrays.asList(10) ) { + new SplitLocusIntervalsTest("hg19RefLocs", hg19ReferenceLocs, parts); + new SplitLocusIntervalsTest("hg19ExomeLocs", hg19exomeIntervals, parts); + } + + return SplitLocusIntervalsTest.getTests(SplitLocusIntervalsTest.class); + } + + @Test(enabled = true, dataProvider = "IntervalRepartitionTest") + public void testIntervalRepartition(SplitLocusIntervalsTest test) { + List> splitByLocus = IntervalUtils.splitLocusIntervals(test.originalIntervals, test.parts); + Assert.assertEquals(splitByLocus.size(), test.parts, "SplitLocusIntervals failed to generate correct number of intervals"); + List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); + + // test overall size + final long originalSize = IntervalUtils.intervalSize(test.originalIntervals); + final long flatSize = IntervalUtils.intervalSize(flat); + Assert.assertEquals(flatSize, originalSize, 
"SplitLocusIntervals locs cover an incorrect number of bases"); + + // test size of each split + final long ideal = (long)Math.floor(originalSize / (1.0 * test.parts)); + final long maxSize = ideal + (originalSize % test.parts) * test.parts; // no more than N * rounding error in size + for ( final List split : splitByLocus ) { + final long splitSize = IntervalUtils.intervalSize(split); + Assert.assertTrue(splitSize >= ideal && splitSize <= maxSize, + String.format("SplitLocusIntervals interval (start=%s) has size %d outside of bounds ideal=%d, max=%d", + split.get(0), splitSize, ideal, maxSize)); + } + + // test that every base in original is covered once by a base in split by locus intervals + String diff = IntervalUtils.equateIntervals(test.originalIntervals, flat); + Assert.assertNull(diff, diff); + } + + /** small scale tests where the expected cuts are enumerated upfront for testing */ + private class SplitLocusIntervalsSmallTest extends TestDataProvider { + final List original; + final public int parts; + final public int expectedParts; + final List expected; + + private SplitLocusIntervalsSmallTest(final String name, List originalIntervals, final int parts, List expected) { + this(name, originalIntervals, parts, expected, parts); + } + + private SplitLocusIntervalsSmallTest(final String name, List originalIntervals, final int parts, List expected, int expectedParts) { + super(SplitLocusIntervalsSmallTest.class, name); + this.parts = parts; + this.expectedParts = expectedParts; + this.original = originalIntervals; + this.expected = expected; + } + + public String toString() { + return String.format("%s parts=%d", super.toString(), parts); + } + } + + @DataProvider(name = "SplitLocusIntervalsSmallTest") + public Object[][] createSplitLocusIntervalsSmallTest() { + GenomeLoc bp01_10 = hg19GenomeLocParser.createGenomeLoc("1", 1, 10); + + GenomeLoc bp1_5 = hg19GenomeLocParser.createGenomeLoc("1", 1, 5); + GenomeLoc bp6_10 = hg19GenomeLocParser.createGenomeLoc("1", 
6, 10); + new SplitLocusIntervalsSmallTest("cut into two", Arrays.asList(bp01_10), 2, Arrays.asList(bp1_5, bp6_10)); + + GenomeLoc bp20_30 = hg19GenomeLocParser.createGenomeLoc("1", 20, 30); + new SplitLocusIntervalsSmallTest("two in two", Arrays.asList(bp01_10, bp20_30), 2, Arrays.asList(bp01_10, bp20_30)); + + GenomeLoc bp1_7 = hg19GenomeLocParser.createGenomeLoc("1", 1, 7); + GenomeLoc bp8_10 = hg19GenomeLocParser.createGenomeLoc("1", 8, 10); + GenomeLoc bp20_23 = hg19GenomeLocParser.createGenomeLoc("1", 20, 23); + GenomeLoc bp24_30 = hg19GenomeLocParser.createGenomeLoc("1", 24, 30); + new SplitLocusIntervalsSmallTest("two in three", Arrays.asList(bp01_10, bp20_30), 3, + Arrays.asList(bp1_7, bp8_10, bp20_23, bp24_30)); + + GenomeLoc bp1_2 = hg19GenomeLocParser.createGenomeLoc("1", 1, 2); + GenomeLoc bp1_1 = hg19GenomeLocParser.createGenomeLoc("1", 1, 1); + GenomeLoc bp2_2 = hg19GenomeLocParser.createGenomeLoc("1", 2, 2); + new SplitLocusIntervalsSmallTest("too many pieces", Arrays.asList(bp1_2), 5, Arrays.asList(bp1_1, bp2_2), 2); + + new SplitLocusIntervalsSmallTest("emptyList", Collections.emptyList(), 5, Collections.emptyList(), 0); + + return SplitLocusIntervalsSmallTest.getTests(SplitLocusIntervalsSmallTest.class); + } + + @Test(enabled = true, dataProvider = "SplitLocusIntervalsSmallTest") + public void splitLocusIntervalsSmallTest(SplitLocusIntervalsSmallTest test) { + List> splitByLocus = IntervalUtils.splitLocusIntervals(test.original, test.parts); + Assert.assertEquals(splitByLocus.size(), test.expectedParts, "SplitLocusIntervals failed to generate correct number of intervals"); + List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); + + // test sizes + final long originalSize = IntervalUtils.intervalSize(test.original); + final long splitSize = IntervalUtils.intervalSize(flat); + Assert.assertEquals(splitSize, originalSize, "SplitLocusIntervals locs cover an incorrect number of bases"); + + Assert.assertEquals(flat, test.expected, 
"SplitLocusIntervals locs not expected intervals"); + } + + // + // Misc. tests + // + + @Test(expectedExceptions=UserException.class) + public void testMergeListsBySetOperatorNoOverlap() { + // a couple of lists we'll use for the testing + List listEveryTwoFromOne = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 2 == 0) + listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + else + listEveryTwoFromOne.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + + List ret; + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 100); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, null); + Assert.assertEquals(ret.size(), 100); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, listEveryTwoFromOne, IntervalSetRule.INTERSECTION); + Assert.assertEquals(ret.size(), 0); + } + + @Test + public void testMergeListsBySetOperatorAllOverlap() { + // a couple of lists we'll use for the testing + List allSites = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 2 == 0) + listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + + List ret; + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 150); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, null); + Assert.assertEquals(ret.size(), 150); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); + Assert.assertEquals(ret.size(), 50); + } + + @Test + public void testMergeListsBySetOperator() { + // a couple of lists we'll use for 
the testing + List allSites = new ArrayList(); + List listEveryTwoFromTwo = new ArrayList(); + + // create the two lists we'll use + for (int x = 1; x < 101; x++) { + if (x % 5 == 0) { + listEveryTwoFromTwo.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + allSites.add(hg18GenomeLocParser.createGenomeLoc("chr1",x,x)); + } + } + + List ret; + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.UNION); + Assert.assertEquals(ret.size(), 40); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, null); + Assert.assertEquals(ret.size(), 40); + ret = IntervalUtils.mergeListsBySetOperator(listEveryTwoFromTwo, allSites, IntervalSetRule.INTERSECTION); + Assert.assertEquals(ret.size(), 20); + } + + @Test + public void testOverlappingIntervalsFromSameSourceWithIntersection() { + // a couple of lists we'll use for the testing + List source1 = new ArrayList(); + List source2 = new ArrayList(); + + source1.add(hg18GenomeLocParser.createGenomeLoc("chr1", 10, 20)); + source1.add(hg18GenomeLocParser.createGenomeLoc("chr1", 15, 25)); + + source2.add(hg18GenomeLocParser.createGenomeLoc("chr1", 16, 18)); + source2.add(hg18GenomeLocParser.createGenomeLoc("chr1", 22, 24)); + + List ret = IntervalUtils.mergeListsBySetOperator(source1, source2, IntervalSetRule.INTERSECTION); + Assert.assertEquals(ret.size(), 2); + } + + @Test + public void testGetContigLengths() { + Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); + Assert.assertEquals((long)lengths.get("chr1"), 247249719); + Assert.assertEquals((long)lengths.get("chr2"), 242951149); + Assert.assertEquals((long)lengths.get("chr3"), 199501827); + Assert.assertEquals((long)lengths.get("chr20"), 62435964); + Assert.assertEquals((long)lengths.get("chrX"), 154913754); + } + + @Test + public void testParseIntervalArguments() { + Assert.assertEquals(getLocs().size(), 45); + Assert.assertEquals(getLocs("chr1", "chr2", "chr3").size(), 3); + 
Assert.assertEquals(getLocs("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2").size(), 4); + } + + @Test + public void testIsIntervalFile() { + Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.privateTestDir + "empty_intervals.list")); + Assert.assertTrue(IntervalUtils.isIntervalFile(BaseTest.privateTestDir + "empty_intervals.list", true)); + + List extensions = Arrays.asList("bed", "interval_list", "intervals", "list", "picard"); + for (String extension: extensions) { + Assert.assertTrue(IntervalUtils.isIntervalFile("test_intervals." + extension, false), "Tested interval file extension: " + extension); + } + } + + @Test(expectedExceptions = UserException.CouldNotReadInputFile.class) + public void testMissingIntervalFile() { + IntervalUtils.isIntervalFile(BaseTest.privateTestDir + "no_such_intervals.list"); + } + + @Test + public void testFixedScatterIntervalsBasic() { + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + + List files = testFiles("basic.", 3, ".intervals"); + + List locs = getLocs("chr1", "chr2", "chr3"); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterFixedIntervalsLessFiles() { + GenomeLoc chr1 = 
hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); + + List files = testFiles("less.", 3, ".intervals"); + + List locs = getLocs("chr1", "chr2", "chr3", "chr4"); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + Assert.assertEquals(locs3.get(1), chr4); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testSplitFixedIntervalsMoreFiles() { + List files = testFiles("more.", 3, ".intervals"); + List locs = getLocs("chr1", "chr2"); + IntervalUtils.splitFixedIntervals(locs, files.size()); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testScatterFixedIntervalsMoreFiles() { + List files = testFiles("more.", 3, ".intervals"); + List locs = getLocs("chr1", "chr2"); + List> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); + } + @Test + public void testScatterFixedIntervalsStart() { + List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); + GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); + 
GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1a); + Assert.assertEquals(locs2.get(0), chr1b); + Assert.assertEquals(locs3.get(0), chr2); + Assert.assertEquals(locs3.get(1), chr3); + } + + @Test + public void testScatterFixedIntervalsMiddle() { + List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); + GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = 
IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2a); + Assert.assertEquals(locs3.get(0), chr2b); + Assert.assertEquals(locs3.get(1), chr3); + } + + @Test + public void testScatterFixedIntervalsEnd() { + List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); + GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); + GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); + + List files = testFiles("split.", 3, ".intervals"); + + List locs = getLocs(intervals); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); + + Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs1.get(1), chr2); + Assert.assertEquals(locs2.get(0), chr3a); + Assert.assertEquals(locs3.get(0), chr3b); + } + + @Test + public void testScatterFixedIntervalsFile() { + List files = testFiles("sg.", 20, ".intervals"); + List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list")); + List> splits = 
IntervalUtils.splitFixedIntervals(locs, files.size()); + + int[] counts = { + 125, 138, 287, 291, 312, 105, 155, 324, + 295, 298, 141, 121, 285, 302, 282, 88, + 116, 274, 282, 248 +// 5169, 5573, 10017, 10567, 10551, +// 5087, 4908, 10120, 10435, 10399, +// 5391, 4735, 10621, 10352, 10654, +// 5227, 5256, 10151, 9649, 9825 + }; + + //String splitCounts = ""; + for (int i = 0; i < splits.size(); i++) { + int splitCount = splits.get(i).size(); + Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); + } + //System.out.println(splitCounts.substring(2)); + + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); + + int locIndex = 0; + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file)); + Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); + for (GenomeLoc parsedLoc: parsedLocs) + Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); + } + Assert.assertEquals(locIndex, locs.size(), "Total number of GenomeLocs"); + } + + @Test + public void testScatterFixedIntervalsMax() { + List files = testFiles("sg.", 85, ".intervals"); + List> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); + IntervalUtils.scatterFixedIntervals(hg19Header, splits, files); + + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file)); + Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); + Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); + } + } + + @Test + public void testScatterContigIntervalsOrder() { + List intervals = Arrays.asList("chr2:1-1", "chr1:1-1", "chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + 
GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("split.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr2); + Assert.assertEquals(locs2.get(0), chr1); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsBasic() { + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + + List files = testFiles("contig_basic.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsLessFiles() { + GenomeLoc chr1 = 
hg18GenomeLocParser.parseGenomeLoc("chr1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3"); + GenomeLoc chr4 = hg18GenomeLocParser.parseGenomeLoc("chr4"); + + List files = testFiles("contig_less.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); + + Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs1.get(1), chr2); + Assert.assertEquals(locs2.get(0), chr3); + Assert.assertEquals(locs3.get(0), chr4); + } + + @Test(expectedExceptions=UserException.BadInput.class) + public void testScatterContigIntervalsMoreFiles() { + List files = testFiles("contig_more.", 3, ".intervals"); + IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2"), files); + } + + @Test + public void testScatterContigIntervalsStart() { + List intervals = Arrays.asList("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2"); + GenomeLoc chr1a = hg18GenomeLocParser.parseGenomeLoc("chr1:1-2"); + GenomeLoc chr1b = hg18GenomeLocParser.parseGenomeLoc("chr1:4-5"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:1-1"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("contig_split_start.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = 
IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); + + Assert.assertEquals(locs1.size(), 2); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1a); + Assert.assertEquals(locs1.get(1), chr1b); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsMiddle() { + List intervals = Arrays.asList("chr1:1-1", "chr2:1-2", "chr2:4-5", "chr3:2-2"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2a = hg18GenomeLocParser.parseGenomeLoc("chr2:1-2"); + GenomeLoc chr2b = hg18GenomeLocParser.parseGenomeLoc("chr2:4-5"); + GenomeLoc chr3 = hg18GenomeLocParser.parseGenomeLoc("chr3:2-2"); + + List files = testFiles("contig_split_middle.", 3, ".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 2); + Assert.assertEquals(locs3.size(), 1); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2a); + Assert.assertEquals(locs2.get(1), chr2b); + Assert.assertEquals(locs3.get(0), chr3); + } + + @Test + public void testScatterContigIntervalsEnd() { + List intervals = Arrays.asList("chr1:1-1", "chr2:2-2", "chr3:1-2", "chr3:4-5"); + GenomeLoc chr1 = hg18GenomeLocParser.parseGenomeLoc("chr1:1-1"); + GenomeLoc chr2 = hg18GenomeLocParser.parseGenomeLoc("chr2:2-2"); + 
GenomeLoc chr3a = hg18GenomeLocParser.parseGenomeLoc("chr3:1-2"); + GenomeLoc chr3b = hg18GenomeLocParser.parseGenomeLoc("chr3:4-5"); + + List files = testFiles("contig_split_end.", 3 ,".intervals"); + + IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); + + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); + + Assert.assertEquals(locs1.size(), 1); + Assert.assertEquals(locs2.size(), 1); + Assert.assertEquals(locs3.size(), 2); + + Assert.assertEquals(locs1.get(0), chr1); + Assert.assertEquals(locs2.get(0), chr2); + Assert.assertEquals(locs3.get(0), chr3a); + Assert.assertEquals(locs3.get(1), chr3b); + } + + @Test + public void testScatterContigIntervalsMax() { + List files = testFiles("sg.", 85, ".intervals"); + IntervalUtils.scatterContigIntervals(hg19Header, hg19ReferenceLocs, files); + + for (int i = 0; i < files.size(); i++) { + String file = files.get(i).toString(); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file)); + Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); + Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); + } + } + + private List testFiles(String prefix, int count, String suffix) { + ArrayList files = new ArrayList(); + for (int i = 1; i <= count; i++) { + files.add(createTempFile(prefix + i, suffix)); + } + return files; + } + + @DataProvider(name="unmergedIntervals") + public Object[][] getUnmergedIntervals() { + return new Object[][] { + new Object[] {"small_unmerged_picard_intervals.list"}, + new Object[] {"small_unmerged_gatk_intervals.list"} + }; + } + + @Test(dataProvider="unmergedIntervals") + public void 
testUnmergedIntervals(String unmergedIntervals) { + List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(privateTestDir + unmergedIntervals)); + Assert.assertEquals(locs.size(), 2); + + List merged; + + merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); + Assert.assertEquals(merged.size(), 1); + + // Test that null means the same as ALL + merged = IntervalUtils.mergeIntervalLocations(locs, null); + Assert.assertEquals(merged.size(), 1); + } + + /* + Split into tests that can be written to files and tested by writeFlankingIntervals, + and lists that cannot but are still handled by getFlankingIntervals. + */ + private static abstract class FlankingIntervalsTestData extends TestDataProvider { + final public File referenceFile; + final public GenomeLocParser parser; + final int basePairs; + final List original; + final List expected; + + protected FlankingIntervalsTestData(Class clazz, String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(clazz, name); + this.referenceFile = referenceFile; + this.parser = parser; + this.basePairs = basePairs; + this.original = parse(parser, original); + this.expected = parse(parser, expected); + } + + private static List parse(GenomeLocParser parser, List locs) { + List parsed = new ArrayList(); + for (String loc: locs) + parsed.add("unmapped".equals(loc) ? 
GenomeLoc.UNMAPPED : parser.parseGenomeLoc(loc)); + return parsed; + } + } + + private static class FlankingIntervalsFile extends FlankingIntervalsTestData { + public FlankingIntervalsFile(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsFile.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + private static class FlankingIntervalsList extends FlankingIntervalsTestData { + public FlankingIntervalsList(String name, File referenceFile, GenomeLocParser parser, + int basePairs, List original, List expected) { + super(FlankingIntervalsList.class, name, referenceFile, parser, basePairs, original, expected); + } + } + + /* Intervals where the original and the flanks can be written to files. */ + @DataProvider(name = "flankingIntervalsFiles") + public Object[][] getFlankingIntervalsFiles() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + int hg19Length1 = hg19GenomeLocParser.getContigInfo("1").getSequenceLength(); + + new FlankingIntervalsFile("atStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:1"), + Arrays.asList("1:2")); + + new FlankingIntervalsFile("atStartBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1"), + Arrays.asList("1:2-51")); + + new FlankingIntervalsFile("atStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:1-10"), + Arrays.asList("1:11-60")); + + new FlankingIntervalsFile("atEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + hg19Length1), + Arrays.asList("1:" + (hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndBase50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:" + hg19Length1), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 50, hg19Length1 - 1))); + + new FlankingIntervalsFile("atEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 10, 
hg19Length1)), + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 60, hg19Length1 - 11))); + + new FlankingIntervalsFile("nearStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:2"), + Arrays.asList("1:1", "1:3")); + + new FlankingIntervalsFile("nearStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:21-30"), + Arrays.asList("1:1-20", "1:31-80")); + + new FlankingIntervalsFile("nearEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 1)), + Arrays.asList("1:" + (hg19Length1 - 2), "1:" + hg19Length1)); + + new FlankingIntervalsFile("nearEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 30, hg19Length1 - 21)), + Arrays.asList( + String.format("1:%d-%d", hg19Length1 - 80, hg19Length1 - 31), + String.format("1:%d-%d", hg19Length1 - 20, hg19Length1))); + + new FlankingIntervalsFile("beyondStartBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:3"), + Arrays.asList("1:2", "1:4")); + + new FlankingIntervalsFile("beyondStartRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200"), + Arrays.asList("1:51-100", "1:201-250")); + + new FlankingIntervalsFile("beyondEndBase1", hg19ReferenceFile, hg19GenomeLocParser, 1, + Arrays.asList("1:" + (hg19Length1 - 3)), + Arrays.asList("1:" + (hg19Length1 - 4), "1:" + (hg19Length1 - 2))); + + new FlankingIntervalsFile("beyondEndRange50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList(String.format("1:%d-%d", hg19Length1 - 200, hg19Length1 - 101)), + Arrays.asList( + String.format("1:%d-%d", hg19Length1 - 250, hg19Length1 - 201), + String.format("1:%d-%d", hg19Length1 - 100, hg19Length1 - 51))); + + new FlankingIntervalsFile("betweenFar50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:401-500"), + Arrays.asList("1:51-100", "1:201-250", "1:351-400", "1:501-550")); + + new 
FlankingIntervalsFile("betweenSpan50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + new FlankingIntervalsFile("betweenOverlap50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:271-400"), + Arrays.asList("1:51-100", "1:201-270", "1:401-450")); + + new FlankingIntervalsFile("betweenShort50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:221-400"), + Arrays.asList("1:51-100", "1:201-220", "1:401-450")); + + new FlankingIntervalsFile("betweenNone50", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:121-400"), + Arrays.asList("1:51-100", "1:401-450")); + + new FlankingIntervalsFile("twoContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "2:301-400"), + Arrays.asList("1:51-100", "1:201-250", "2:251-300", "2:401-450")); + + // Explicit testing a problematic agilent target pair + new FlankingIntervalsFile("badAgilent", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("2:74756257-74756411", "2:74756487-74756628"), + // wrong! ("2:74756206-74756256", "2:74756412-74756462", "2:74756436-74756486", "2:74756629-74756679") + Arrays.asList("2:74756207-74756256", "2:74756412-74756486", "2:74756629-74756678")); + + return TestDataProvider.getTests(FlankingIntervalsFile.class); + } + + /* Intervals where either the original and/or the flanks cannot be written to a file. 
*/ + @DataProvider(name = "flankingIntervalsLists") + public Object[][] getFlankingIntervalsLists() { + File hg19ReferenceFile = new File(BaseTest.hg19Reference); + List empty = Collections.emptyList(); + + new FlankingIntervalsList("empty", hg19ReferenceFile, hg19GenomeLocParser, 50, + empty, + empty); + + new FlankingIntervalsList("unmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("unmapped"), + empty); + + new FlankingIntervalsList("fullContig", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1"), + empty); + + new FlankingIntervalsList("fullContigs", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1", "2", "3"), + empty); + + new FlankingIntervalsList("betweenWithUnmapped", hg19ReferenceFile, hg19GenomeLocParser, 50, + Arrays.asList("1:101-200", "1:301-400", "unmapped"), + Arrays.asList("1:51-100", "1:201-300", "1:401-450")); + + return TestDataProvider.getTests(FlankingIntervalsList.class); + } + + @Test(dataProvider = "flankingIntervalsFiles") + public void testWriteFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + + List actual = IntervalUtils.intervalFileToList(data.parser, flankingFile.getAbsolutePath()); + + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists", expectedExceptions = UserException.class) 
+ public void testWritingBadFlankingIntervals(FlankingIntervalsTestData data) throws Exception { + File originalFile = createTempFile("original.", ".intervals"); + File flankingFile = createTempFile("flanking.", ".intervals"); + try { + List lines = new ArrayList(); + for (GenomeLoc loc: data.original) + lines.add(loc.toString()); + FileUtils.writeLines(originalFile, lines); + + // Should throw a user exception on bad input if either the original + // intervals are empty or if the flanking intervals are empty + IntervalUtils.writeFlankingIntervals(data.referenceFile, originalFile, flankingFile, data.basePairs); + } finally { + FileUtils.deleteQuietly(originalFile); + FileUtils.deleteQuietly(flankingFile); + } + } + + @Test(dataProvider = "flankingIntervalsLists") + public void testGetFlankingIntervals(FlankingIntervalsTestData data) { + List actual = IntervalUtils.getFlankingIntervals(data.parser, data.original, data.basePairs); + String description = String.format("%n name: %s%n original: %s%n actual: %s%n expected: %s%n", + data.toString(), data.original, actual, data.expected); + Assert.assertEquals(actual, data.expected, description); + } + + @Test(expectedExceptions=UserException.BadArgumentValue.class) + public void testExceptionUponLegacyIntervalSyntax() throws Exception { + final GenomeLocParser parser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference))); + + // Attempting to use the legacy -L "interval1;interval2" syntax should produce an exception: + IntervalBinding binding = new IntervalBinding("1;2"); + binding.getIntervals(parser); + } + + @DataProvider(name="invalidIntervalTestData") + public Object[][] invalidIntervalDataProvider() throws Exception { + File fastaFile = new File(publicTestDir + "exampleFASTA.fasta"); + GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile)); + + return new Object[][] { + new Object[] {genomeLocParser, "chr1", 10000000, 20000000}, + new 
Object[] {genomeLocParser, "chr2", 1, 2}, + new Object[] {genomeLocParser, "chr1", -1, 50} + }; + } + + @Test(dataProvider="invalidIntervalTestData") + public void testInvalidPicardIntervalHandling(GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + SAMFileHeader picardFileHeader = new SAMFileHeader(); + picardFileHeader.addSequence(genomeLocParser.getContigInfo("chr1")); + IntervalList picardIntervals = new IntervalList(picardFileHeader); + picardIntervals.add(new Interval(contig, intervalStart, intervalEnd, true, "dummyname")); + + File picardIntervalFile = createTempFile("testInvalidPicardIntervalHandling", ".intervals"); + picardIntervals.write(picardIntervalFile); + + List> intervalArgs = new ArrayList>(1); + intervalArgs.add(new IntervalBinding(picardIntervalFile.getAbsolutePath())); + + IntervalUtils.loadIntervals(intervalArgs, IntervalSetRule.UNION, IntervalMergingRule.ALL, 0, genomeLocParser); + } + + @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + public void testInvalidGATKFileIntervalHandling(GenomeLocParser genomeLocParser, + String contig, int intervalStart, int intervalEnd ) throws Exception { + + File gatkIntervalFile = createTempFile("testInvalidGATKFileIntervalHandling", ".intervals", + String.format("%s:%d-%d", contig, intervalStart, intervalEnd)); + + List> intervalArgs = new ArrayList>(1); + intervalArgs.add(new IntervalBinding(gatkIntervalFile.getAbsolutePath())); + + IntervalUtils.loadIntervals(intervalArgs, IntervalSetRule.UNION, IntervalMergingRule.ALL, 0, genomeLocParser); + } + + private File createTempFile( String tempFilePrefix, String tempFileExtension, String... 
lines ) throws Exception { + File tempFile = BaseTest.createTempFile(tempFilePrefix, tempFileExtension); + FileUtils.writeLines(tempFile, Arrays.asList(lines)); + return tempFile; + } + + @DataProvider(name = "sortAndMergeIntervals") + public Object[][] getSortAndMergeIntervals() { + return new Object[][] { + new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1:3", "chr1:2"), getLocs("chr1:1", "chr1:2", "chr1:3") }, + new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1:3", "chr1:2"), getLocs("chr1:1-3") }, + new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1:3", "chr2:2"), getLocs("chr1:1", "chr1:3", "chr2:2") }, + new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1:3", "chr2:2"), getLocs("chr1:1", "chr1:3", "chr2:2") }, + new Object[] { IntervalMergingRule.OVERLAPPING_ONLY, getLocs("chr1:1", "chr1"), getLocs("chr1") }, + new Object[] { IntervalMergingRule.ALL, getLocs("chr1:1", "chr1"), getLocs("chr1") } + }; + } + + @Test(dataProvider = "sortAndMergeIntervals") + public void testSortAndMergeIntervals(IntervalMergingRule merge, List unsorted, List expected) { + List sorted = IntervalUtils.sortAndMergeIntervals(hg18GenomeLocParser, unsorted, merge).toList(); + Assert.assertEquals(sorted, expected); + } +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/io/IOUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/io/IOUtilsUnitTest.java new file mode 100644 index 000000000..46b2e949f --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/io/IOUtilsUnitTest.java @@ -0,0 +1,326 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, 
distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.io; + +import org.apache.commons.io.FileUtils; +import org.broadinstitute.gatk.utils.BaseTest; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Random; + +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +public class IOUtilsUnitTest extends BaseTest { + @Test + public void testGoodTempDir() { + IOUtils.checkTempDir(new File("/tmp/queue")); + } + + @Test(expectedExceptions=UserException.BadTmpDir.class) + public void testBadTempDir() { + IOUtils.checkTempDir(new File("/tmp")); + } + + @Test + public void testAbsoluteSubDir() { + File subDir = IOUtils.absolute(new File("."), new File("/path/to/file")); + Assert.assertEquals(subDir, new File("/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/path"), new File("/path/to/file")); + 
Assert.assertEquals(subDir, new File("/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/path"), new File(".")); + Assert.assertEquals(subDir, new File("/different/path")); + } + + @Test + public void testRelativeSubDir() throws IOException { + File subDir = IOUtils.absolute(new File("."), new File("path/to/file")); + Assert.assertEquals(subDir.getCanonicalFile(), new File("path/to/file").getCanonicalFile()); + + subDir = IOUtils.absolute(new File("/different/path"), new File("path/to/file")); + Assert.assertEquals(subDir, new File("/different/path/path/to/file")); + } + + @Test + public void testDottedSubDir() throws IOException { + File subDir = IOUtils.absolute(new File("."), new File("path/../to/file")); + Assert.assertEquals(subDir.getCanonicalFile(), new File("path/../to/./file").getCanonicalFile()); + + subDir = IOUtils.absolute(new File("."), new File("/path/../to/file")); + Assert.assertEquals(subDir, new File("/path/../to/file")); + + subDir = IOUtils.absolute(new File("/different/../path"), new File("path/to/file")); + Assert.assertEquals(subDir, new File("/different/../path/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/./path"), new File("/path/../to/file")); + Assert.assertEquals(subDir, new File("/path/../to/file")); + } + + @Test + public void testTempDir() { + File tempDir = IOUtils.tempDir("Q-Unit-Test", "", new File("queueTempDirToDelete")); + Assert.assertTrue(tempDir.exists()); + Assert.assertFalse(tempDir.isFile()); + Assert.assertTrue(tempDir.isDirectory()); + boolean deleted = IOUtils.tryDelete(tempDir); + Assert.assertTrue(deleted); + Assert.assertFalse(tempDir.exists()); + } + + @Test + public void testDirLevel() { + File dir = IOUtils.dirLevel(new File("/path/to/directory"), 1); + Assert.assertEquals(dir, new File("/path")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 2); + Assert.assertEquals(dir, new File("/path/to")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 3); + 
Assert.assertEquals(dir, new File("/path/to/directory")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 4); + Assert.assertEquals(dir, new File("/path/to/directory")); + } + + @Test + public void testAbsolute() { + File dir = IOUtils.absolute(new File("/path/./to/./directory/.")); + Assert.assertEquals(dir, new File("/path/to/directory")); + + dir = IOUtils.absolute(new File("/")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/.")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/././.")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/./directory/.")); + Assert.assertEquals(dir, new File("/directory")); + + dir = IOUtils.absolute(new File("/./directory/./")); + Assert.assertEquals(dir, new File("/directory")); + + dir = IOUtils.absolute(new File("/./directory./")); + Assert.assertEquals(dir, new File("/directory.")); + + dir = IOUtils.absolute(new File("/./.directory/")); + Assert.assertEquals(dir, new File("/.directory")); + } + + @Test + public void testTail() throws IOException { + List lines = Arrays.asList( + "chr18_random 4262 3154410390 50 51", + "chr19_random 301858 3154414752 50 51", + "chr21_random 1679693 3154722662 50 51", + "chr22_random 257318 3156435963 50 51", + "chrX_random 1719168 3156698441 50 51"); + List tail = IOUtils.tail(new File(BaseTest.hg18Reference + ".fai"), 5); + Assert.assertEquals(tail.size(), 5); + for (int i = 0; i < 5; i++) + Assert.assertEquals(tail.get(i), lines.get(i)); + } + + @Test + public void testWriteSystemFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("testProperties.properties", null), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteSystemTempFile() throws IOException { + File temp = IOUtils.writeTempResource(new Resource("testProperties.properties", null)); + try { + 
Assert.assertTrue(temp.getName().startsWith("testProperties"), "File does not start with 'testProperties.': " + temp); + Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMissingSystemFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("MissingStingText.properties", null), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteRelativeFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("/testProperties.properties", IOUtils.class), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteRelativeTempFile() throws IOException { + File temp = IOUtils.writeTempResource(new Resource("/testProperties.properties", IOUtils.class)); + try { + Assert.assertTrue(temp.getName().startsWith("testProperties"), "File does not start with 'testProperties.': " + temp); + Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMissingRelativeFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + // Looking for /org/broadinstitute/gatk/utils/file/GATKText.properties + IOUtils.writeResource(new Resource("GATKText.properties", IOUtils.class), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testResourceProperties() { + Resource resource = new Resource("foo", Resource.class); + Assert.assertEquals(resource.getPath(), "foo"); + Assert.assertEquals(resource.getRelativeClass(), Resource.class); + } + + @Test + public 
void testIsSpecialFile() { + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/null"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/full"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stdout"))); + Assert.assertTrue(IOUtils.isSpecialFile(new File("/dev/stderr"))); + Assert.assertFalse(IOUtils.isSpecialFile(null)); + Assert.assertFalse(IOUtils.isSpecialFile(new File("/home/user/my.file"))); + Assert.assertFalse(IOUtils.isSpecialFile(new File("/devfake/null"))); + } + + @DataProvider( name = "ByteArrayIOTestData") + public Object[][] byteArrayIOTestDataProvider() { + return new Object[][] { + // file size, read buffer size + { 0, 4096 }, + { 1, 4096 }, + { 2000, 4096 }, + { 4095, 4096 }, + { 4096, 4096 }, + { 4097, 4096 }, + { 6000, 4096 }, + { 8191, 4096 }, + { 8192, 4096 }, + { 8193, 4096 }, + { 10000, 4096 } + }; + } + + @Test( dataProvider = "ByteArrayIOTestData" ) + public void testWriteThenReadFileIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { + File tempFile = createTempFile(String.format("testWriteThenReadFileIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); + + byte[] dataWritten = getDeterministicRandomData(fileSize); + IOUtils.writeByteArrayToFile(dataWritten, tempFile); + byte[] dataRead = IOUtils.readFileIntoByteArray(tempFile, readBufferSize); + + Assert.assertEquals(dataRead.length, dataWritten.length); + Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); + } + + @Test( dataProvider = "ByteArrayIOTestData" ) + public void testWriteThenReadStreamIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { + File tempFile = createTempFile(String.format("testWriteThenReadStreamIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); + + byte[] dataWritten = getDeterministicRandomData(fileSize); + IOUtils.writeByteArrayToStream(dataWritten, new FileOutputStream(tempFile)); + byte[] dataRead = 
IOUtils.readStreamIntoByteArray(new FileInputStream(tempFile), readBufferSize); + + Assert.assertEquals(dataRead.length, dataWritten.length); + Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentFileIntoByteArray() { + File nonExistentFile = new File("djfhsdkjghdfk"); + Assert.assertFalse(nonExistentFile.exists()); + + IOUtils.readFileIntoByteArray(nonExistentFile); + } + + @Test( expectedExceptions = ReviewedGATKException.class ) + public void testReadNullStreamIntoByteArray() { + IOUtils.readStreamIntoByteArray(null); + } + + @Test( expectedExceptions = ReviewedGATKException.class ) + public void testReadStreamIntoByteArrayInvalidBufferSize() throws Exception { + IOUtils.readStreamIntoByteArray(new FileInputStream(createTempFile("testReadStreamIntoByteArrayInvalidBufferSize", "tmp")), + -1); + } + + @Test( expectedExceptions = UserException.CouldNotCreateOutputFile.class ) + public void testWriteByteArrayToUncreatableFile() { + IOUtils.writeByteArrayToFile(new byte[]{0}, new File("/dev/foo/bar")); + } + + @Test( expectedExceptions = ReviewedGATKException.class ) + public void testWriteNullByteArrayToFile() { + IOUtils.writeByteArrayToFile(null, createTempFile("testWriteNullByteArrayToFile", "tmp")); + } + + @Test( expectedExceptions = ReviewedGATKException.class ) + public void testWriteByteArrayToNullStream() { + IOUtils.writeByteArrayToStream(new byte[]{0}, null); + } + + private byte[] getDeterministicRandomData ( int size ) { + Utils.resetRandomGenerator(); + Random rand = Utils.getRandomGenerator(); + + byte[] randomData = new byte[size]; + rand.nextBytes(randomData); + + return randomData; + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/jna/clibrary/LibCUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/jna/clibrary/LibCUnitTest.java similarity index 100% rename from 
public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/jna/clibrary/LibCUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/jna/clibrary/LibCUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSessionQueueTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSessionQueueTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSessionQueueTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/JnaSessionQueueTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/LibDrmaaQueueTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/LibDrmaaQueueTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/LibDrmaaQueueTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/jna/drmaa/v1_0/LibDrmaaQueueTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibBatQueueTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibBatQueueTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibBatQueueTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/jna/lsf/v7_0_6/LibBatQueueTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/AlignmentStateMachineUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/AlignmentStateMachineUnitTest.java similarity index 100% rename from 
public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/AlignmentStateMachineUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/AlignmentStateMachineUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LIBS_position.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LIBS_position.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LIBS_position.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LIBS_position.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorBenchmark.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorBenchmark.java new file mode 100644 index 000000000..0d06c61c6 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorBenchmark.java @@ -0,0 +1,142 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.locusiterator; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; +import htsjdk.samtools.SAMFileHeader; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; + +import java.util.LinkedList; +import java.util.List; + +/** + * Caliper microbenchmark of fragment pileup + */ +public class LocusIteratorBenchmark extends SimpleBenchmark { + protected SAMFileHeader header; + protected GenomeLocParser genomeLocParser; + + List reads = new LinkedList(); + final int readLength = 101; + final int nReads = 10000; + final int locus = 1; + + @Param({"101M", "50M10I40M", "50M10D40M"}) + String cigar; // set automatically by framework + + @Override protected void setUp() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + + for ( int j = 0; j < nReads; j++ ) { + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + final byte[] quals = new byte[readLength]; + for ( int i = 0; i < readLength; i++ ) + quals[i] = (byte)(i % QualityUtils.MAX_SAM_QUAL_SCORE); + read.setBaseQualities(quals); + read.setCigarString(cigar); + reads.add(read); + } + } + +// public void timeOriginalLIBS(int rep) { +// for ( int i = 0; i < rep; i++ ) { +// final 
org.broadinstitute.gatk.utils.locusiterator.old.LocusIteratorByState libs = +// new org.broadinstitute.gatk.utils.locusiterator.old.LocusIteratorByState( +// new LocusIteratorByStateBaseTest.FakeCloseableIterator(reads.iterator()), +// LocusIteratorByStateBaseTest.createTestReadProperties(), +// genomeLocParser, +// LocusIteratorByState.sampleListForSAMWithoutReadGroups()); +// +// while ( libs.hasNext() ) { +// AlignmentContext context = libs.next(); +// } +// } +// } +// +// public void timeLegacyLIBS(int rep) { +// for ( int i = 0; i < rep; i++ ) { +// final org.broadinstitute.gatk.utils.locusiterator.legacy.LegacyLocusIteratorByState libs = +// new org.broadinstitute.gatk.utils.locusiterator.legacy.LegacyLocusIteratorByState( +// new LocusIteratorByStateBaseTest.FakeCloseableIterator(reads.iterator()), +// LocusIteratorByStateBaseTest.createTestReadProperties(), +// genomeLocParser, +// LocusIteratorByState.sampleListForSAMWithoutReadGroups()); +// +// while ( libs.hasNext() ) { +// AlignmentContext context = libs.next(); +// } +// } +// } + + public void timeNewLIBS(int rep) { + for ( int i = 0; i < rep; i++ ) { + final org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState libs = + new org.broadinstitute.gatk.utils.locusiterator.LocusIteratorByState( + new LocusIteratorByStateBaseTest.FakeCloseableIterator(reads.iterator()), + null, true, false, + genomeLocParser, + LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + + while ( libs.hasNext() ) { + AlignmentContext context = libs.next(); + } + } + } + +// public void timeOriginalLIBSStateMachine(int rep) { +// for ( int i = 0; i < rep; i++ ) { +// for ( final SAMRecord read : reads ) { +// final SAMRecordAlignmentState alignmentStateMachine = new SAMRecordAlignmentState(read); +// while ( alignmentStateMachine.stepForwardOnGenome() != null ) { +// alignmentStateMachine.getGenomeOffset(); +// } +// } +// } +// } + + public void timeAlignmentStateMachine(int rep) { + for ( int i = 0; i < rep; 
i++ ) { + for ( final GATKSAMRecord read : reads ) { + final AlignmentStateMachine alignmentStateMachine = new AlignmentStateMachine(read); + while ( alignmentStateMachine.stepForwardOnGenome() != null ) { + ; + } + } + } + } + + public static void main(String[] args) { + com.google.caliper.Runner.main(LocusIteratorBenchmark.class, args); + } +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateBaseTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateBaseTest.java new file mode 100644 index 000000000..073d69cde --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateBaseTest.java @@ -0,0 +1,232 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.locusiterator; + +import htsjdk.samtools.*; +import htsjdk.samtools.util.CloseableIterator; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; + +import java.util.*; + +/** + * testing of the new (non-legacy) version of LocusIteratorByState + */ +public class LocusIteratorByStateBaseTest extends BaseTest { + protected static SAMFileHeader header; + protected GenomeLocParser genomeLocParser; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + protected LocusIteratorByState makeLTBS(List reads) { + return makeLTBS(reads, null, false); + } + + protected LocusIteratorByState makeLTBS(final List reads, + final DownsamplingMethod downsamplingMethod, + final boolean keepUniqueReadList) { + return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), + downsamplingMethod, true, keepUniqueReadList, + genomeLocParser, + LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + } + + public static class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; + + public FakeCloseableIterator(Iterator it) { + iterator = it; + } + + @Override + public void close() {} + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } + } + + protected static class 
LIBSTest { + public static final int locus = 44367788; + final String cigarString; + final int readLength; + final private List elements; + + public LIBSTest(final String cigarString) { + final Cigar cigar = TextCigarCodec.getSingleton().decode(cigarString); + this.cigarString = cigarString; + this.elements = cigar.getCigarElements(); + this.readLength = cigar.getReadLength(); + } + + @Override + public String toString() { + return "LIBSTest{" + + "cigar='" + cigarString + '\'' + + ", readLength=" + readLength + + '}'; + } + + public List getElements() { + return elements; + } + + public GATKSAMRecord makeRead() { + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + final byte[] quals = new byte[readLength]; + for ( int i = 0; i < readLength; i++ ) + quals[i] = (byte)(i % QualityUtils.MAX_SAM_QUAL_SCORE); + read.setBaseQualities(quals); + read.setCigarString(cigarString); + return read; + } + } + + private boolean isIndel(final CigarElement ce) { + return ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I; + } + + private boolean startsWithDeletion(final List elements) { + for ( final CigarElement element : elements ) { + switch ( element.getOperator() ) { + case M: + case I: + case EQ: + case X: + return false; + case D: + return true; + default: + // keep looking + } + } + + return false; + } + + private LIBSTest makePermutationTest(final List elements) { + CigarElement last = null; + boolean hasMatch = false; + + // starts with D => bad + if ( startsWithDeletion(elements) ) + return null; + + // ends with D => bad + if ( elements.get(elements.size()-1).getOperator() == CigarOperator.D ) + return null; + + // make sure it's valid + String cigar = ""; + int len = 0; + for ( final CigarElement ce : elements ) { + if ( ce.getOperator() == CigarOperator.N ) + return null; // TODO -- don't support N + + // abort on a bad cigar + if ( 
last != null ) { + if ( ce.getOperator() == last.getOperator() ) + return null; + if ( isIndel(ce) && isIndel(last) ) + return null; + } + + cigar += ce.getLength() + ce.getOperator().toString(); + len += ce.getLength(); + last = ce; + hasMatch = hasMatch || ce.getOperator() == CigarOperator.M; + } + + if ( ! hasMatch && elements.size() == 1 && + ! (last.getOperator() == CigarOperator.I || last.getOperator() == CigarOperator.S)) + return null; + + return new LIBSTest(cigar); + } + + @DataProvider(name = "LIBSTest") + public Object[][] createLIBSTests(final List cigarLengths, final List combinations) { + final List tests = new LinkedList(); + + final List allOps = Arrays.asList(CigarOperator.values()); + + final List singleCigars = new LinkedList(); + for ( final int len : cigarLengths ) + for ( final CigarOperator op : allOps ) + singleCigars.add(new CigarElement(len, op)); + + for ( final int complexity : combinations ) { + for ( final List elements : Utils.makePermutations(singleCigars, complexity, true) ) { + final LIBSTest test = makePermutationTest(elements); + if ( test != null ) tests.add(new Object[]{test}); + } + } + + return tests.toArray(new Object[][]{}); + } + + /** + * Work around inadequate tests that aren't worth fixing. + * + * Look at the CIGAR 2M2P2D2P2M. Both M states border a deletion, separated by P (padding elements). So + * the right answer for deletions here is true for isBeforeDeletion() and isAfterDeletion() for the first + * and second M. But the LIBS_position doesn't say so. 
+ * + * @param elements + * @return + */ + protected static boolean hasNeighboringPaddedOps(final List elements, final int elementI) { + return (elementI - 1 >= 0 && isPadding(elements.get(elementI-1))) || + (elementI + 1 < elements.size() && isPadding(elements.get(elementI+1))); + } + + private static boolean isPadding(final CigarElement elt) { + return elt.getOperator() == CigarOperator.P || elt.getOperator() == CigarOperator.H || elt.getOperator() == CigarOperator.S; + } +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateUnitTest.java new file mode 100644 index 000000000..cbbdf3609 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/LocusIteratorByStateUnitTest.java @@ -0,0 +1,743 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.locusiterator; + +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMReadGroupRecord; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.downsampling.DownsampleType; +import org.broadinstitute.gatk.utils.downsampling.DownsamplingMethod; +import org.broadinstitute.gatk.utils.NGSPlatform; +import org.broadinstitute.gatk.utils.QualityUtils; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.pileup.PileupElement; +import org.broadinstitute.gatk.utils.pileup.ReadBackedPileup; +import org.broadinstitute.gatk.utils.sam.ArtificialBAMBuilder; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.gatk.utils.sam.GATKSAMReadGroupRecord; +import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * testing of the new (non-legacy) version of LocusIteratorByState + */ +public class LocusIteratorByStateUnitTest extends LocusIteratorByStateBaseTest { + private static final boolean DEBUG = false; + protected LocusIteratorByState li; + + @Test(enabled = !DEBUG) + public void testUnmappedAndAllIReadsPassThrough() { + final int readLength = 10; + GATKSAMRecord mapped1 = ArtificialSAMUtils.createArtificialRead(header,"mapped1",0,1,readLength); + GATKSAMRecord mapped2 = ArtificialSAMUtils.createArtificialRead(header,"mapped2",0,1,readLength); + GATKSAMRecord unmapped = ArtificialSAMUtils.createArtificialRead(header,"unmapped",0,1,readLength); + GATKSAMRecord allI = 
ArtificialSAMUtils.createArtificialRead(header,"allI",0,1,readLength); + + unmapped.setReadUnmappedFlag(true); + unmapped.setCigarString("*"); + allI.setCigarString(readLength + "I"); + + List reads = Arrays.asList(mapped1, unmapped, allI, mapped2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, DownsamplingMethod.NONE, true); + + Assert.assertTrue(li.hasNext()); + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + Assert.assertEquals(pileup.depthOfCoverage(), 2, "Should see only 2 reads in pileup, even with unmapped and all I reads"); + + final List rawReads = li.transferReadsFromAllPreviousPileups(); + Assert.assertEquals(rawReads, reads, "Input and transferred read lists should be the same, and include the unmapped and all I reads"); + } + + @Test(enabled = true && ! DEBUG) + public void testXandEQOperators() { + final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] bases2 = new byte[] {'A','A','A','C','A','A','A','A','A','C'}; + + GATKSAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10); + r1.setReadBases(bases1); + r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r1.setCigarString("10M"); + + GATKSAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10); + r2.setReadBases(bases2); + r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r2.setCigarString("3=1X5=1X"); + + GATKSAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10); + r3.setReadBases(bases2); + r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r3.setCigarString("3=1X5M1X"); + + GATKSAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10); + r4.setReadBases(bases2); + r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r4.setCigarString("10M"); + + List reads = Arrays.asList(r1, r2, r3, r4); + + // create the iterator by 
state with the fake reads and fake records + li = makeLTBS(reads); + + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + Assert.assertEquals(pileup.depthOfCoverage(), 4); + } + } + + @Test(enabled = true && ! DEBUG) + public void testIndelsInRegularPileup() { + final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] indelBases = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'}; + + GATKSAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); + before.setReadBases(bases); + before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + before.setCigarString("10M"); + + GATKSAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10); + during.setReadBases(indelBases); + during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + during.setCigarString("4M2I6M"); + + GATKSAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10); + after.setReadBases(bases); + after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + after.setCigarString("10M"); + + List reads = Arrays.asList(before, during, after); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads); + + boolean foundIndel = false; + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10); + for (PileupElement p : pileup) { + if (p.isBeforeInsertion()) { + foundIndel = true; + Assert.assertEquals(p.getLengthOfImmediatelyFollowingIndel(), 2, "Wrong event length"); + Assert.assertEquals(p.getBasesOfImmediatelyFollowingInsertion(), "CT", "Inserted bases are incorrect"); + break; + } + } + + } + + Assert.assertTrue(foundIndel,"Indel in pileup not found"); + } + + @Test(enabled = false && ! 
DEBUG) + public void testWholeIndelReadInIsolation() { + final int firstLocus = 44367789; + + GATKSAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76)); + indelOnlyRead.setCigarString("76I"); + + List reads = Arrays.asList(indelOnlyRead); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads); + + // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read + // and considers it to be an indel-containing read. + Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled"); + AlignmentContext alignmentContext = li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location."); + ReadBackedPileup basePileup = alignmentContext.getBasePileup(); + Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); + Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) do + * not negatively influence the ordering of the pileup. + */ + @Test(enabled = true && ! 
DEBUG) + public void testWholeIndelRead() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + GATKSAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76); + leadingRead.setReadBases(Utils.dupBytes((byte)'A',76)); + leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + leadingRead.setCigarString("1M75I"); + + GATKSAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + indelOnlyRead.setCigarString("76I"); + + GATKSAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76); + fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76)); + fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76)); + fullMatchAfterIndel.setCigarString("75I1M"); + + List reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, null, false); + int currentLocus = firstLocus; + int numAlignmentContextsFound = 0; + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect"); + + if(currentLocus == firstLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus); + } + else if(currentLocus == secondLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus); + 
Assert.assertSame(readsAtLocus.get(0),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus); + } + + currentLocus++; + numAlignmentContextsFound++; + } + + Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly + */ + @Test(enabled = false && ! DEBUG) + public void testWholeIndelReadRepresentedTest() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1); + read1.setReadBases(Utils.dupBytes((byte) 'A', 1)); + read1.setBaseQualities(Utils.dupBytes((byte) '@', 1)); + read1.setCigarString("1I"); + + List reads = Arrays.asList(read1); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, null, false); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + // TODO -- fix tests +// PileupElement pe = p.iterator().next(); +// Assert.assertTrue(pe.isBeforeInsertion()); +// Assert.assertFalse(pe.isAfterInsertion()); +// Assert.assertEquals(pe.getBasesOfImmediatelyFollowingInsertion(), "A"); + } + + GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10); + read2.setReadBases(Utils.dupBytes((byte) 'A', 10)); + read2.setBaseQualities(Utils.dupBytes((byte) '@', 10)); + read2.setCigarString("10I"); + + reads = Arrays.asList(read2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, null, false); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + // TODO -- fix tests +// PileupElement pe = 
p.iterator().next(); +// Assert.assertTrue(pe.isBeforeInsertion()); +// Assert.assertFalse(pe.isAfterInsertion()); +// Assert.assertEquals(pe.getBasesOfImmediatelyFollowingInsertion(), "AAAAAAAAAA"); + } + } + + + ///////////////////////////////////////////// + // get event length and bases calculations // + ///////////////////////////////////////////// + + @DataProvider(name = "IndelLengthAndBasesTest") + public Object[][] makeIndelLengthAndBasesTest() { + final String EVENT_BASES = "ACGTACGTACGT"; + final List tests = new LinkedList(); + + for ( int eventSize = 1; eventSize < 10; eventSize++ ) { + for ( final CigarOperator indel : Arrays.asList(CigarOperator.D, CigarOperator.I) ) { + final String cigar = String.format("2M%d%s1M", eventSize, indel.toString()); + final String eventBases = indel == CigarOperator.D ? "" : EVENT_BASES.substring(0, eventSize); + final int readLength = 3 + eventBases.length(); + + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, readLength); + read.setReadBases(("TT" + eventBases + "A").getBytes()); + final byte[] quals = new byte[readLength]; + for ( int i = 0; i < readLength; i++ ) + quals[i] = (byte)(i % QualityUtils.MAX_SAM_QUAL_SCORE); + read.setBaseQualities(quals); + read.setCigarString(cigar); + + tests.add(new Object[]{read, indel, eventSize, eventBases.equals("") ? null : eventBases}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true && ! 
DEBUG, dataProvider = "IndelLengthAndBasesTest") + public void testIndelLengthAndBasesTest(GATKSAMRecord read, final CigarOperator op, final int eventSize, final String eventBases) { + // create the iterator by state with the fake reads and fake records + li = makeLTBS(Arrays.asList((GATKSAMRecord)read), null, false); + + Assert.assertTrue(li.hasNext()); + + final PileupElement firstMatch = getFirstPileupElement(li.next()); + + Assert.assertEquals(firstMatch.getLengthOfImmediatelyFollowingIndel(), 0, "Length != 0 for site not adjacent to indel"); + Assert.assertEquals(firstMatch.getBasesOfImmediatelyFollowingInsertion(), null, "Getbases of following event should be null at non-adajenct event"); + + Assert.assertTrue(li.hasNext()); + + final PileupElement pe = getFirstPileupElement(li.next()); + + if ( op == CigarOperator.D ) + Assert.assertTrue(pe.isBeforeDeletionStart()); + else + Assert.assertTrue(pe.isBeforeInsertion()); + + Assert.assertEquals(pe.getLengthOfImmediatelyFollowingIndel(), eventSize, "Length of event failed"); + Assert.assertEquals(pe.getBasesOfImmediatelyFollowingInsertion(), eventBases, "Getbases of following event failed"); + } + + private PileupElement getFirstPileupElement(final AlignmentContext context) { + final ReadBackedPileup p = context.getBasePileup(); + Assert.assertEquals(p.getNumberOfElements(), 1); + return p.iterator().next(); + } + + //////////////////////////////////////////// + // comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////// + + @DataProvider(name = "MyLIBSTest") + public Object[][] makeLIBSTest() { + final List tests = new LinkedList(); + +// tests.add(new Object[]{new LIBSTest("2=2D2=2X", 1)}); +// return tests.toArray(new Object[][]{}); + + return createLIBSTests( + Arrays.asList(1, 2), + Arrays.asList(1, 2, 3, 4)); + +// return createLIBSTests( +// Arrays.asList(2), +// Arrays.asList(3)); + } + + @Test(enabled = ! 
DEBUG, dataProvider = "MyLIBSTest") + public void testLIBS(LIBSTest params) { + // create the iterator by state with the fake reads and fake records + final GATKSAMRecord read = params.makeRead(); + li = makeLTBS(Arrays.asList((GATKSAMRecord)read), null, false); + final LIBS_position tester = new LIBS_position(read); + + int bpVisited = 0; + int lastOffset = 0; + while ( li.hasNext() ) { + bpVisited++; + + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertEquals(p.getNumberOfElements(), 1); + PileupElement pe = p.iterator().next(); + + Assert.assertEquals(p.getNumberOfDeletions(), pe.isDeletion() ? 1 : 0, "wrong number of deletions in the pileup"); + Assert.assertEquals(p.getNumberOfMappingQualityZeroReads(), pe.getRead().getMappingQuality() == 0 ? 1 : 0, "wront number of mapq reads in the pileup"); + + tester.stepForwardOnGenome(); + + if ( ! hasNeighboringPaddedOps(params.getElements(), pe.getCurrentCigarOffset()) ) { + Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart, "before deletion start failure"); + Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd, "after deletion end failure"); + } + + Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion, "before insertion failure"); + Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion, "after insertion failure"); + Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip, "next to soft clip failure"); + + Assert.assertTrue(pe.getOffset() >= lastOffset, "Somehow read offsets are decreasing: lastOffset " + lastOffset + " current " + pe.getOffset()); + Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset(), "Read offsets are wrong at " + bpVisited); + + Assert.assertEquals(pe.getCurrentCigarElement(), read.getCigar().getCigarElement(tester.currentOperatorIndex), "CigarElement index failure"); + Assert.assertEquals(pe.getOffsetInCurrentCigar(), 
tester.getCurrentPositionOnOperatorBase0(), "CigarElement index failure"); + + Assert.assertEquals(read.getCigar().getCigarElement(pe.getCurrentCigarOffset()), pe.getCurrentCigarElement(), "Current cigar element isn't what we'd get from the read itself"); + + Assert.assertTrue(pe.getOffsetInCurrentCigar() >= 0, "Offset into current cigar too small"); + Assert.assertTrue(pe.getOffsetInCurrentCigar() < pe.getCurrentCigarElement().getLength(), "Offset into current cigar too big"); + + Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset(), "Read offset failure"); + lastOffset = pe.getOffset(); + } + + final int expectedBpToVisit = read.getAlignmentEnd() - read.getAlignmentStart() + 1; + Assert.assertEquals(bpVisited, expectedBpToVisit, "Didn't visit the expected number of bp"); + } + + // ------------------------------------------------------------ + // + // Tests for keeping reads + // + // ------------------------------------------------------------ + + @DataProvider(name = "LIBS_ComplexPileupTests") + public Object[][] makeLIBS_ComplexPileupTests() { + final List tests = new LinkedList(); + + for ( final int downsampleTo : Arrays.asList(-1, 1, 2, 5, 10, 30)) { + for ( final int nReadsPerLocus : Arrays.asList(1, 10, 60) ) { + for ( final int nLoci : Arrays.asList(1, 10, 25) ) { + for ( final int nSamples : Arrays.asList(1, 2, 10) ) { + for ( final boolean keepReads : Arrays.asList(true, false) ) { + for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true, false) ) { +// for ( final int downsampleTo : Arrays.asList(1)) { +// for ( final int nReadsPerLocus : Arrays.asList(1) ) { +// for ( final int nLoci : Arrays.asList(1) ) { +// for ( final int nSamples : Arrays.asList(1) ) { +// for ( final boolean keepReads : Arrays.asList(true) ) { +// for ( final boolean grabReadsAfterEachCycle : Arrays.asList(true) ) { + tests.add(new Object[]{nReadsPerLocus, nLoci, nSamples, + keepReads, grabReadsAfterEachCycle, + downsampleTo}); + } + } + } + } + } + } + + 
return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "LIBS_ComplexPileupTests") + public void testLIBS_ComplexPileupTests(final int nReadsPerLocus, + final int nLoci, + final int nSamples, + final boolean keepReads, + final boolean grabReadsAfterEachCycle, + final int downsampleTo) { + //logger.warn(String.format("testLIBSKeepSubmittedReads %d %d %d %b %b %b", nReadsPerLocus, nLoci, nSamples, keepReads, grabReadsAfterEachCycle, downsample)); + final int readLength = 10; + + final boolean downsample = downsampleTo != -1; + final DownsamplingMethod downsampler = downsample + ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null) + : new DownsamplingMethod(DownsampleType.NONE, null, null); + + final ArtificialBAMBuilder bamBuilder = new ArtificialBAMBuilder(header.getSequenceDictionary(), nReadsPerLocus, nLoci); + bamBuilder.createAndSetHeader(nSamples).setReadLength(readLength).setAlignmentStart(1); + + final List reads = bamBuilder.makeReads(); + li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), + downsampler, true, keepReads, + genomeLocParser, + bamBuilder.getSamples()); + + final Set seenSoFar = new HashSet(); + final Set keptReads = new HashSet(); + int bpVisited = 0; + while ( li.hasNext() ) { + bpVisited++; + final AlignmentContext alignmentContext = li.next(); + final ReadBackedPileup p = alignmentContext.getBasePileup(); + + AssertWellOrderedPileup(p); + + if ( downsample ) { + // just not a safe test + //Assert.assertTrue(p.getNumberOfElements() <= maxDownsampledCoverage * nSamples, "Too many reads at locus after downsampling"); + } else { + final int minPileupSize = nReadsPerLocus * nSamples; + Assert.assertTrue(p.getNumberOfElements() >= minPileupSize); + } + + // the number of reads starting here + int nReadsStartingHere = 0; + for ( final GATKSAMRecord read : p.getReads() ) + if ( read.getAlignmentStart() == alignmentContext.getPosition() ) + nReadsStartingHere++; + + // 
we can have no more than maxDownsampledCoverage per sample + final int maxCoveragePerLocus = downsample ? downsampleTo : nReadsPerLocus; + Assert.assertTrue(nReadsStartingHere <= maxCoveragePerLocus * nSamples); + + seenSoFar.addAll(p.getReads()); + if ( keepReads && grabReadsAfterEachCycle ) { + final List locusReads = li.transferReadsFromAllPreviousPileups(); + + + if ( downsample ) { + // with downsampling we might have some reads here that were downsampled away + // in the pileup. We want to ensure that no more than the max coverage per sample is added + Assert.assertTrue(locusReads.size() >= nReadsStartingHere); + Assert.assertTrue(locusReads.size() <= maxCoveragePerLocus * nSamples); + } else { + Assert.assertEquals(locusReads.size(), nReadsStartingHere); + } + keptReads.addAll(locusReads); + + // check that all reads we've seen so far are in our keptReads + for ( final GATKSAMRecord read : seenSoFar ) { + Assert.assertTrue(keptReads.contains(read), "A read that appeared in a pileup wasn't found in the kept reads: " + read); + } + } + + if ( ! keepReads ) + Assert.assertTrue(li.getReadsFromAllPreviousPileups().isEmpty(), "Not keeping reads but the underlying list of reads isn't empty"); + } + + if ( keepReads && ! grabReadsAfterEachCycle ) + keptReads.addAll(li.transferReadsFromAllPreviousPileups()); + + if ( ! downsample ) { // downsampling may drop loci + final int expectedBpToVisit = nLoci + readLength - 1; + Assert.assertEquals(bpVisited, expectedBpToVisit, "Didn't visit the expected number of bp"); + } + + if ( keepReads ) { + // check we have the right number of reads + final int totalReads = nLoci * nReadsPerLocus * nSamples; + if ( ! 
downsample ) { // downsampling may drop reads + Assert.assertEquals(keptReads.size(), totalReads, "LIBS didn't keep the right number of reads during the traversal"); + + // check that the order of reads is the same as in our read list + for ( int i = 0; i < reads.size(); i++ ) { + final GATKSAMRecord inputRead = reads.get(i); + final GATKSAMRecord keptRead = reads.get(i); + Assert.assertSame(keptRead, inputRead, "Input reads and kept reads differ at position " + i); + } + } else { + Assert.assertTrue(keptReads.size() <= totalReads, "LIBS didn't keep the right number of reads during the traversal"); + } + + // check uniqueness + final Set readNames = new HashSet(); + for ( final GATKSAMRecord read : keptReads ) { + Assert.assertFalse(readNames.contains(read.getReadName()), "Found duplicate reads in the kept reads"); + readNames.add(read.getReadName()); + } + + // check that all reads we've seen are in our keptReads + for ( final GATKSAMRecord read : seenSoFar ) { + Assert.assertTrue(keptReads.contains(read), "A read that appeared in a pileup wasn't found in the kept reads: " + read); + } + + if ( ! downsample ) { + // check that every read in the list of keep reads occurred at least once in one of the pileups + for ( final GATKSAMRecord keptRead : keptReads ) { + Assert.assertTrue(seenSoFar.contains(keptRead), "There's a read " + keptRead + " in our keptReads list that never appeared in any pileup"); + } + } + } + } + + private void AssertWellOrderedPileup(final ReadBackedPileup pileup) { + if ( ! 
pileup.isEmpty() ) { + int leftMostPos = -1; + + for ( final PileupElement pe : pileup ) { + Assert.assertTrue(pileup.getLocation().getContig().equals(pe.getRead().getReferenceName()), "ReadBackedPileup contains an element " + pe + " that's on a different contig than the pileup itself"); + Assert.assertTrue(pe.getRead().getAlignmentStart() >= leftMostPos, + "ReadBackedPileup contains an element " + pe + " whose read's alignment start " + pe.getRead().getAlignmentStart() + + " occurs before the leftmost position we've seen previously " + leftMostPos); + } + } + } + + // --------------------------------------------------------------------------- + // make sure that downsampling isn't holding onto a bazillion reads + // + @DataProvider(name = "LIBS_NotHoldingTooManyReads") + public Object[][] makeLIBS_NotHoldingTooManyReads() { + final List tests = new LinkedList(); + + for ( final int downsampleTo : Arrays.asList(1, 10)) { + for ( final int nReadsPerLocus : Arrays.asList(100, 1000, 10000, 100000) ) { + for ( final int payloadInBytes : Arrays.asList(0, 1024, 1024*1024) ) { + tests.add(new Object[]{nReadsPerLocus, downsampleTo, payloadInBytes}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true && ! 
DEBUG, dataProvider = "LIBS_NotHoldingTooManyReads") +// @Test(enabled = true, dataProvider = "LIBS_NotHoldingTooManyReads", timeOut = 100000) + public void testLIBS_NotHoldingTooManyReads(final int nReadsPerLocus, final int downsampleTo, final int payloadInBytes) { + logger.warn(String.format("testLIBS_NotHoldingTooManyReads %d %d %d", nReadsPerLocus, downsampleTo, payloadInBytes)); + final int readLength = 10; + + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100000); + final int nSamples = 1; + final List samples = new ArrayList(nSamples); + for ( int i = 0; i < nSamples; i++ ) { + final GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord("rg" + i); + final String sample = "sample" + i; + samples.add(sample); + rg.setSample(sample); + rg.setPlatform(NGSPlatform.ILLUMINA.getDefaultPlatform()); + header.addReadGroup(rg); + } + + final boolean downsample = downsampleTo != -1; + final DownsamplingMethod downsampler = downsample + ? new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsampleTo, null) + : new DownsamplingMethod(DownsampleType.NONE, null, null); + + // final List reads = ArtificialSAMUtils.createReadStream(nReadsPerLocus, nLoci, header, 1, readLength); + + final WeakReadTrackingIterator iterator = new WeakReadTrackingIterator(nReadsPerLocus, readLength, payloadInBytes, header); + + li = new LocusIteratorByState(iterator, + downsampler, true, false, + genomeLocParser, + samples); + + while ( li.hasNext() ) { + final AlignmentContext next = li.next(); + Assert.assertTrue(next.getBasePileup().getNumberOfElements() <= downsampleTo, "Too many elements in pileup " + next); + // TODO -- assert that there are <= X reads in memory after GC for some X + } + } + + private static class WeakReadTrackingIterator implements Iterator { + final int nReads, readLength, payloadInBytes; + int readI = 0; + final SAMFileHeader header; + + private WeakReadTrackingIterator(int nReads, int readLength, final int payloadInBytes, final 
SAMFileHeader header) { + this.nReads = nReads; + this.readLength = readLength; + this.header = header; + this.payloadInBytes = payloadInBytes; + } + + @Override public boolean hasNext() { return readI < nReads; } + @Override public void remove() { throw new UnsupportedOperationException("no remove"); } + + @Override + public GATKSAMRecord next() { + readI++; + return makeRead(); + } + + private GATKSAMRecord makeRead() { + final SAMReadGroupRecord rg = header.getReadGroups().get(0); + final String readName = String.format("%s.%d.%s", "read", readI, rg.getId()); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, readName, 0, 1, readLength); + read.setReadGroup(new GATKSAMReadGroupRecord(rg)); + if ( payloadInBytes > 0 ) + // add a payload byte array to push memory use per read even higher + read.setAttribute("PL", new byte[payloadInBytes]); + return read; + } + } + + // --------------------------------------------------------------------------- + // + // make sure that adapter clipping is working properly in LIBS + // + // --------------------------------------------------------------------------- + @DataProvider(name = "AdapterClippingTest") + public Object[][] makeAdapterClippingTest() { + final List tests = new LinkedList(); + + final int start = 10; + for ( final int goodBases : Arrays.asList(10, 20, 30) ) { + for ( final int nClips : Arrays.asList(0, 1, 2, 10)) { + for ( final boolean onLeft : Arrays.asList(true, false) ) { + final int readLength = nClips + goodBases; + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1" , 0, start, readLength); + read.setProperPairFlag(true); + read.setReadPairedFlag(true); + read.setReadUnmappedFlag(false); + read.setMateUnmappedFlag(false); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte) '@', readLength)); + read.setCigarString(readLength + "M"); + + if ( onLeft ) { + read.setReadNegativeStrandFlag(true); + 
read.setMateNegativeStrandFlag(false); + read.setMateAlignmentStart(start + nClips); + read.setInferredInsertSize(readLength); + tests.add(new Object[]{nClips, goodBases, 0, read}); + } else { + read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); + read.setMateAlignmentStart(start - 1); + read.setInferredInsertSize(goodBases - 1); + tests.add(new Object[]{0, goodBases, nClips, read}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "AdapterClippingTest") + public void testAdapterClipping(final int nClipsOnLeft, final int nReadContainingPileups, final int nClipsOnRight, final GATKSAMRecord read) { + + li = new LocusIteratorByState(new FakeCloseableIterator<>(Collections.singletonList(read).iterator()), + DownsamplingMethod.NONE, true, false, + genomeLocParser, + LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + + int expectedPos = read.getAlignmentStart() + nClipsOnLeft; + int nPileups = 0; + while ( li.hasNext() ) { + final AlignmentContext next = li.next(); + Assert.assertEquals(next.getLocation().getStart(), expectedPos); + nPileups++; + expectedPos++; + } + + final int nExpectedPileups = nReadContainingPileups; + Assert.assertEquals(nPileups, nExpectedPileups, "Wrong number of pileups seen"); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/PerSampleReadStateManagerUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/PerSampleReadStateManagerUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/locusiterator/PerSampleReadStateManagerUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/locusiterator/PerSampleReadStateManagerUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/InputProducerUnitTest.java 
b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/InputProducerUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/InputProducerUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/InputProducerUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResultUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResultUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResultUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/MapResultUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoSchedulerUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoSchedulerUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/NanoSchedulerUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/ReducerUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/ReducerUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/ReducerUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/nanoScheduler/ReducerUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/pileup/PileupElementUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/pileup/PileupElementUnitTest.java 
similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/pileup/PileupElementUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/pileup/PileupElementUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileupUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileupUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileupUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/pileup/ReadBackedPileupUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemonUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemonUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemonUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDaemonUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDataUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDataUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDataUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/progressmeter/ProgressMeterDataUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/recalibration/EventTypeUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/recalibration/EventTypeUnitTest.java similarity index 100% rename from 
public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/recalibration/EventTypeUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/recalibration/EventTypeUnitTest.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/RefMetaDataTrackerUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/RefMetaDataTrackerUnitTest.java new file mode 100644 index 000000000..62a704217 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/RefMetaDataTrackerUnitTest.java @@ -0,0 +1,290 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.refdata; + +import htsjdk.samtools.SAMFileHeader; +import org.apache.log4j.Logger; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.RodBinding; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.codecs.table.TableFeature; +import org.broadinstitute.gatk.utils.refdata.utils.GATKFeature; +import org.broadinstitute.gatk.utils.refdata.utils.RODRecordList; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import org.testng.Assert; +import org.testng.annotations.*; +import java.util.*; +import java.util.List; + +public class RefMetaDataTrackerUnitTest { + final protected static Logger logger = Logger.getLogger(RefMetaDataTrackerUnitTest.class); + private static SAMFileHeader header; + private ReferenceContext context; + private GenomeLocParser genomeLocParser; + private GenomeLoc locus; + private final static int START_POS = 10; + Allele A,C,G,T; + VariantContext AC_SNP, AG_SNP, AT_SNP; + TableFeature span10_10, span1_20, span10_20; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 100); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + locus = genomeLocParser.createGenomeLoc("chr1", START_POS, START_POS); + context = new ReferenceContext(genomeLocParser, locus, (byte)'A'); + A = Allele.create("A", true); + C = Allele.create("C"); + G = Allele.create("G"); + T = Allele.create("T"); + AC_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, C)).make(); + 
AG_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, G)).make(); + AT_SNP = new VariantContextBuilder("x", "chr1", START_POS, START_POS, Arrays.asList(A, T)).make(); + span10_10 = makeSpan(10, 10); + span1_20 = makeSpan(1, 20); + span10_20 = makeSpan(10, 20); + } + + @BeforeMethod + public void reset() { + RodBinding.resetNameCounter(); + } + + private class MyTest extends BaseTest.TestDataProvider { + public RODRecordList AValues, BValues; + + private MyTest(Class c, final List AValues, final List BValues) { + super(c); + this.AValues = AValues == null ? null : makeRODRecord("A", AValues); + this.BValues = BValues == null ? null : makeRODRecord("B", BValues); + } + + private MyTest(final List AValues, final List BValues) { + super(MyTest.class); + this.AValues = AValues == null ? null : makeRODRecord("A", AValues); + this.BValues = BValues == null ? null : makeRODRecord("B", BValues); + } + + @Override + public String toString() { + return String.format("A=%s, B=%s", AValues, BValues); + } + + private final RODRecordList makeRODRecord(String name, List features) { + List x = new ArrayList(); + for ( Feature f : features ) + x.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, f, name)); + return new RODRecordListImpl(name, x, locus); + } + + public List expected(String name) { + if ( name.equals("A+B") ) return allValues(); + if ( name.equals("A") ) return expectedAValues(); + if ( name.equals("B") ) return expectedBValues(); + throw new RuntimeException("FAIL"); + } + + public List allValues() { + List x = new ArrayList(); + x.addAll(expectedAValues()); + x.addAll(expectedBValues()); + return x; + } + + public List expectedAValues() { + return AValues == null ? Collections.emptyList() : AValues; + } + + public List expectedBValues() { + return BValues == null ? 
Collections.emptyList() : BValues; + } + + public RefMetaDataTracker makeTracker() { + List x = new ArrayList(); + if ( AValues != null ) x.add(AValues); + if ( BValues != null ) x.add(BValues); + return new RefMetaDataTracker(x); + } + + public int nBoundTracks() { + int n = 0; + if ( AValues != null ) n++; + if ( BValues != null ) n++; + return n; + } + } + + private final TableFeature makeSpan(int start, int stop) { + return new TableFeature(genomeLocParser.createGenomeLoc("chr1", start, stop), + Collections.emptyList(), Collections.emptyList()); + } + + @DataProvider(name = "tests") + public Object[][] createTests() { + new MyTest(null, null); + new MyTest(Arrays.asList(AC_SNP), null); + new MyTest(Arrays.asList(AC_SNP, AT_SNP), null); + new MyTest(Arrays.asList(AC_SNP), Arrays.asList(AG_SNP)); + new MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(AG_SNP)); + new MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(span10_10)); + new MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(span10_10, span10_20)); + new MyTest(Arrays.asList(AC_SNP, AT_SNP), Arrays.asList(span10_10, span10_20, span1_20)); + + // for requires starts + new MyTest(Arrays.asList(span1_20), null); + new MyTest(Arrays.asList(span10_10, span10_20), null); + new MyTest(Arrays.asList(span10_10, span10_20, span1_20), null); + + return MyTest.getTests(MyTest.class); + } + + @Test(enabled = true, dataProvider = "tests") + public void testRawBindings(MyTest test) { + logger.warn("Testing " + test + " for number of bound tracks"); + RefMetaDataTracker tracker = test.makeTracker(); + Assert.assertEquals(tracker.getNTracksWithBoundFeatures(), test.nBoundTracks()); + + testSimpleBindings("A", tracker, test.AValues); + testSimpleBindings("B", tracker, test.BValues); + } + + private void testSimpleBindings(String name, RefMetaDataTracker tracker, RODRecordList expected) { + List asValues = tracker.getValues(Feature.class, name); + + Assert.assertEquals(tracker.hasValues(name), expected != null); + 
Assert.assertEquals(asValues.size(), expected == null ? 0 : expected.size()); + + if ( expected != null ) { + for ( GATKFeature e : expected ) { + boolean foundValue = false; + for ( Feature f : asValues ) { + if ( e.getUnderlyingObject() == f ) foundValue = true; + } + Assert.assertTrue(foundValue, "Never found expected value of " + e.getUnderlyingObject() + " bound to " + name + " in " + tracker); + } + } + } + + @Test(enabled = true, dataProvider = "tests") + public void testGettersAsString(MyTest test) { + logger.warn("Testing " + test + " for get() methods"); + RefMetaDataTracker tracker = test.makeTracker(); + + for ( String name : Arrays.asList("A+B", "A", "B") ) { + List v1 = name.equals("A+B") ? tracker.getValues(Feature.class) : tracker.getValues(Feature.class, name); + testGetter(name, v1, test.expected(name), true, tracker); + + List v2 = name.equals("A+B") ? tracker.getValues(Feature.class, locus) : tracker.getValues(Feature.class, name, locus); + testGetter(name, v2, startingHere(test.expected(name)), true, tracker); + + Feature v3 = name.equals("A+B") ? tracker.getFirstValue(Feature.class) : tracker.getFirstValue(Feature.class, name); + testGetter(name, Arrays.asList(v3), test.expected(name), false, tracker); + + Feature v4 = name.equals("A+B") ? 
tracker.getFirstValue(Feature.class, locus) : tracker.getFirstValue(Feature.class, name, locus); + testGetter(name, Arrays.asList(v4), startingHere(test.expected(name)), false, tracker); + } + } + + @Test(enabled = true, dataProvider = "tests") + public void testGettersAsRodBindings(MyTest test) { + logger.warn("Testing " + test + " for get() methods as RodBindings"); + RefMetaDataTracker tracker = test.makeTracker(); + + for ( String nameAsString : Arrays.asList("A", "B") ) { + RodBinding binding = new RodBinding(Feature.class, nameAsString, "none", "vcf", new Tags()); + List v1 = tracker.getValues(binding); + testGetter(nameAsString, v1, test.expected(nameAsString), true, tracker); + + List v2 = tracker.getValues(binding, locus); + testGetter(nameAsString, v2, startingHere(test.expected(nameAsString)), true, tracker); + + Feature v3 = tracker.getFirstValue(binding); + testGetter(nameAsString, Arrays.asList(v3), test.expected(nameAsString), false, tracker); + + Feature v4 = tracker.getFirstValue(binding, locus); + testGetter(nameAsString, Arrays.asList(v4), startingHere(test.expected(nameAsString)), false, tracker); + } + } + + @Test(enabled = true, dataProvider = "tests") + public void testGettersAsListOfRodBindings(MyTest test) { + logger.warn("Testing " + test + " for get() methods for List"); + RefMetaDataTracker tracker = test.makeTracker(); + + String nameAsString = "A+B"; + RodBinding A = new RodBinding(Feature.class, "A", "none", "vcf", new Tags()); + RodBinding B = new RodBinding(Feature.class, "B", "none", "vcf", new Tags()); + List> binding = Arrays.asList(A, B); + + List v1 = tracker.getValues(binding); + testGetter(nameAsString, v1, test.expected(nameAsString), true, tracker); + + List v2 = tracker.getValues(binding, locus); + testGetter(nameAsString, v2, startingHere(test.expected(nameAsString)), true, tracker); + + Feature v3 = tracker.getFirstValue(binding); + testGetter(nameAsString, Arrays.asList(v3), test.expected(nameAsString), false, tracker); 
+ + Feature v4 = tracker.getFirstValue(binding, locus); + testGetter(nameAsString, Arrays.asList(v4), startingHere(test.expected(nameAsString)), false, tracker); + } + + private List startingHere(List l) { + List x = new ArrayList(); + for ( GATKFeature f : l ) if ( f.getStart() == locus.getStart() ) x.add(f); + return x; + } + + private void testGetter(String name, List got, List expected, boolean requireExact, RefMetaDataTracker tracker) { + if ( got.size() == 1 && got.get(0) == null ) + got = Collections.emptyList(); + + if ( requireExact ) + Assert.assertEquals(got.size(), expected.size()); + + boolean foundAny = false; + for ( GATKFeature e : expected ) { + boolean found1 = false; + for ( Feature got1 : got ) { + if ( e.getUnderlyingObject() == got1 ) + found1 = true; + } + if ( requireExact ) + Assert.assertTrue(found1, "Never found expected GATKFeature " + e + " bound to " + name + " in " + tracker); + foundAny = found1 || foundAny; + } + + if ( ! requireExact && ! expected.isEmpty() ) + Assert.assertTrue(foundAny, "Never found any got values matching one of the expected values bound to " + name + " in " + tracker); + } +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManagerUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManagerUnitTest.java new file mode 100644 index 000000000..b0805e161 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/FeatureManagerUnitTest.java @@ -0,0 +1,163 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* 
Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata.tracks; + + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.tribble.Feature; +import htsjdk.tribble.FeatureCodec; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.codecs.table.BedTableCodec; +import org.broadinstitute.gatk.utils.codecs.table.TableFeature; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import htsjdk.variant.vcf.VCF3Codec; +import htsjdk.variant.vcf.VCFCodec; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import htsjdk.variant.variantcontext.VariantContext; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.*; +import java.util.*; + + +/** + * @author depristo + * + * UnitTests for RMD FeatureManager + */ +public class FeatureManagerUnitTest extends BaseTest { + private static final File RANDOM_FILE = new File(publicTestDir+ "exampleGATKReport.eval"); + private static final File VCF3_FILE = new File(privateTestDir + "vcf3.vcf"); + private static final File VCF4_FILE = new File(privateTestDir + "HiSeq.10000.vcf"); + private static 
final File VCF4_FILE_GZ = new File(privateTestDir + "HiSeq.10000.vcf.gz"); + private static final File VCF4_FILE_BGZIP = new File(privateTestDir + "HiSeq.10000.bgzip.vcf.gz"); + + private FeatureManager manager; + private GenomeLocParser genomeLocParser; + + @BeforeMethod + public void setup() { + File referenceFile = new File(b36KGReference); + try { + IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); + genomeLocParser = new GenomeLocParser(seq); + manager = new FeatureManager(); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(referenceFile,ex); + } + } + + @Test + public void testManagerCreation() { + Assert.assertTrue(manager.getFeatureDescriptors().size() > 0); + } + + private class FMTest extends BaseTest.TestDataProvider { + public Class codec; + public Class feature; + public String name; + public File associatedFile; + + private FMTest(final Class feature, final Class codec, final String name, final File file) { + super(FMTest.class); + this.codec = codec; + this.feature = feature; + this.name = name; + this.associatedFile = file; + } + + public void assertExpected(FeatureManager.FeatureDescriptor featureDescriptor) { + Assert.assertEquals(featureDescriptor.getCodecClass(), codec); + Assert.assertEquals(featureDescriptor.getFeatureClass(), feature); + Assert.assertEquals(featureDescriptor.getName().toLowerCase(), name.toLowerCase()); + } + + public String toString() { + return String.format("FMTest name=%s codec=%s feature=%s file=%s", + name, codec.getSimpleName(), feature.getSimpleName(), associatedFile); + } + } + + @DataProvider(name = "tests") + public Object[][] createTests() { + new FMTest(VariantContext.class, VCF3Codec.class, "VCF3", VCF3_FILE); + new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE); + new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE_GZ); + new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE_BGZIP); + new 
FMTest(TableFeature.class, BedTableCodec.class, "bedtable", null); + return FMTest.getTests(FMTest.class); + } + + @Test(dataProvider = "tests") + public void testGetByFile(FMTest params) { + if ( params.associatedFile != null ) { + FeatureManager.FeatureDescriptor byFile = manager.getByFiletype(params.associatedFile); + Assert.assertNotNull(byFile, "Couldn't find any type associated with file " + params.associatedFile); + params.assertExpected(byFile); + } + } + + @Test + public void testGetByFileNoMatch() { + FeatureManager.FeatureDescriptor byFile = manager.getByFiletype(RANDOM_FILE); + Assert.assertNull(byFile, "Found type " + byFile + " associated with RANDOM, non-Tribble file " + RANDOM_FILE); + } + + @Test(dataProvider = "tests") + public void testGetters(FMTest params) { + params.assertExpected(manager.getByCodec(params.codec)); + params.assertExpected(manager.getByName(params.name)); + params.assertExpected(manager.getByName(params.name.toLowerCase())); + params.assertExpected(manager.getByName(params.name.toUpperCase())); + + Collection descriptors = manager.getByFeature(params.feature); + Assert.assertTrue(descriptors.size() > 0, "Look up by FeatureClass failed"); + } + + @Test + public void testUserFriendlyList() { + Assert.assertTrue(manager.userFriendlyListOfAvailableFeatures().length() > 0, "Expected at least one codec to be listed"); + Assert.assertTrue(manager.userFriendlyListOfAvailableFeatures().split(",").length > 0, "Expected at least two codecs, but only saw one"); + } + + @Test + public void testCodecCreation() { + FeatureManager.FeatureDescriptor descriptor = manager.getByName("vcf"); + Assert.assertNotNull(descriptor, "Couldn't find VCF feature descriptor!"); + + FeatureCodec c = manager.createCodec(descriptor, "foo", genomeLocParser, null); + Assert.assertNotNull(c, "Couldn't create codec"); + Assert.assertEquals(c.getClass(), descriptor.getCodecClass()); + Assert.assertEquals(c.getFeatureType(), descriptor.getFeatureClass()); + } + +} + 
diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilderUnitTest.java new file mode 100644 index 000000000..5019e78e7 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/tracks/RMDTrackBuilderUnitTest.java @@ -0,0 +1,190 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.refdata.tracks; + + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.tribble.Tribble; +import htsjdk.tribble.index.Index; +import htsjdk.tribble.util.LittleEndianOutputStream; +import htsjdk.variant.vcf.VCFCodec; +import org.broadinstitute.gatk.utils.exceptions.UserException; +import org.testng.Assert; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; + +import org.testng.annotations.BeforeMethod; + +import org.testng.annotations.Test; + +import java.io.*; +import java.nio.channels.FileChannel; + + +/** + * @author aaron + *

+ * Class RMDTrackBuilderUnitTest + *

+ * Testing out the builder for tribble Tracks + */ +public class RMDTrackBuilderUnitTest extends BaseTest { + private RMDTrackBuilder builder; + private IndexedFastaSequenceFile seq; + private GenomeLocParser genomeLocParser; + + @BeforeMethod + public void setup() { + File referenceFile = new File(b37KGReference); + try { + seq = new CachingIndexedFastaSequenceFile(referenceFile); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(referenceFile,ex); + } + genomeLocParser = new GenomeLocParser(seq); + + // We have to disable auto-index creation/locking in the RMDTrackBuilder for tests, + // as the lock acquisition calls were intermittently hanging on our farm. This unfortunately + // means that we can't include tests for the auto-index creation feature. + builder = new RMDTrackBuilder(seq.getSequenceDictionary(),genomeLocParser,null,true,null); + } + + @Test + public void testBuilder() { + Assert.assertTrue(builder.getFeatureManager().getFeatureDescriptors().size() > 0); + } + + @Test + public void testDisableAutoIndexGeneration() throws IOException { + final File unindexedVCF = new File(privateTestDir + "unindexed.vcf"); + final File unindexedVCFIndex = Tribble.indexFile(unindexedVCF); + + Index index = builder.loadIndex(unindexedVCF, new VCFCodec()); + + Assert.assertFalse(unindexedVCFIndex.exists()); + Assert.assertNotNull(index); + } + + @Test + public void testLoadOnDiskIndex() { + final File originalVCF = new File(privateTestDir + "vcf4.1.example.vcf"); + final File tempVCFWithCorrectIndex = createTempVCFFileAndIndex(originalVCF, false); + final File tempVCFIndexFile = Tribble.indexFile(tempVCFWithCorrectIndex); + + final Index index = builder.loadFromDisk(tempVCFWithCorrectIndex, tempVCFIndexFile); + + Assert.assertNotNull(index); + Assert.assertTrue(tempVCFIndexFile.exists()); + + final Index inMemoryIndex = builder.createIndexInMemory(tempVCFWithCorrectIndex, new VCFCodec()); + 
Assert.assertTrue(index.equalsIgnoreProperties(inMemoryIndex)); + } + + @Test + public void testLoadOnDiskOutdatedIndex() { + final File originalVCF = new File(privateTestDir + "vcf4.1.example.vcf"); + final File tempVCFWithOutdatedIndex = createTempVCFFileAndIndex(originalVCF, true); + final File tempVCFIndexFile = Tribble.indexFile(tempVCFWithOutdatedIndex); + + final Index index = builder.loadFromDisk(tempVCFWithOutdatedIndex, tempVCFIndexFile); + + // loadFromDisk() should return null to indicate that the index is outdated and should not be used, + // but should not delete the index since our builder has disableAutoIndexCreation set to true + Assert.assertNull(index); + Assert.assertTrue(tempVCFIndexFile.exists()); + } + + /** + * Create a temporary vcf file and an associated index file, which may be set to be out-of-date + * relative to the vcf + * + * @param vcfFile the vcf file + * @param createOutOfDateIndex if true, ensure that the temporary vcf file is modified after the index + * @return a file pointing to the new tmp location, with accompanying index + */ + private File createTempVCFFileAndIndex( final File vcfFile, final boolean createOutOfDateIndex ) { + try { + final File tmpFile = createTempFile("RMDTrackBuilderUnitTest", ""); + final File tmpIndex = Tribble.indexFile(tmpFile); + tmpIndex.deleteOnExit(); + + copyFile(vcfFile, tmpFile); + final Index inMemoryIndex = builder.createIndexInMemory(tmpFile, new VCFCodec()); + final LittleEndianOutputStream indexOutputStream = new LittleEndianOutputStream(new FileOutputStream(tmpIndex)); + + // If requested, modify the tribble file after the index. Otherwise, modify the index last. 
+ if ( createOutOfDateIndex ) { + inMemoryIndex.write(indexOutputStream); + indexOutputStream.close(); + Thread.sleep(2000); + copyFile(vcfFile, tmpFile); + } + else { + copyFile(vcfFile, tmpFile); + Thread.sleep(2000); + inMemoryIndex.write(indexOutputStream); + indexOutputStream.close(); + } + + return tmpFile; + } catch (IOException e) { + Assert.fail("Unable to create temperary file"); + } catch (InterruptedException e) { + Assert.fail("Somehow our thread got interrupted"); + } + return null; + } + + /** + * copy a file, from http://www.exampledepot.com/egs/java.nio/File2File.html + * + * @param srFile the source file + * @param dtFile the destination file + */ + private static void copyFile(File srFile, File dtFile) { + try { + // Create channel on the source + FileChannel srcChannel = new FileInputStream(srFile).getChannel(); + + // Create channel on the destination + FileChannel dstChannel = new FileOutputStream(dtFile).getChannel(); + + // Copy file contents from source to destination + dstChannel.transferFrom(srcChannel, 0, srcChannel.size()); + + // Close the channels + srcChannel.close(); + dstChannel.close(); + } catch (IOException e) { + e.printStackTrace(); + Assert.fail("Unable to process copy " + e.getMessage()); + } + } + +} + diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/CheckableCloseableTribbleIterator.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/CheckableCloseableTribbleIterator.java new file mode 100644 index 000000000..cee60aace --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/CheckableCloseableTribbleIterator.java @@ -0,0 +1,90 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the 
rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata.utils; + +import htsjdk.tribble.CloseableTribbleIterator; +import htsjdk.tribble.Feature; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +/** + * Adapter to allow checking if the wrapped iterator was closed. + * Creating an CCTI also adds it to the list returned from getThreadIterators(). + * @param feature + */ +public class CheckableCloseableTribbleIterator implements CloseableTribbleIterator { + private final CloseableTribbleIterator iterator; + private boolean closed = false; + + private static ThreadLocal>> threadIterators = + new ThreadLocal>>() { + @Override + protected List> initialValue() { + return new ArrayList>(); + } + }; + + public CheckableCloseableTribbleIterator(CloseableTribbleIterator iterator) { + this.iterator = iterator; + threadIterators.get().add(this); + } + + /** + * Returns the list of iterators created on this thread since the last time clearCreatedIterators() was called. + * @return the list of iterators created on this thread since the last time clearCreatedIterators() was called. 
+ */ + public static List> getThreadIterators() { + return threadIterators.get(); + } + + /** + * Clears the tracked list of iterators created on this thread. + */ + public static void clearThreadIterators() { + threadIterators.get().clear(); + } + + @Override + public void close() { + iterator.close(); + this.closed = true; + } + + /** + * Returns true if this iterator was properly closed. + * @return true if this iterator was properly closed. + */ + public boolean isClosed() { + return closed; + } + + @Override public Iterator iterator() { return this; } + @Override public boolean hasNext() { return iterator.hasNext(); } + @Override public T next() { return iterator.next(); } + @Override public void remove() { iterator.remove(); } +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java new file mode 100644 index 000000000..768bf50df --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FeatureToGATKFeatureIteratorUnitTest.java @@ -0,0 +1,61 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata.utils; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.tribble.Feature; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import htsjdk.variant.vcf.VCFCodec; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; + +public class FeatureToGATKFeatureIteratorUnitTest extends BaseTest { + @Test + @SuppressWarnings("unchecked") + public void testCloseFilePointers() throws IOException { + final String chr = "20"; + IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(BaseTest.hg19Reference)); + GenomeLocParser parser = new GenomeLocParser(seq); + File file = new File(privateTestDir + "NA12878.hg19.example1.vcf"); + VCFCodec codec = new VCFCodec(); + TestFeatureReader reader = new TestFeatureReader(file.getAbsolutePath(), codec); + CheckableCloseableTribbleIterator tribbleIterator = reader.query(chr, 1, 100000); + FeatureToGATKFeatureIterator gatkIterator = new FeatureToGATKFeatureIterator(parser, tribbleIterator, "test"); + Assert.assertTrue(gatkIterator.hasNext(), "GATK feature iterator does not have a next value."); + GenomeLoc gatkLocation = gatkIterator.next().getLocation(); + Assert.assertEquals(gatkLocation.getContig(), chr, "Instead of chr 20 
rod iterator was at location " + gatkLocation); + Assert.assertFalse(tribbleIterator.isClosed(), "Tribble iterator is closed but should be still open."); + gatkIterator.close(); + Assert.assertTrue(tribbleIterator.isClosed(), "Tribble iterator is open but should be now closed."); + reader.close(); + } +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FlashBackIteratorUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FlashBackIteratorUnitTest.java new file mode 100644 index 000000000..34129abc8 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/FlashBackIteratorUnitTest.java @@ -0,0 +1,364 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.refdata.utils; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMSequenceDictionary; +import org.testng.Assert; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.refdata.ReferenceOrderedDatum; +import org.broadinstitute.gatk.utils.GenomeLoc; +import org.broadinstitute.gatk.utils.GenomeLocParser; +import org.broadinstitute.gatk.utils.sam.ArtificialSAMUtils; + +import org.testng.annotations.BeforeMethod; + +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.AbstractList; +import java.util.ArrayList; +import java.util.List; + + +/** + * @author aaron + *

+ * Class FlashBackIteratorUnitTest + *

+ * just like a greatful dead show...this will be prone to flashbacks + */ +public class FlashBackIteratorUnitTest extends BaseTest { + private SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(NUMBER_OF_CHROMOSOMES, STARTING_CHROMOSOME, CHROMOSOME_SIZE); + private static final int NUMBER_OF_CHROMOSOMES = 5; + private static final int STARTING_CHROMOSOME = 1; + private static final int CHROMOSOME_SIZE = 1000; + + private String firstContig; + private GenomeLocParser genomeLocParser; + + @BeforeMethod + public void setup() { + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + firstContig = header.getSequenceDictionary().getSequence(0).getSequenceName(); + } + + @Test + public void testBasicIteration() { + GenomeLoc loc = genomeLocParser.createGenomeLoc(firstContig, 0, 0); + FlashBackIterator iter = new FlashBackIterator(new FakeSeekableRODIterator(genomeLocParser,loc)); + GenomeLoc lastLocation = null; + for (int x = 0; x < 10; x++) { + iter.next(); + GenomeLoc cur = iter.position(); + if (lastLocation != null) { + Assert.assertTrue(lastLocation.isBefore(cur)); + } + lastLocation = cur; + } + } + + @Test + public void testBasicIterationThenFlashBack() { + GenomeLoc loc = genomeLocParser.createGenomeLoc(firstContig, 0, 0); + FlashBackIterator iter = new FlashBackIterator(new FakeSeekableRODIterator(genomeLocParser,loc)); + GenomeLoc lastLocation = null; + for (int x = 0; x < 10; x++) { + iter.next(); + GenomeLoc cur = iter.position(); + if (lastLocation != null) { + Assert.assertTrue(lastLocation.isBefore(cur)); + } + lastLocation = cur; + } + iter.flashBackTo(genomeLocParser.createGenomeLoc(firstContig, 2)); + } + + @Test + public void testBasicIterationThenFlashBackThenIterate() { + GenomeLoc loc = genomeLocParser.createGenomeLoc(firstContig, 0, 0); + FlashBackIterator iter = new FlashBackIterator(new FakeSeekableRODIterator(genomeLocParser,loc)); + GenomeLoc lastLocation = null; + for (int x = 0; x < 10; x++) { + 
iter.next(); + GenomeLoc cur = iter.position(); + if (lastLocation != null) { + Assert.assertTrue(lastLocation.isBefore(cur)); + } + lastLocation = cur; + } + iter.flashBackTo(genomeLocParser.createGenomeLoc(firstContig, 1)); + int count = 0; + while (iter.hasNext()) { + count++; + iter.next(); + } + Assert.assertEquals(count, 10); + } + + + @Test + public void testFlashBackTruth() { + GenomeLoc loc = genomeLocParser.createGenomeLoc(firstContig, 0, 0); + LocationAwareSeekableRODIterator backIter = new FakeSeekableRODIterator(genomeLocParser,loc); + // remove the first three records + backIter.next(); + backIter.next(); + backIter.next(); + FlashBackIterator iter = new FlashBackIterator(backIter); + GenomeLoc lastLocation = null; + for (int x = 0; x < 10; x++) { + iter.next(); + GenomeLoc cur = iter.position(); + if (lastLocation != null) { + Assert.assertTrue(lastLocation.isBefore(cur)); + } + lastLocation = cur; + } + Assert.assertTrue(iter.canFlashBackTo(genomeLocParser.createGenomeLoc(firstContig, 5))); + Assert.assertTrue(iter.canFlashBackTo(genomeLocParser.createGenomeLoc(firstContig, 15))); + Assert.assertTrue(!iter.canFlashBackTo(genomeLocParser.createGenomeLoc(firstContig, 2))); + Assert.assertTrue(!iter.canFlashBackTo(genomeLocParser.createGenomeLoc(firstContig, 1))); + } + + @Test + public void testBasicIterationThenFlashBackHalfWayThenIterate() { + GenomeLoc loc = genomeLocParser.createGenomeLoc(firstContig, 0, 0); + FlashBackIterator iter = new FlashBackIterator(new FakeSeekableRODIterator(genomeLocParser,loc)); + GenomeLoc lastLocation = null; + for (int x = 0; x < 10; x++) { + iter.next(); + GenomeLoc cur = iter.position(); + if (lastLocation != null) { + Assert.assertTrue(lastLocation.isBefore(cur)); + } + lastLocation = cur; + } + iter.flashBackTo(genomeLocParser.createGenomeLoc(firstContig, 5)); + int count = 0; + while (iter.hasNext()) { + count++; + iter.next(); + } + Assert.assertEquals(count, 6); // chr1:5, 6, 7, 8, 9, and 10 + } +} + + +class 
FakeSeekableRODIterator implements LocationAwareSeekableRODIterator { + private GenomeLocParser genomeLocParser; + + // current location + private GenomeLoc location; + private FakeRODatum curROD; + private int recordCount = 10; + + public FakeSeekableRODIterator(GenomeLocParser genomeLocParser,GenomeLoc startingLoc) { + this.genomeLocParser = genomeLocParser; + this.location = genomeLocParser.createGenomeLoc(startingLoc.getContig(), startingLoc.getStart() + 1, startingLoc.getStop() + 1); + } + + /** + * Gets the header associated with the backing input stream. + * @return the ROD header. + */ + @Override + public Object getHeader() { + return null; + } + + /** + * Gets the sequence dictionary associated with the backing input stream. + * @return sequence dictionary from the ROD header. + */ + @Override + public SAMSequenceDictionary getSequenceDictionary() { + return null; + } + + + @Override + public GenomeLoc peekNextLocation() { + System.err.println("Peek Next -> " + location); + return location; + } + + @Override + public GenomeLoc position() { + return location; + } + + @Override + public RODRecordList seekForward(GenomeLoc interval) { + this.location = interval; + return next(); + } + + @Override + public boolean hasNext() { + return (recordCount > 0); + } + + @Override + public RODRecordList next() { + RODRecordList list = new FakeRODRecordList(); + curROD = new FakeRODatum("STUPIDNAME", location); + location = genomeLocParser.createGenomeLoc(location.getContig(), location.getStart() + 1, location.getStop() + 1); + list.add(curROD); + recordCount--; + return list; + } + + @Override + public void remove() { + throw new IllegalStateException("GRRR"); + } + + @Override + public void close() { + // nothing to do + } +} + + +/** for testing only */ +class FakeRODatum extends GATKFeature implements ReferenceOrderedDatum { + + final GenomeLoc location; + + public FakeRODatum(String name, GenomeLoc location) { + super(name); + this.location = location; + } + + 
@Override + public String getName() { + return "false"; + } + + @Override + public boolean parseLine(Object header, String[] parts) throws IOException { + return false; + } + + @Override + public String toSimpleString() { + return ""; + } + + @Override + public String repl() { + return ""; + } + + /** + * Used by the ROD system to determine how to split input lines + * + * @return Regex string delimiter separating fields + */ + @Override + public String delimiterRegex() { + return ""; + } + + @Override + public GenomeLoc getLocation() { + return location; + } + + @Override + public Object getUnderlyingObject() { + return this; + } + + @Override + public int compareTo(ReferenceOrderedDatum that) { + return location.compareTo(that.getLocation()); + } + + /** + * Backdoor hook to read header, meta-data, etc. associated with the file. Will be + * called by the ROD system before streaming starts + * + * @param source source data file on disk from which this rod stream will be pulled + * + * @return a header object that will be passed to parseLine command + */ + @Override + public Object initialize(File source) throws FileNotFoundException { + return null; + } + + @Override + public String getChr() { + return location.getContig(); + } + + @Override + public int getStart() { + return (int)location.getStart(); + } + + @Override + public int getEnd() { + return (int)location.getStop(); + } +} + +class FakeRODRecordList extends AbstractList implements RODRecordList { + private final List list = new ArrayList(); + + public boolean add(GATKFeature data) { + return list.add(data); + } + + @Override + public GATKFeature get(int i) { + return list.get(i); + } + + @Override + public int size() { + return list.size(); + } + + @Override + public GenomeLoc getLocation() { + return list.get(0).getLocation(); + } + + @Override + public String getName() { + return "test"; + } + + @Override + public int compareTo(RODRecordList rodRecordList) { + return 
this.list.get(0).getLocation().compareTo(rodRecordList.getLocation()); + } +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/TestFeatureReader.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/TestFeatureReader.java new file mode 100644 index 000000000..190ec846b --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/TestFeatureReader.java @@ -0,0 +1,53 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata.utils; + +import htsjdk.tribble.Feature; +import htsjdk.tribble.FeatureCodec; +import htsjdk.tribble.TribbleIndexedFeatureReader; + +import java.io.IOException; + +/** + * Feature reader with additional test utilities. The iterators can be checked to see if they are closed. 
+ */ +public class TestFeatureReader extends TribbleIndexedFeatureReader { + public TestFeatureReader(String featurePath, FeatureCodec codec) throws IOException { + super(featurePath, codec, true); + } + + @Override + @SuppressWarnings("unchecked") + public CheckableCloseableTribbleIterator iterator() throws IOException { + return new CheckableCloseableTribbleIterator(super.iterator()); + } + + @Override + @SuppressWarnings("unchecked") + public CheckableCloseableTribbleIterator query(String chr, int start, int end) throws IOException { + return new CheckableCloseableTribbleIterator(super.query(chr, start, end)); + } +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/TestRMDTrackBuilder.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/TestRMDTrackBuilder.java new file mode 100644 index 000000000..2750e271e --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/refdata/utils/TestRMDTrackBuilder.java @@ -0,0 +1,70 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.refdata.utils; + +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.tribble.FeatureCodec; +import htsjdk.tribble.index.Index; +import org.broadinstitute.gatk.utils.refdata.tracks.FeatureManager; +import org.broadinstitute.gatk.utils.refdata.tracks.IndexDictionaryUtils; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrack; +import org.broadinstitute.gatk.utils.refdata.tracks.RMDTrackBuilder; +import org.broadinstitute.gatk.utils.GenomeLocParser; + +import java.io.File; +import java.io.IOException; + +/** + * Extension of RMDTrackBuilder that creates TestFeatureReader's which in turn create CheckableCloseableTribbleIterator's. + */ +public class TestRMDTrackBuilder extends RMDTrackBuilder { + private GenomeLocParser genomeLocParser; + + public TestRMDTrackBuilder(SAMSequenceDictionary dict, GenomeLocParser genomeLocParser) { + // disable auto-index creation/locking in the RMDTrackBuilder for tests + super(dict, genomeLocParser, null, true, null); + this.genomeLocParser = genomeLocParser; + } + + @Override + public RMDTrack createInstanceOfTrack(RMDTriplet fileDescriptor) { + String name = fileDescriptor.getName(); + File inputFile = new File(fileDescriptor.getFile()); + FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByTriplet(fileDescriptor); + FeatureCodec codec = getFeatureManager().createCodec(descriptor, name, genomeLocParser, null); + TestFeatureReader featureReader; + Index index; + try { + // Create a feature reader that creates checkable tribble iterators. 
+ index = loadIndex(inputFile, codec); + featureReader = new TestFeatureReader(inputFile.getAbsolutePath(), codec); + } catch (IOException e) { + throw new RuntimeException(e); + } + SAMSequenceDictionary sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); + return new RMDTrack(descriptor.getCodecClass(), name, inputFile, featureReader, sequenceDictionary, genomeLocParser, codec); + } +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/report/GATKReportUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/report/GATKReportUnitTest.java new file mode 100644 index 000000000..fa34fb71b --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/report/GATKReportUnitTest.java @@ -0,0 +1,289 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.report; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.report.GATKReport; +import org.broadinstitute.gatk.utils.report.GATKReportColumn; +import org.broadinstitute.gatk.utils.report.GATKReportTable; +import org.broadinstitute.gatk.utils.report.GATKReportVersion; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.util.Random; +import java.io.FileInputStream; +import java.io.DataInputStream; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.ArrayList; + + +public class GATKReportUnitTest extends BaseTest { + @Test + public void testParse() throws Exception { + String reportPath = publicTestDir + "exampleGATKReportv2.tbl"; + GATKReport report = new GATKReport(reportPath); + Assert.assertEquals(report.getVersion(), GATKReportVersion.V1_1); + Assert.assertEquals(report.getTables().size(), 5); + + GATKReportTable countVariants = report.getTable("CountVariants"); + Assert.assertEquals(countVariants.get(0, "nProcessedLoci"), "63025520"); + Assert.assertEquals(countVariants.get(0, "nNoCalls"), "0"); + Assert.assertEquals(countVariants.get(0, "heterozygosity"), 4.73e-06); + + GATKReportTable validationReport = report.getTable("ValidationReport"); + Assert.assertEquals(validationReport.get(2, "PPV"), Double.NaN); + } + + @DataProvider(name = "rightAlignValues") + public Object[][] getRightAlignValues() { + return new Object[][]{ + new Object[]{null, true}, + new Object[]{"null", true}, + new Object[]{"NA", true}, + new Object[]{"0", true}, + new Object[]{"0.0", true}, + new Object[]{"-0", true}, + new Object[]{"-0.0", true}, + new Object[]{String.valueOf(Long.MAX_VALUE), true}, + new Object[]{String.valueOf(Long.MIN_VALUE), true}, + new Object[]{String.valueOf(Float.MIN_NORMAL), true}, + new 
Object[]{String.valueOf(Double.MAX_VALUE), true}, + new Object[]{String.valueOf(Double.MIN_VALUE), true}, + new Object[]{String.valueOf(Double.POSITIVE_INFINITY), true}, + new Object[]{String.valueOf(Double.NEGATIVE_INFINITY), true}, + new Object[]{String.valueOf(Double.NaN), true}, + new Object[]{"hello", false} + }; + } + + @Test(dataProvider = "rightAlignValues") + public void testIsRightAlign(String value, boolean expected) { + Assert.assertEquals(GATKReportColumn.isRightAlign(value), expected, "right align of '" + value + "'"); + } + + private GATKReportTable getTableWithRandomValues() { + Random number = new Random(123L); + final int VALUESRANGE = 10; + + GATKReport report = GATKReport.newSimpleReport("TableName", "col1", "col2", "col3"); + GATKReportTable table = new GATKReportTable("testSortingTable", "table with random values sorted by columns", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN ); + + final int NUMROWS = 100; + for (int x = 0; x < NUMROWS; x++) { + report.addRow(number.nextInt(VALUESRANGE), number.nextInt(VALUESRANGE), number.nextInt(VALUESRANGE)); + } + return table; + } + + @Test(enabled = true) + public void testSortingByColumn() { + Assert.assertEquals(isSorted(getTableWithRandomValues()), true); + } + + private boolean isSorted(GATKReportTable table) { + boolean result = true; + File testingSortingTableFile = new File("testSortingFile.txt"); + + try { + // Connect print stream to the output stream + PrintStream ps = new PrintStream(testingSortingTableFile); + table.write(ps); + ps.close(); + } + catch (Exception e){ + System.err.println ("Error: " + e.getMessage()); + } + + ArrayList rows = new ArrayList(); + try { + // Open the file + FileInputStream fStream = new FileInputStream(testingSortingTableFile); + // Get the object of DataInputStream + DataInputStream in = new DataInputStream(fStream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; + //Read File Line By Line + while ((strLine = 
br.readLine()) != null) { + + String[] parts = strLine.split(" "); + int l = parts.length; + int[] row = new int[l]; + for(int n = 0; n < l; n++) { + row[n] = Integer.parseInt(parts[n]); + } + rows.add(row); + } + //Close the input stream + in.close(); + } catch (Exception e){//Catch exception if any + System.err.println("Error: " + e.getMessage()); + } + for (int x = 1; x < rows.size() && result; x++) { + result = checkRowOrder(rows.get(x - 1), rows.get(x)); + } + return result; + } + + private boolean checkRowOrder(int[] row1, int[] row2) { + int l = row1.length; + final int EQUAL = 0; + + int result = EQUAL; + + for(int x = 0; x < l && ( result <= EQUAL); x++) { + result = ((Integer)row1[x]).compareTo(row2[x]); + } + if (result <= EQUAL) { + return true; + } else { + return false; + } + } + + private GATKReportTable makeBasicTable() { + GATKReport report = GATKReport.newSimpleReport("TableName", "sample", "value"); + GATKReportTable table = report.getTable("TableName"); + report.addRow("foo.1", "hello"); + report.addRow("foo.2", "world"); + return table; + } + + @Test + public void testDottedSampleName() { + GATKReportTable table = makeBasicTable(); + Assert.assertEquals(table.get(0, "value"), "hello"); + Assert.assertEquals(table.get(1, "value"), "world"); + } + + @Test + public void testSimpleGATKReport() { + // Create a new simple GATK report named "TableName" with columns: Roger, is, and Awesome + GATKReport report = GATKReport.newSimpleReport("TableName", "Roger", "is", "Awesome"); + + // Add data to simple GATK report + report.addRow(12, 23.45, true); + report.addRow("ans", '3', 24.5); + report.addRow("hi", "", 2.3); + + // Print the report to console + //report.print(System.out); + + try { + File file = createTempFile("GATKReportGatherer-UnitTest", ".tbl"); + //System.out.format("The temporary file" + " has been created: %s%n", file); + PrintStream ps = new PrintStream(file); + report.print(ps); + //System.out.println("File succesfully outputed!"); + 
GATKReport inputRead = new GATKReport(file); + //System.out.println("File succesfully read!"); + //inputRead.print(System.out); + Assert.assertTrue(report.isSameFormat(inputRead)); + + } catch (IOException x) { + System.err.format("IOException: %s%n", x); + } + + } + + @Test + public void testGATKReportGatherer() { + + GATKReport report1, report2, report3; + report1 = new GATKReport(); + report1.addTable("TableName", "Description", 2); + report1.getTable("TableName").addColumn("colA", "%s"); + report1.getTable("TableName").addColumn("colB", "%c"); + report1.getTable("TableName").set(0, "colA", "NotNum"); + report1.getTable("TableName").set(0, "colB", (char) 64); + + report2 = new GATKReport(); + report2.addTable("TableName", "Description", 2); + report2.getTable("TableName").addColumn("colA", "%s"); + report2.getTable("TableName").addColumn("colB", "%c"); + report2.getTable("TableName").set(0, "colA", "df3"); + report2.getTable("TableName").set(0, "colB", 'A'); + + report3 = new GATKReport(); + report3.addTable("TableName", "Description", 2); + report3.getTable("TableName").addColumn("colA", "%s"); + report3.getTable("TableName").addColumn("colB", "%c"); + report3.getTable("TableName").set(0, "colA", "df5f"); + report3.getTable("TableName").set(0, "colB", 'c'); + + report1.concat(report2); + report1.concat(report3); + + report1.addTable("Table2", "To contain some more data types", 3); + GATKReportTable table = report1.getTable("Table2"); + table.addColumn("SomeInt", "%d"); + table.addColumn("SomeFloat", "%.16E"); + table.addColumn("TrueFalse", "%B"); + table.addRowIDMapping("12df", 0); + table.addRowIDMapping("5f", 1); + table.addRowIDMapping("RZ", 2); + table.set("12df", "SomeInt", Byte.MAX_VALUE); + table.set("12df", "SomeFloat", 34.0); + table.set("12df", "TrueFalse", true); + table.set("5f", "SomeInt", Short.MAX_VALUE); + table.set("5f", "SomeFloat", Double.MAX_VALUE); + table.set("5f", "TrueFalse", false); + table.set("RZ", "SomeInt", Long.MAX_VALUE); + 
table.set("RZ", "SomeFloat", 535646345.657453464576); + table.set("RZ", "TrueFalse", true); + + report1.addTable("Table3", "blah", 1, GATKReportTable.TableSortingWay.SORT_BY_ROW); + report1.getTable("Table3").addColumn("a"); + report1.getTable("Table3").addRowIDMapping("q", 2); + report1.getTable("Table3").addRowIDMapping("5", 3); + report1.getTable("Table3").addRowIDMapping("573s", 0); + report1.getTable("Table3").addRowIDMapping("ZZZ", 1); + report1.getTable("Table3").set("q", "a", "34"); + report1.getTable("Table3").set("5", "a", "c4g34"); + report1.getTable("Table3").set("573s", "a", "fDlwueg"); + report1.getTable("Table3").set("ZZZ", "a", "Dfs"); + + try { + File file = createTempFile("GATKReportGatherer-UnitTest", ".tbl"); + //System.out.format("The temporary file" + " has been created: %s%n", file); + PrintStream ps = new PrintStream(file); + report1.print(ps); + //System.out.println("File succesfully outputed!"); + GATKReport inputRead = new GATKReport(file); + //System.out.println("File succesfully read!"); + //inputRead.print(System.out); + Assert.assertTrue(report1.isSameFormat(inputRead)); + Assert.assertTrue(report1.equals(inputRead)); + + } catch (IOException x) { + System.err.format("IOException: %s%n", x); + } + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/report/ReportMarshallerUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/report/ReportMarshallerUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/report/ReportMarshallerUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/report/ReportMarshallerUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/runtime/ProcessControllerUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/runtime/ProcessControllerUnitTest.java similarity index 100% rename from 
public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/runtime/ProcessControllerUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/runtime/ProcessControllerUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/runtime/RuntimeUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/runtime/RuntimeUtilsUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/runtime/RuntimeUtilsUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/runtime/RuntimeUtilsUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/AlignmentUtilsUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilderUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilderUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilderUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialBAMBuilderUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java rename 
to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialPatternedSAMIteratorUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileWriterUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileWriterUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileWriterUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMFileWriterUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMQueryIteratorUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMQueryIteratorUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMQueryIteratorUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMQueryIteratorUnitTest.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtilsUnitTest.java new file mode 100644 index 000000000..a4e6be203 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSAMUtilsUnitTest.java @@ -0,0 +1,108 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* 
+* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.sam; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.iterators.GATKSAMIterator; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.fail; +import org.testng.annotations.Test; +import htsjdk.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: aaronmckenna + * Date: Jun 3, 2009 + * Time: 3:09:34 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class ArtificialSAMUtilsUnitTest extends BaseTest { + + + @Test + public void basicReadIteratorTest() { + GATKSAMIterator iter = ArtificialSAMUtils.mappedReadIterator(1, 100, 100); + int count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + count++; + } + assertEquals(count, 100 * 100); + } + + @Test + public void tenPerChromosome() { + GATKSAMIterator iter = ArtificialSAMUtils.mappedReadIterator(1, 100, 10); + int count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + + assertEquals(Integer.valueOf(Math.round(count / 10)), rec.getReferenceIndex()); + count++; + } + assertEquals(count, 100 * 10); + } + + @Test + public void onePerChromosome() { + GATKSAMIterator iter = ArtificialSAMUtils.mappedReadIterator(1, 100, 1); + int count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + + assertEquals(Integer.valueOf(count), rec.getReferenceIndex()); + count++; + } + assertEquals(count, 100 * 1); + } + + @Test + public void basicUnmappedIteratorTest() { + GATKSAMIterator iter = ArtificialSAMUtils.mappedAndUnmappedReadIterator(1, 100, 100, 1000); + int count = 0; + for (int x = 0; x < (100* 100); x++ ) { + if (!iter.hasNext()) { + fail ("we didn't get the expected number of reads"); + } + SAMRecord rec = iter.next(); + assertTrue(rec.getReferenceIndex() >= 0); + count++; + } + assertEquals(100 * 100, count); + + // now we should have 1000 unmapped reads + count = 0; + while (iter.hasNext()) { + SAMRecord rec = iter.next(); + assertTrue(rec.getReferenceIndex() < 0); + count++; + } + assertEquals(count, 1000); + } + + +} diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java new file mode 100644 index 000000000..4117f7487 --- /dev/null +++ 
b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java @@ -0,0 +1,186 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.sam; + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMReadGroupRecord; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +import org.broadinstitute.gatk.utils.BaseTest; + +public class ArtificialSingleSampleReadStreamUnitTest extends BaseTest { + + private static class ArtificialSingleSampleReadStreamTest extends TestDataProvider { + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; + + public ArtificialSingleSampleReadStreamTest( ArtificialSingleSampleReadStream stream ) { + super(ArtificialSingleSampleReadStreamTest.class); + + this.stream = stream; + + setName(String.format("%s: numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + getClass().getSimpleName(), + stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); + } + + public void run() { + streamAnalyzer= new ArtificialSingleSampleReadStreamAnalyzer(stream); + + streamAnalyzer.analyze(stream); + + // Check whether the observed properties of the stream match its nominal properties + streamAnalyzer.validate(); + } + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamTestDataProvider") + public Object[][] createArtificialSingleSampleReadStreamTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + + 
Utils.resetRandomGenerator(); + + // brute force testing! + for ( int numContigs = 0; numContigs <= 2; numContigs++ ) { + for ( int stacksPerContig = 0; stacksPerContig <= 2; stacksPerContig++ ) { + for ( int minReadsPerStack = 1; minReadsPerStack <= 2; minReadsPerStack++ ) { + for ( int maxReadsPerStack = 1; maxReadsPerStack <= 3; maxReadsPerStack++ ) { + for ( int minDistanceBetweenStacks = 1; minDistanceBetweenStacks <= 2; minDistanceBetweenStacks++ ) { + for ( int maxDistanceBetweenStacks = 1; maxDistanceBetweenStacks <= 3; maxDistanceBetweenStacks++ ) { + for ( int minReadLength = 1; minReadLength <= 2; minReadLength++ ) { + for ( int maxReadLength = 1; maxReadLength <= 3; maxReadLength++ ) { + for ( int numUnmappedReads = 0; numUnmappedReads <= 2; numUnmappedReads++ ) { + // Only test sane combinations here + if ( minReadsPerStack <= maxReadsPerStack && + minDistanceBetweenStacks <= maxDistanceBetweenStacks && + minReadLength <= maxReadLength && + ((numContigs > 0 && stacksPerContig > 0) || (numContigs == 0 && stacksPerContig == 0)) ) { + + new ArtificialSingleSampleReadStreamTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads)); + } + } + } + } + } + } + } + } + } + } + + return ArtificialSingleSampleReadStreamTest.getTests(ArtificialSingleSampleReadStreamTest.class); + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamTestDataProvider") + public void testArtificialSingleSampleReadStream( ArtificialSingleSampleReadStreamTest test ) { + logger.warn("Running test: " + test); + + Utils.resetRandomGenerator(); + test.run(); + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider") + public Object[][] createInvalidArgumentsTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String 
readGroupID = "testReadGroup"; + header.addReadGroup(new SAMReadGroupRecord(readGroupID)); + + return new Object[][] { + {"testNullHeader", null, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNullReadGroup", header, null, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidReadGroup", header, "foo", 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumContigs", header, readGroupID, -1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumStacksPerContig", header, readGroupID, 1, -1, 1, 2, 1, 2, 1, 2, 0}, + {"test0ContigsNon0StacksPerContig", header, readGroupID, 0, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNon0Contigs0StacksPerContig", header, readGroupID, 1, 0, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMinReadsPerStack", header, readGroupID, 1, 1, -1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMaxReadsPerStack", header, readGroupID, 1, 1, 1, -2, 1, 2, 1, 2, 0}, + {"testInvalidMinDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, -1, 2, 1, 2, 0}, + {"testInvalidMaxDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, 1, -2, 1, 2, 0}, + {"testInvalidMinReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, -1, 2, 0}, + {"testInvalidMaxReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, -2, 0}, + {"testInvalidReadsPerStackRange", header, readGroupID, 1, 1, 2, 1, 1, 2, 1, 2, 0}, + {"testInvalidDistanceBetweenStacksRange", header, readGroupID, 1, 1, 1, 2, 2, 1, 1, 2, 0}, + {"testInvalidReadLengthRange", header, readGroupID, 1, 1, 1, 2, 1, 2, 2, 1, 0}, + {"testInvalidNumUnmappedReads", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, -1}, + }; + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider", + expectedExceptions = ReviewedGATKException.class) + public void testInvalidArguments( String testName, + SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + 
+ logger.warn("Running test: " + testName); + + ArtificialSingleSampleReadStream stream = new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + numStacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecordUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecordUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecordUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/GATKSAMRecordUnitTest.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java new file mode 100644 index 000000000..14b56718e --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/sam/ReadUtilsUnitTest.java @@ -0,0 +1,339 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.gatk.utils.sam; + +import htsjdk.samtools.reference.IndexedFastaSequenceFile; +import htsjdk.samtools.SAMFileHeader; +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.BaseUtils; +import org.broadinstitute.gatk.utils.Utils; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + + +public class ReadUtilsUnitTest extends BaseTest { + private interface GetAdaptorFunc { + public int getAdaptor(final GATKSAMRecord record); + } + + @DataProvider(name = "AdaptorGetter") + public Object[][] makeActiveRegionCutTests() { + final List tests = new LinkedList(); + + tests.add( new Object[]{ new GetAdaptorFunc() { + @Override public int getAdaptor(final GATKSAMRecord record) { return ReadUtils.getAdaptorBoundary(record); } + }}); + + tests.add( new Object[]{ new GetAdaptorFunc() { + @Override public int getAdaptor(final GATKSAMRecord record) { return record.getAdaptorBoundary(); } + }}); + + return tests.toArray(new Object[][]{}); + } + + private GATKSAMRecord makeRead(final int fragmentSize, final int mateStart) { + final byte[] bases = {'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'}; + final byte[] quals = {30, 30, 30, 30, 30, 30, 30, 30}; + final String cigar = "8M"; + GATKSAMRecord read = 
ArtificialSAMUtils.createArtificialRead(bases, quals, cigar); + read.setProperPairFlag(true); + read.setReadPairedFlag(true); + read.setMateAlignmentStart(mateStart); + read.setInferredInsertSize(fragmentSize); + return read; + } + + @Test(dataProvider = "AdaptorGetter") + public void testGetAdaptorBoundary(final GetAdaptorFunc get) { + final int fragmentSize = 10; + final int mateStart = 1000; + final int BEFORE = mateStart - 2; + final int AFTER = mateStart + 2; + int myStart, boundary; + GATKSAMRecord read; + + // Test case 1: positive strand, first read + read = makeRead(fragmentSize, mateStart); + myStart = BEFORE; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); + + // Test case 2: positive strand, second read + read = makeRead(fragmentSize, mateStart); + myStart = AFTER; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, myStart + fragmentSize + 1); + + // Test case 3: negative strand, second read + read = makeRead(fragmentSize, mateStart); + myStart = AFTER; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(true); + read.setMateNegativeStrandFlag(false); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, mateStart - 1); + + // Test case 4: negative strand, first read + read = makeRead(fragmentSize, mateStart); + myStart = BEFORE; + read.setAlignmentStart(myStart); + read.setReadNegativeStrandFlag(true); + read.setMateNegativeStrandFlag(false); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, mateStart - 1); + + // Test case 5: mate is mapped to another chromosome (test both strands) + read = makeRead(fragmentSize, mateStart); + read.setInferredInsertSize(0); + read.setReadNegativeStrandFlag(true); + 
read.setMateNegativeStrandFlag(false); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + read.setInferredInsertSize(10); + + // Test case 6: read is unmapped + read = makeRead(fragmentSize, mateStart); + read.setReadUnmappedFlag(true); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + read.setReadUnmappedFlag(false); + + // Test case 7: reads don't overlap and look like this: + // <--------| + // |------> + // first read: + read = makeRead(fragmentSize, mateStart); + myStart = 980; + read.setAlignmentStart(myStart); + read.setInferredInsertSize(20); + read.setReadNegativeStrandFlag(true); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + + // second read: + read = makeRead(fragmentSize, mateStart); + myStart = 1000; + read.setAlignmentStart(myStart); + read.setInferredInsertSize(20); + read.setMateAlignmentStart(980); + read.setReadNegativeStrandFlag(false); + boundary = get.getAdaptor(read); + Assert.assertEquals(boundary, ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + + // Test case 8: read doesn't have proper pair flag set + read = makeRead(fragmentSize, mateStart); + read.setReadPairedFlag(true); + read.setProperPairFlag(false); + Assert.assertEquals(get.getAdaptor(read), ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY); + + // Test case 9: read and mate have same negative flag setting + for ( final boolean negFlag: Arrays.asList(true, false) ) { + read = makeRead(fragmentSize, mateStart); + read.setAlignmentStart(BEFORE); + read.setReadPairedFlag(true); + read.setProperPairFlag(true); + read.setReadNegativeStrandFlag(negFlag); + read.setMateNegativeStrandFlag(!negFlag); + 
Assert.assertTrue(get.getAdaptor(read) != ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY, "Get adaptor should have succeeded"); + + read = makeRead(fragmentSize, mateStart); + read.setAlignmentStart(BEFORE); + read.setReadPairedFlag(true); + read.setProperPairFlag(true); + read.setReadNegativeStrandFlag(negFlag); + read.setMateNegativeStrandFlag(negFlag); + Assert.assertEquals(get.getAdaptor(read), ReadUtils.CANNOT_COMPUTE_ADAPTOR_BOUNDARY, "Get adaptor should have failed for reads with bad alignment orientation"); + } + } + + @Test (enabled = true) + public void testGetBasesReverseComplement() { + int iterations = 1000; + Random random = Utils.getRandomGenerator(); + while(iterations-- > 0) { + final int l = random.nextInt(1000); + GATKSAMRecord read = GATKSAMRecord.createRandomRead(l); + byte [] original = read.getReadBases(); + byte [] reconverted = new byte[l]; + String revComp = ReadUtils.getBasesReverseComplement(read); + for (int i=0; i reads = new ArrayList(); + for( int readLength = minLength; readLength <= maxLength; readLength++ ) { + reads.add( ReadUtils.createRandomRead( readLength ) ); + } + Assert.assertEquals(ReadUtils.getMaxReadLength(reads), maxLength, "max length does not match"); + } + } + + final List reads = new LinkedList(); + Assert.assertEquals(ReadUtils.getMaxReadLength(reads), 0, "Empty list should have max length of zero"); + } + + @Test (enabled = true) + public void testReadWithNsRefIndexInDeletion() throws FileNotFoundException { + + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + final int readLength = 76; + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setCigarString("3M414N1D73M"); + + 
final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9392, ReadUtils.ClippingTail.LEFT_TAIL); + Assert.assertEquals(result, 2); + } + + @Test (enabled = true) + public void testReadWithNsRefAfterDeletion() throws FileNotFoundException { + + final IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); + final int readLength = 76; + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead", 0, 8975, readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); + read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); + read.setCigarString("3M414N1D73M"); + + final int result = ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, 9393, ReadUtils.ClippingTail.LEFT_TAIL); + Assert.assertEquals(result, 3); + } + + @DataProvider(name = "HasWellDefinedFragmentSizeData") + public Object[][] makeHasWellDefinedFragmentSizeData() throws Exception { + final List tests = new LinkedList(); + + // setup a basic read that will work + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, 10); + read.setReadPairedFlag(true); + read.setProperPairFlag(true); + read.setReadUnmappedFlag(false); + read.setMateUnmappedFlag(false); + read.setAlignmentStart(100); + read.setCigarString("50M"); + read.setMateAlignmentStart(130); + read.setInferredInsertSize(80); + read.setFirstOfPairFlag(true); + read.setReadNegativeStrandFlag(false); + read.setMateNegativeStrandFlag(true); + + tests.add( new Object[]{ "basic case", read.clone(), true }); + + { + final GATKSAMRecord bad1 = (GATKSAMRecord)read.clone(); + bad1.setReadPairedFlag(false); + tests.add( new Object[]{ "not paired", bad1, false }); + } + + { + final GATKSAMRecord bad = 
(GATKSAMRecord)read.clone(); + bad.setProperPairFlag(false); + // we currently don't require the proper pair flag to be set + tests.add( new Object[]{ "not proper pair", bad, true }); +// tests.add( new Object[]{ "not proper pair", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setReadUnmappedFlag(true); + tests.add( new Object[]{ "read is unmapped", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setMateUnmappedFlag(true); + tests.add( new Object[]{ "mate is unmapped", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setMateNegativeStrandFlag(false); + tests.add( new Object[]{ "read and mate both on positive strand", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setReadNegativeStrandFlag(true); + tests.add( new Object[]{ "read and mate both on negative strand", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setInferredInsertSize(0); + tests.add( new Object[]{ "insert size is 0", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setAlignmentStart(1000); + tests.add( new Object[]{ "positve read starts after mate end", bad, false }); + } + + { + final GATKSAMRecord bad = (GATKSAMRecord)read.clone(); + bad.setReadNegativeStrandFlag(true); + bad.setMateNegativeStrandFlag(false); + bad.setMateAlignmentStart(1000); + tests.add( new Object[]{ "negative strand read ends before mate starts", bad, false }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "HasWellDefinedFragmentSizeData") + private void testHasWellDefinedFragmentSize(final String name, final GATKSAMRecord read, final boolean expected) { + Assert.assertEquals(ReadUtils.hasWellDefinedFragmentSize(read), expected); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/smithwaterman/SmithWatermanBenchmark.java 
b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/smithwaterman/SmithWatermanBenchmark.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/smithwaterman/SmithWatermanBenchmark.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/smithwaterman/SmithWatermanBenchmark.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/text/ListFileUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/text/ListFileUtilsUnitTest.java new file mode 100644 index 000000000..ac301ce37 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/text/ListFileUtilsUnitTest.java @@ -0,0 +1,154 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.text; + +import org.broadinstitute.gatk.utils.BaseTest; +import org.broadinstitute.gatk.utils.commandline.ParsingEngine; +import org.broadinstitute.gatk.utils.commandline.Tags; +import org.broadinstitute.gatk.utils.sam.SAMReaderID; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +/** + * Tests selected functionality in the CommandLineExecutable class + */ +public class ListFileUtilsUnitTest extends BaseTest { + + @Test + public void testIgnoreBlankLinesInBAMListFiles() throws Exception { + File tempListFile = createTempListFile("testIgnoreBlankLines", + "", + publicTestDir + "exampleBAM.bam", + " " + ); + + List expectedBAMFileListAfterUnpacking = new ArrayList(); + expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); + + performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking); + } + + @Test + public void testCommentSupportInBAMListFiles() throws Exception { + File tempListFile = createTempListFile("testCommentSupport", + "#", + publicTestDir + "exampleBAM.bam", + "#" + publicTestDir + "foo.bam", + " # " + publicTestDir + "bar.bam" + ); + + List expectedBAMFileListAfterUnpacking = new ArrayList(); + expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File(publicTestDir + "exampleBAM.bam"), new Tags())); + + performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking); + } + + @Test + public void testUnpackSet() throws Exception { + Set expected = new HashSet(Arrays.asList(publicTestDir + "exampleBAM.bam")); + Set actual; + + actual = ListFileUtils.unpackSet(Arrays.asList(publicTestDir + "exampleBAM.bam")); + Assert.assertEquals(actual, expected); + + File tempListFile = createTempListFile("testUnpackSet", + "#", + publicTestDir + "exampleBAM.bam", + "#" + publicTestDir + "foo.bam", + " # " + publicTestDir + 
"bar.bam" + ); + actual = ListFileUtils.unpackSet(Arrays.asList(tempListFile.getAbsolutePath())); + Assert.assertEquals(actual, expected); + } + + @DataProvider(name="includeMatchingTests") + public Object[][] getIncludeMatchingTests() { + return new Object[][] { + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), true, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("a", "ab") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, asSet("a", "ab", "abc") } + }; + } + + @Test(dataProvider = "includeMatchingTests") + public void testIncludeMatching(Set values, Collection filters, boolean exactMatch, Set expected) { + Set actual = ListFileUtils.includeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch); + Assert.assertEquals(actual, expected); + } + + @DataProvider(name="excludeMatchingTests") + public Object[][] getExcludeMatchingTests() { + return new Object[][] { + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), true, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a"), false, 
Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("b"), false, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), true, asSet("ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "b"), false, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), true, asSet("abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList("a", "ab"), false, Collections.EMPTY_SET }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*b.*"), false, asSet("a") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), true, asSet("a", "ab", "abc") }, + new Object[] { asSet("a", "ab", "abc"), Arrays.asList(".*"), false, Collections.EMPTY_SET } + }; + } + + @Test(dataProvider = "excludeMatchingTests") + public void testExcludeMatching(Set values, Collection filters, boolean exactMatch, Set expected) { + Set actual = ListFileUtils.excludeMatching(values, ListFileUtils.IDENTITY_STRING_CONVERTER, filters, exactMatch); + Assert.assertEquals(actual, expected); + } + + private static Set asSet(T... 
args){ + return new HashSet(Arrays.asList(args)); + } + + private void performBAMListFileUnpackingTest( File tempListFile, List expectedUnpackedFileList ) throws Exception { + List bamFiles = new ArrayList(); + bamFiles.add(tempListFile.getAbsolutePath()); + + List unpackedBAMFileList = ListFileUtils.unpackBAMFileList(bamFiles,new ParsingEngine(null)); + + Assert.assertEquals(unpackedBAMFileList.size(), expectedUnpackedFileList.size(), + "Unpacked BAM file list contains extraneous lines"); + Assert.assertEquals(unpackedBAMFileList, expectedUnpackedFileList, + "Unpacked BAM file list does not contain correct BAM file names"); + } +} diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/text/TextFormattingUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/text/TextFormattingUtilsUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/text/TextFormattingUtilsUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/text/TextFormattingUtilsUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/threading/ThreadPoolMonitorUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/threading/ThreadPoolMonitorUnitTest.java similarity index 100% rename from 
public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/threading/ThreadPoolMonitorUnitTest.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/threading/ThreadPoolMonitorUnitTest.java diff --git a/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java new file mode 100644 index 000000000..860c54736 --- /dev/null +++ b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -0,0 +1,1611 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.gatk.utils.variant; + +import htsjdk.variant.variantcontext.*; +import org.broadinstitute.gatk.utils.*; +import org.broadinstitute.gatk.utils.collections.Pair; +import org.broadinstitute.gatk.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +public class GATKVariantContextUtilsUnitTest extends BaseTest { + private final static boolean DEBUG = false; + + Allele Aref, T, C, G, Cref, ATC, ATCATC; + Allele ATCATCT; + Allele ATref; + Allele Anoref; + Allele GT; + + private GenomeLocParser genomeLocParser; + + @BeforeSuite + public void setup() throws IOException { + // alleles + Aref = Allele.create("A", true); + Cref = Allele.create("C", true); + T = Allele.create("T"); + C = Allele.create("C"); + G = Allele.create("G"); + ATC = Allele.create("ATC"); + ATCATC = Allele.create("ATCATC"); + ATCATCT = Allele.create("ATCATCT"); + ATref = Allele.create("AT",true); + Anoref = Allele.create("A",false); + GT = Allele.create("GT",false); + genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(hg18Reference))); + } + + private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError, int... pls) { + return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).PL(pls).make(); + } + + + private Genotype makeG(String sample, Allele a1, Allele a2, double log10pError) { + return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).log10PError(log10pError).make(); + } + + private VariantContext makeVC(String source, List alleles) { + return makeVC(source, alleles, null, null); + } + + private VariantContext makeVC(String source, List alleles, Genotype... 
g1) { + return makeVC(source, alleles, Arrays.asList(g1)); + } + + private VariantContext makeVC(String source, List alleles, String filter) { + return makeVC(source, alleles, filter.equals(".") ? null : new HashSet(Arrays.asList(filter))); + } + + private VariantContext makeVC(String source, List alleles, Set filters) { + return makeVC(source, alleles, null, filters); + } + + private VariantContext makeVC(String source, List alleles, Collection genotypes) { + return makeVC(source, alleles, genotypes, null); + } + + private VariantContext makeVC(String source, List alleles, Collection genotypes, Set filters) { + int start = 10; + int stop = start + alleles.get(0).length() - 1; // alleles.contains(ATC) ? start + 3 : start; + return new VariantContextBuilder(source, "1", start, stop, alleles).genotypes(genotypes).filters(filters).make(); + } + + // -------------------------------------------------------------------------------- + // + // Test allele merging + // + // -------------------------------------------------------------------------------- + + private class MergeAllelesTest extends TestDataProvider { + List> inputs; + List expected; + + private MergeAllelesTest(List... 
arg) { + super(MergeAllelesTest.class); + LinkedList> all = new LinkedList<>(Arrays.asList(arg)); + expected = all.pollLast(); + inputs = all; + } + + public String toString() { + return String.format("MergeAllelesTest input=%s expected=%s", inputs, expected); + } + } + @DataProvider(name = "mergeAlleles") + public Object[][] mergeAllelesData() { + // first, do no harm + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref)); + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref), + Arrays.asList(Aref)); + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref, T), + Arrays.asList(Aref, T)); + + new MergeAllelesTest(Arrays.asList(Aref, C), + Arrays.asList(Aref, T), + Arrays.asList(Aref, C, T)); + + new MergeAllelesTest(Arrays.asList(Aref, T), + Arrays.asList(Aref, C), + Arrays.asList(Aref, T, C)); // in order of appearence + + new MergeAllelesTest(Arrays.asList(Aref, C, T), + Arrays.asList(Aref, C), + Arrays.asList(Aref, C, T)); + + new MergeAllelesTest(Arrays.asList(Aref, C, T), Arrays.asList(Aref, C, T)); + + new MergeAllelesTest(Arrays.asList(Aref, T, C), Arrays.asList(Aref, T, C)); + + new MergeAllelesTest(Arrays.asList(Aref, T, C), + Arrays.asList(Aref, C), + Arrays.asList(Aref, T, C)); // in order of appearence + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref, ATC), + Arrays.asList(Aref, ATC)); + + new MergeAllelesTest(Arrays.asList(Aref), + Arrays.asList(Aref, ATC, ATCATC), + Arrays.asList(Aref, ATC, ATCATC)); + + // alleles in the order we see them + new MergeAllelesTest(Arrays.asList(Aref, ATCATC), + Arrays.asList(Aref, ATC, ATCATC), + Arrays.asList(Aref, ATCATC, ATC)); + + // same + new MergeAllelesTest(Arrays.asList(Aref, ATC), + Arrays.asList(Aref, ATCATC), + Arrays.asList(Aref, ATC, ATCATC)); + + new MergeAllelesTest(Arrays.asList(ATref, ATC, Anoref, G), + Arrays.asList(Aref, ATCATC, G), + Arrays.asList(ATref, ATC, Anoref, G, ATCATCT, GT)); + + return MergeAllelesTest.getTests(MergeAllelesTest.class); 
+ } + + @Test(enabled = !DEBUG, dataProvider = "mergeAlleles") + public void testMergeAlleles(MergeAllelesTest cfg) { + final List inputs = new ArrayList(); + + int i = 0; + for ( final List alleles : cfg.inputs ) { + final String name = "vcf" + ++i; + inputs.add(makeVC(name, alleles)); + } + + final List priority = vcs2priority(inputs); + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + inputs, priority, + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); + + Assert.assertEquals(merged.getAlleles().size(),cfg.expected.size()); + Assert.assertEquals(merged.getAlleles(), cfg.expected); + } + + // -------------------------------------------------------------------------------- + // + // Test rsID merging + // + // -------------------------------------------------------------------------------- + + private class SimpleMergeRSIDTest extends TestDataProvider { + List inputs; + String expected; + + private SimpleMergeRSIDTest(String... 
arg) { + super(SimpleMergeRSIDTest.class); + LinkedList allStrings = new LinkedList(Arrays.asList(arg)); + expected = allStrings.pollLast(); + inputs = allStrings; + } + + public String toString() { + return String.format("SimpleMergeRSIDTest vc=%s expected=%s", inputs, expected); + } + } + + @DataProvider(name = "simplemergersiddata") + public Object[][] createSimpleMergeRSIDData() { + new SimpleMergeRSIDTest(".", "."); + new SimpleMergeRSIDTest(".", ".", "."); + new SimpleMergeRSIDTest("rs1", "rs1"); + new SimpleMergeRSIDTest("rs1", "rs1", "rs1"); + new SimpleMergeRSIDTest(".", "rs1", "rs1"); + new SimpleMergeRSIDTest("rs1", ".", "rs1"); + new SimpleMergeRSIDTest("rs1", "rs2", "rs1,rs2"); + new SimpleMergeRSIDTest("rs1", "rs2", "rs1", "rs1,rs2"); // duplicates + new SimpleMergeRSIDTest("rs2", "rs1", "rs2,rs1"); + new SimpleMergeRSIDTest("rs2", "rs1", ".", "rs2,rs1"); + new SimpleMergeRSIDTest("rs2", ".", "rs1", "rs2,rs1"); + new SimpleMergeRSIDTest("rs1", ".", ".", "rs1"); + new SimpleMergeRSIDTest("rs1", "rs2", "rs3", "rs1,rs2,rs3"); + + return SimpleMergeRSIDTest.getTests(SimpleMergeRSIDTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "simplemergersiddata") + public void testRSIDMerge(SimpleMergeRSIDTest cfg) { + VariantContext snpVC1 = makeVC("snpvc1", Arrays.asList(Aref, T)); + final List inputs = new ArrayList(); + + for ( final String id : cfg.inputs ) { + inputs.add(new VariantContextBuilder(snpVC1).id(id).make()); + } + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + inputs, null, + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); + Assert.assertEquals(merged.getID(), cfg.expected); + } + + // -------------------------------------------------------------------------------- + // + // Test filtered merging + // + // -------------------------------------------------------------------------------- + + private class 
MergeFilteredTest extends TestDataProvider { + List inputs; + VariantContext expected; + String setExpected; + GATKVariantContextUtils.FilteredRecordMergeType type; + + + private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, String setExpected) { + this(name, input1, input2, expected, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, setExpected); + } + + private MergeFilteredTest(String name, VariantContext input1, VariantContext input2, VariantContext expected, GATKVariantContextUtils.FilteredRecordMergeType type, String setExpected) { + super(MergeFilteredTest.class, name); + LinkedList all = new LinkedList(Arrays.asList(input1, input2)); + this.expected = expected; + this.type = type; + inputs = all; + this.setExpected = setExpected; + } + + public String toString() { + return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); + } + } + + @DataProvider(name = "mergeFiltered") + public Object[][] mergeFilteredData() { + new MergeFilteredTest("AllPass", + makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + GATKVariantContextUtils.MERGE_INTERSECTION); + + new MergeFilteredTest("noFilters", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), "."), + makeVC("3", Arrays.asList(Aref, T), "."), + GATKVariantContextUtils.MERGE_INTERSECTION); + + new MergeFilteredTest("oneFiltered", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), "."), + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + new MergeFilteredTest("onePassOneFail", + makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), 
VariantContext.PASSES_FILTERS), + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + new MergeFilteredTest("AllFiltered", + makeVC("1", Arrays.asList(Aref, T), "FAIL"), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), "FAIL"), + GATKVariantContextUtils.MERGE_FILTER_IN_ALL); + + // test ALL vs. ANY + new MergeFilteredTest("FailOneUnfiltered", + makeVC("1", Arrays.asList(Aref, T), "FAIL"), + makeVC("2", Arrays.asList(Aref, T), "."), + makeVC("3", Arrays.asList(Aref, T), "."), + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + new MergeFilteredTest("OneFailAllUnfilteredArg", + makeVC("1", Arrays.asList(Aref, T), "FAIL"), + makeVC("2", Arrays.asList(Aref, T), "."), + makeVC("3", Arrays.asList(Aref, T), "FAIL"), + GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ALL_UNFILTERED, + String.format("%s1-2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + // test excluding allele in filtered record + new MergeFilteredTest("DontIncludeAlleleOfFilteredRecords", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), "FAIL"), + makeVC("3", Arrays.asList(Aref, T), "."), + String.format("1-%s2", GATKVariantContextUtils.MERGE_FILTER_PREFIX)); + + // promotion of site from unfiltered to PASSES + new MergeFilteredTest("UnfilteredPlusPassIsPass", + makeVC("1", Arrays.asList(Aref, T), "."), + makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + GATKVariantContextUtils.MERGE_INTERSECTION); + + new MergeFilteredTest("RefInAll", + makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + GATKVariantContextUtils.MERGE_REF_IN_ALL); + + new MergeFilteredTest("RefInOne", + 
makeVC("1", Arrays.asList(Aref), VariantContext.PASSES_FILTERS), + makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + makeVC("3", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS), + "2"); + + return MergeFilteredTest.getTests(MergeFilteredTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "mergeFiltered") + public void testMergeFiltered(MergeFilteredTest cfg) { + final List priority = vcs2priority(cfg.inputs); + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + + // test alleles are equal + Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); + + // test set field + Assert.assertEquals(merged.getAttribute("set"), cfg.setExpected); + + // test filter field + Assert.assertEquals(merged.getFilters(), cfg.expected.getFilters()); + } + + // -------------------------------------------------------------------------------- + // + // Test genotype merging + // + // -------------------------------------------------------------------------------- + + private class MergeGenotypesTest extends TestDataProvider { + List inputs; + VariantContext expected; + List priority; + + private MergeGenotypesTest(String name, String priority, VariantContext... 
arg) { + super(MergeGenotypesTest.class, name); + LinkedList all = new LinkedList(Arrays.asList(arg)); + this.expected = all.pollLast(); + inputs = all; + this.priority = Arrays.asList(priority.split(",")); + } + + public String toString() { + return String.format("%s input=%s expected=%s", super.toString(), inputs, expected); + } + } + + @DataProvider(name = "mergeGenotypes") + public Object[][] mergeGenotypesData() { + new MergeGenotypesTest("TakeGenotypeByPriority-1,2", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1))); + + new MergeGenotypesTest("TakeGenotypeByPriority-1,2-nocall", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1))); + + new MergeGenotypesTest("TakeGenotypeByPriority-2,1", "2,1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2))); + + new MergeGenotypesTest("NonOverlappingGenotypes", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s2", Aref, T, -2))); + + new MergeGenotypesTest("PreserveNoCall", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s2", Aref, T, -2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Allele.NO_CALL, Allele.NO_CALL, -1), makeG("s2", Aref, T, -2))); + + new MergeGenotypesTest("PerserveAlleles", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", 
Arrays.asList(Aref, C), makeG("s2", Aref, C, -2)), + makeVC("3", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1), makeG("s2", Aref, C, -2))); + + new MergeGenotypesTest("TakeGenotypePartialOverlap-1,2", "1,2", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1), makeG("s3", Aref, T, -3))); + + new MergeGenotypesTest("TakeGenotypePartialOverlap-2,1", "2,1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2), makeG("s3", Aref, T, -3))); + + // + // merging genothpes with PLs + // + + // first, do no harm + new MergeGenotypesTest("OrderedPLs", "1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3)), + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1, 1, 2, 3))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles", "1", + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6))); + + // first, do no harm + new MergeGenotypesTest("OrderedPLs-3Alleles-2", "1", + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6)), + makeVC("1", Arrays.asList(Aref, T, C), makeG("s1", Aref, T, -1, 1, 2, 3, 4, 5, 6), makeG("s2", Aref, C, -1, 1, 2, 3, 4, 5, 6))); + + new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-2,1", 
"2,1", + makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1,5,0,3)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), + makeVC("3", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2))); + + new MergeGenotypesTest("TakeGenotypePartialOverlapWithPLs-1,2", "1,2", + makeVC("1", Arrays.asList(Aref,ATC), makeG("s1", Aref, ATC, -1,5,0,3)), + makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2,4,0,2), makeG("s3", Aref, T, -3,3,0,2)), + // no likelihoods on result since type changes to mixed multiallelic + makeVC("3", Arrays.asList(Aref, ATC, T), makeG("s1", Aref, ATC, -1), makeG("s3", Aref, T, -3))); + + new MergeGenotypesTest("MultipleSamplePLsDifferentOrder", "1,2", + makeVC("1", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1, 1, 2, 3, 4, 5, 6)), + makeVC("2", Arrays.asList(Aref, T, C), makeG("s2", Aref, T, -2, 6, 5, 4, 3, 2, 1)), + // no likelihoods on result since type changes to mixed multiallelic + makeVC("3", Arrays.asList(Aref, C, T), makeG("s1", Aref, C, -1), makeG("s2", Aref, T, -2))); + + return MergeGenotypesTest.getTests(MergeGenotypesTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "mergeGenotypes") + public void testMergeGenotypes(MergeGenotypesTest cfg) { + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); + + // test alleles are equal + Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); + + // test genotypes + assertGenotypesAreMostlyEqual(merged.getGenotypes(), cfg.expected.getGenotypes()); + } + + // necessary to not overload equals for genotypes + private void assertGenotypesAreMostlyEqual(GenotypesContext actual, GenotypesContext expected) { + if (actual == expected) { + return; + } + + if (actual == null || 
expected == null) { + Assert.fail("Maps not equal: expected: " + expected + " and actual: " + actual); + } + + if (actual.size() != expected.size()) { + Assert.fail("Maps do not have the same size:" + actual.size() + " != " + expected.size()); + } + + for (Genotype value : actual) { + Genotype expectedValue = expected.get(value.getSampleName()); + + Assert.assertEquals(value.getAlleles(), expectedValue.getAlleles(), "Alleles in Genotype aren't equal"); + Assert.assertEquals(value.getGQ(), expectedValue.getGQ(), "GQ values aren't equal"); + Assert.assertEquals(value.hasLikelihoods(), expectedValue.hasLikelihoods(), "Either both have likelihoods or both not"); + if ( value.hasLikelihoods() ) + Assert.assertEquals(value.getLikelihoods().getAsVector(), expectedValue.getLikelihoods().getAsVector(), "Genotype likelihoods aren't equal"); + } + } + + @Test(enabled = !DEBUG) + public void testMergeGenotypesUniquify() { + final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); + final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); + + // test genotypes + Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2"))); + } + +// TODO: remove after testing +// @Test(expectedExceptions = IllegalStateException.class) +// public void testMergeGenotypesRequireUnique() { +// final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); +// final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); +// +// final VariantContext merged = VariantContextUtils.simpleMerge( +// Arrays.asList(vc1, vc2), null, 
VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, +// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false, false); +// } + + // -------------------------------------------------------------------------------- + // + // Misc. tests + // + // -------------------------------------------------------------------------------- + + @Test(enabled = !DEBUG) + public void testAnnotationSet() { + for ( final boolean annotate : Arrays.asList(true, false)) { + for ( final String set : Arrays.asList("set", "combine", "x")) { + final List priority = Arrays.asList("1", "2"); + VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); + VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), VariantContext.PASSES_FILTERS); + + final VariantContext merged = GATKVariantContextUtils.simpleMerge( + Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); + + if ( annotate ) + Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); + else + Assert.assertFalse(merged.hasAttribute(set)); + } + } + } + + private static final List vcs2priority(final Collection vcs) { + final List priority = new ArrayList<>(); + + for ( final VariantContext vc : vcs ) { + priority.add(vc.getSource()); + } + + return priority; + } + + // -------------------------------------------------------------------------------- + // + // basic allele clipping test + // + // -------------------------------------------------------------------------------- + + private class ReverseClippingPositionTestProvider extends TestDataProvider { + final String ref; + final List alleles = new ArrayList(); + final int expectedClip; + + private ReverseClippingPositionTestProvider(final int expectedClip, final String ref, final String... 
alleles) { + super(ReverseClippingPositionTestProvider.class); + this.ref = ref; + for ( final String allele : alleles ) + this.alleles.add(Allele.create(allele)); + this.expectedClip = expectedClip; + } + + @Override + public String toString() { + return String.format("ref=%s allele=%s reverse clip %d", ref, alleles, expectedClip); + } + } + + @DataProvider(name = "ReverseClippingPositionTestProvider") + public Object[][] makeReverseClippingPositionTestProvider() { + // pair clipping + new ReverseClippingPositionTestProvider(0, "ATT", "CCG"); + new ReverseClippingPositionTestProvider(1, "ATT", "CCT"); + new ReverseClippingPositionTestProvider(2, "ATT", "CTT"); + new ReverseClippingPositionTestProvider(2, "ATT", "ATT"); // cannot completely clip allele + + // triplets + new ReverseClippingPositionTestProvider(0, "ATT", "CTT", "CGG"); + new ReverseClippingPositionTestProvider(1, "ATT", "CTT", "CGT"); // the T can go + new ReverseClippingPositionTestProvider(2, "ATT", "CTT", "CTT"); // both Ts can go + + return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); + } + + @Test(enabled = !DEBUG, dataProvider = "ReverseClippingPositionTestProvider") + public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { + int result = GATKVariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes()); + Assert.assertEquals(result, cfg.expectedClip); + } + + + // -------------------------------------------------------------------------------- + // + // test splitting into bi-allelics + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "SplitBiallelics") + public Object[][] makeSplitBiallelics() throws CloneNotSupportedException { + List tests = new ArrayList(); + + final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C)); + + // biallelic -> biallelic + tests.add(new Object[]{root.make(), 
Arrays.asList(root.make())}); + + // monos -> monos + root.alleles(Arrays.asList(Aref)); + tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); + + root.alleles(Arrays.asList(Aref, C, T)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make())}); + + root.alleles(Arrays.asList(Aref, C, T, G)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make(), + root.alleles(Arrays.asList(Aref, G)).make())}); + + final Allele C = Allele.create("C"); + final Allele CA = Allele.create("CA"); + final Allele CAA = Allele.create("CAA"); + final Allele CAAAA = Allele.create("CAAAA"); + final Allele CAAAAA = Allele.create("CAAAAA"); + final Allele Cref = Allele.create("C", true); + final Allele CAref = Allele.create("CA", true); + final Allele CAAref = Allele.create("CAA", true); + final Allele CAAAref = Allele.create("CAAA", true); + + root.alleles(Arrays.asList(Cref, CA, CAA)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CA)).make(), + root.alleles(Arrays.asList(Cref, CAA)).make())}); + + root.alleles(Arrays.asList(CAAref, C, CA)).stop(12); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(CAAref, C)).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(CAAAref, C)).make(), + root.alleles(Arrays.asList(CAAref, C)).stop(12).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(), + root.alleles(Arrays.asList(Cref, CA)).stop(10).make(), + 
root.alleles(Arrays.asList(CAref, C)).stop(11).make(), + root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())}); + + final Allele threeCopies = Allele.create("GTTTTATTTTATTTTA", true); + final Allele twoCopies = Allele.create("GTTTTATTTTA", true); + final Allele zeroCopies = Allele.create("G", false); + final Allele oneCopies = Allele.create("GTTTTA", false); + tests.add(new Object[]{root.alleles(Arrays.asList(threeCopies, zeroCopies, oneCopies)).stop(25).make(), + Arrays.asList( + root.alleles(Arrays.asList(threeCopies, zeroCopies)).stop(25).make(), + root.alleles(Arrays.asList(twoCopies, zeroCopies)).stop(20).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics") + public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vc); + Assert.assertEquals(biallelics.size(), expectedBiallelics.size()); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext actual = biallelics.get(i); + final VariantContext expected = expectedBiallelics.get(i); + assertVariantContextsAreEqual(actual, expected); + } + } + + @Test(enabled = !DEBUG, dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes") + public void testSplitBiallelicsGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List genotypes = new ArrayList(); + + int sampleI = 0; + for ( final List alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) { + genotypes.add(GenotypeBuilder.create("sample" + sampleI++, alleles)); + } + genotypes.add(GenotypeBuilder.createMissing("missing", 2)); + + final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make(); + + final List biallelics = GATKVariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext 
actual = biallelics.get(i); + Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples + + for ( final Genotype inputGenotype : genotypes ) { + final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName()); + Assert.assertNotNull(actualGenotype); + if ( ! vc.isVariant() || vc.isBiallelic() ) + Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName())); + else + Assert.assertTrue(actualGenotype.isNoCall()); + } + } + } + + // -------------------------------------------------------------------------------- + // + // Test repeats + // + // -------------------------------------------------------------------------------- + + private class RepeatDetectorTest extends TestDataProvider { + String ref; + boolean isTrueRepeat; + VariantContext vc; + + private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) { + super(RepeatDetectorTest.class); + this.isTrueRepeat = isTrueRepeat; + this.ref = ref; + + List alleles = new LinkedList(); + final Allele refAllele = Allele.create(refAlleleString, true); + alleles.add(refAllele); + for ( final String altString: altAlleleStrings) { + final Allele alt = Allele.create(altString, false); + alleles.add(alt); + } + + VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, refAllele.length(), alleles); + this.vc = builder.make(); + } + + public String toString() { + return String.format("%s refBases=%s trueRepeat=%b vc=%s", super.toString(), ref, isTrueRepeat, vc); + } + } + + @DataProvider(name = "RepeatDetectorTest") + public Object[][] makeRepeatDetectorTest() { + new RepeatDetectorTest(true, "NAAC", "N", "NA"); + new RepeatDetectorTest(true, "NAAC", "NA", "N"); + new RepeatDetectorTest(false, "NAAC", "NAA", "N"); + new RepeatDetectorTest(false, "NAAC", "N", "NC"); + new RepeatDetectorTest(false, "AAC", "A", "C"); + + // running out of ref bases => false + new 
RepeatDetectorTest(false, "NAAC", "N", "NCAGTA"); + + // complex repeats + new RepeatDetectorTest(true, "NATATATC", "N", "NAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATA", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATAT", "N"); + + // multi-allelic + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATA"); // two As + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NATC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NCC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "NATAT", "NCC"); // false + + return RepeatDetectorTest.getTests(RepeatDetectorTest.class); + } + + @Test(enabled = !DEBUG, dataProvider = "RepeatDetectorTest") + public void testRepeatDetectorTest(RepeatDetectorTest cfg) { + + // test alleles are equal + Assert.assertEquals(GATKVariantContextUtils.isTandemRepeat(cfg.vc, cfg.ref.getBytes()), cfg.isTrueRepeat); + } + + @Test(enabled = !DEBUG) + public void testRepeatAllele() { + Allele nullR = Allele.create("A", true); + Allele nullA = Allele.create("A", false); + Allele atc = Allele.create("AATC", false); + Allele atcatc = Allele.create("AATCATC", false); + Allele ccccR = Allele.create("ACCCC", true); + Allele cc = Allele.create("ACC", false); + Allele cccccc = Allele.create("ACCCCCC", false); + Allele gagaR = Allele.create("AGAGA", true); + Allele gagagaga = Allele.create("AGAGAGAGA", false); + + // - / ATC [ref] from 20-22 + String delLoc = "chr1"; + int delLocStart = 20; + int delLocStop = 22; + + // - [ref] / ATC from 20-20 + String insLoc = "chr1"; + int insLocStart = 20; + int insLocStop = 20; + + Pair,byte[]> result; + 
byte[] refBytes = "TATCATCATCGGA".getBytes(); + + Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("ATG".getBytes(), "ATGATGATGATG".getBytes(), true),4); + Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("G".getBytes(), "ATGATGATGATG".getBytes(), true),0); + Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("T".getBytes(), "T".getBytes(), true),1); + Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("AT".getBytes(), "ATGATGATCATG".getBytes(), true),1); + Assert.assertEquals(GATKVariantContextUtils.findNumberOfRepetitions("CCC".getBytes(), "CCCCCCCC".getBytes(), true),2); + + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("ATG".getBytes()),3); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AAA".getBytes()),1); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACAC".getBytes()),7); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CACACA".getBytes()),2); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("CATGCATG".getBytes()),4); + Assert.assertEquals(GATKVariantContextUtils.findRepeatedSubstring("AATAATA".getBytes()),7); + + + // A*,ATC, context = ATC ATC ATC : (ATC)3 -> (ATC)4 + VariantContext vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStop, Arrays.asList(nullR,atc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],3); + Assert.assertEquals(result.getFirst().toArray()[1],4); + Assert.assertEquals(result.getSecond().length,3); + + // ATC*,A,ATCATC + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+3, Arrays.asList(Allele.create("AATC", true),nullA,atcatc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],3); + Assert.assertEquals(result.getFirst().toArray()[1],2); + 
Assert.assertEquals(result.getFirst().toArray()[2],4); + Assert.assertEquals(result.getSecond().length,3); + + // simple non-tandem deletion: CCCC*, - + refBytes = "TCCCCCCCCATG".getBytes(); + vc = new VariantContextBuilder("foo", delLoc, 10, 14, Arrays.asList(ccccR,nullA)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],8); + Assert.assertEquals(result.getFirst().toArray()[1],4); + Assert.assertEquals(result.getSecond().length,1); + + // CCCC*,CC,-,CCCCCC, context = CCC: (C)7 -> (C)5,(C)3,(C)9 + refBytes = "TCCCCCCCAGAGAGAG".getBytes(); + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(ccccR,cc, nullA,cccccc)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],7); + Assert.assertEquals(result.getFirst().toArray()[1],5); + Assert.assertEquals(result.getFirst().toArray()[2],3); + Assert.assertEquals(result.getFirst().toArray()[3],9); + Assert.assertEquals(result.getSecond().length,1); + + // GAGA*,-,GAGAGAGA + refBytes = "TGAGAGAGAGATTT".getBytes(); + vc = new VariantContextBuilder("foo", insLoc, insLocStart, insLocStart+4, Arrays.asList(gagaR, nullA,gagagaga)).make(); + result = GATKVariantContextUtils.getNumTandemRepeatUnits(vc, refBytes); + Assert.assertEquals(result.getFirst().toArray()[0],5); + Assert.assertEquals(result.getFirst().toArray()[1],3); + Assert.assertEquals(result.getFirst().toArray()[2],7); + Assert.assertEquals(result.getSecond().length,2); + + } + + // -------------------------------------------------------------------------------- + // + // test forward clipping + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ForwardClippingData") + public Object[][] makeForwardClippingData() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input 
data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList("A"), -1}); + tests.add(new Object[]{Arrays.asList(""), -1}); + tests.add(new Object[]{Arrays.asList("A", "C"), -1}); + tests.add(new Object[]{Arrays.asList("AC", "C"), -1}); + tests.add(new Object[]{Arrays.asList("A", "G"), -1}); + tests.add(new Object[]{Arrays.asList("A", "T"), -1}); + tests.add(new Object[]{Arrays.asList("GT", "CA"), -1}); + tests.add(new Object[]{Arrays.asList("GT", "CT"), -1}); + tests.add(new Object[]{Arrays.asList("ACC", "AC"), 0}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), 1}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), 2}); + tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), 0}); + tests.add(new Object[]{Arrays.asList("A", ""), -1}); + for ( int len = 0; len < 50; len++ ) + tests.add(new Object[]{Arrays.asList("A" + new String(Utils.dupBytes((byte)'C', len)), "C"), -1}); + + tests.add(new Object[]{Arrays.asList("A", "T", "C"), -1}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), 0}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "A"), -1}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), 0}); + tests.add(new Object[]{Arrays.asList("AC", "AC", "ACG"), 0}); + tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), 0}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), 1}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), 1}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "ForwardClippingData") + public void testForwardClipping(final List alleleStrings, final int expectedClip) { + final List alleles = new LinkedList(); + for ( final String alleleString : alleleStrings ) + alleles.add(Allele.create(alleleString)); + + for ( final List myAlleles : Utils.makePermutations(alleles, alleles.size(), false)) { + final int actual = 
GATKVariantContextUtils.computeForwardClipping(myAlleles); + Assert.assertEquals(actual, expectedClip); + } + } + + @DataProvider(name = "ClipAlleleTest") + public Object[][] makeClipAlleleTest() { + List tests = new ArrayList(); + + // this functionality can be adapted to provide input data for whatever you might want in your data + tests.add(new Object[]{Arrays.asList("ACC", "AC"), Arrays.asList("AC", "A"), 0}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACG"), Arrays.asList("GC", "G"), 2}); + tests.add(new Object[]{Arrays.asList("ACGC", "ACGA"), Arrays.asList("C", "A"), 3}); + tests.add(new Object[]{Arrays.asList("ACGC", "AGC"), Arrays.asList("AC", "A"), 0}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "AG"), Arrays.asList("T", "C", "G"), 1}); + tests.add(new Object[]{Arrays.asList("AT", "AC", "ACG"), Arrays.asList("T", "C", "CG"), 1}); + tests.add(new Object[]{Arrays.asList("AC", "ACT", "ACG"), Arrays.asList("C", "CT", "CG"), 1}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGTA"), Arrays.asList("G", "GT", "GTA"), 2}); + tests.add(new Object[]{Arrays.asList("ACG", "ACGT", "ACGCA"), Arrays.asList("G", "GT", "GCA"), 2}); + + // trims from left and right + tests.add(new Object[]{Arrays.asList("ACGTT", "ACCTT"), Arrays.asList("G", "C"), 2}); + tests.add(new Object[]{Arrays.asList("ACGTT", "ACCCTT"), Arrays.asList("G", "CC"), 2}); + tests.add(new Object[]{Arrays.asList("ACGTT", "ACGCTT"), Arrays.asList("G", "GC"), 2}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "ClipAlleleTest") + public void testClipAlleles(final List alleleStrings, final List expected, final int numLeftClipped) { + final int start = 10; + final VariantContext unclipped = GATKVariantContextUtils.makeFromAlleles("test", "20", start, alleleStrings); + final VariantContext clipped = GATKVariantContextUtils.trimAlleles(unclipped, true, true); + + Assert.assertEquals(clipped.getStart(), unclipped.getStart() + numLeftClipped); + for 
( int i = 0; i < unclipped.getAlleles().size(); i++ ) { + final Allele trimmed = clipped.getAlleles().get(i); + Assert.assertEquals(trimmed.getBaseString(), expected.get(i)); + } + } + + // -------------------------------------------------------------------------------- + // + // test primitive allele splitting + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "PrimitiveAlleleSplittingData") + public Object[][] makePrimitiveAlleleSplittingData() { + List tests = new ArrayList<>(); + + // no split + tests.add(new Object[]{"A", "C", 0, null}); + tests.add(new Object[]{"A", "AC", 0, null}); + tests.add(new Object[]{"AC", "A", 0, null}); + + // one split + tests.add(new Object[]{"ACA", "GCA", 1, Arrays.asList(0)}); + tests.add(new Object[]{"ACA", "AGA", 1, Arrays.asList(1)}); + tests.add(new Object[]{"ACA", "ACG", 1, Arrays.asList(2)}); + + // two splits + tests.add(new Object[]{"ACA", "GGA", 2, Arrays.asList(0, 1)}); + tests.add(new Object[]{"ACA", "GCG", 2, Arrays.asList(0, 2)}); + tests.add(new Object[]{"ACA", "AGG", 2, Arrays.asList(1, 2)}); + + // three splits + tests.add(new Object[]{"ACA", "GGG", 3, Arrays.asList(0, 1, 2)}); + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "PrimitiveAlleleSplittingData") + public void testPrimitiveAlleleSplitting(final String ref, final String alt, final int expectedSplit, final List variantPositions) { + + final int start = 10; + final VariantContext vc = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); + + final List result = GATKVariantContextUtils.splitIntoPrimitiveAlleles(vc); + + if ( expectedSplit > 0 ) { + Assert.assertEquals(result.size(), expectedSplit); + for ( int i = 0; i < variantPositions.size(); i++ ) { + Assert.assertEquals(result.get(i).getStart(), start + variantPositions.get(i)); + } + } else { + Assert.assertEquals(result.size(), 1); + Assert.assertEquals(vc, 
result.get(0)); + } + } + + // -------------------------------------------------------------------------------- + // + // test allele remapping + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "AlleleRemappingData") + public Object[][] makeAlleleRemappingData() { + List tests = new ArrayList<>(); + + final Allele originalBase1 = Allele.create((byte)'A'); + final Allele originalBase2 = Allele.create((byte)'T'); + + for ( final byte base1 : BaseUtils.BASES ) { + for ( final byte base2 : BaseUtils.BASES ) { + for ( final int numGenotypes : Arrays.asList(0, 1, 2, 5) ) { + Map map = new HashMap<>(2); + map.put(originalBase1, Allele.create(base1)); + map.put(originalBase2, Allele.create(base2)); + + tests.add(new Object[]{map, numGenotypes}); + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "AlleleRemappingData") + public void testAlleleRemapping(final Map alleleMap, final int numGenotypes) { + + final GATKVariantContextUtils.AlleleMapper alleleMapper = new GATKVariantContextUtils.AlleleMapper(alleleMap); + + final GenotypesContext originalGC = createGenotypesContext(numGenotypes, new ArrayList(alleleMap.keySet())); + + final GenotypesContext remappedGC = GATKVariantContextUtils.updateGenotypesWithMappedAlleles(originalGC, alleleMapper); + + for ( int i = 0; i < numGenotypes; i++ ) { + final Genotype originalG = originalGC.get(String.format("%d", i)); + final Genotype remappedG = remappedGC.get(String.format("%d", i)); + + Assert.assertEquals(originalG.getAlleles().size(), remappedG.getAlleles().size()); + for ( int j = 0; j < originalG.getAlleles().size(); j++ ) + Assert.assertEquals(remappedG.getAllele(j), alleleMap.get(originalG.getAllele(j))); + } + } + + private static GenotypesContext createGenotypesContext(final int numGenotypes, final List alleles) { + Utils.resetRandomGenerator(); + final Random random = Utils.getRandomGenerator(); + + final 
GenotypesContext gc = GenotypesContext.create(); + for ( int i = 0; i < numGenotypes; i++ ) { + // choose alleles at random + final List myAlleles = new ArrayList(); + myAlleles.add(alleles.get(random.nextInt(2))); + myAlleles.add(alleles.get(random.nextInt(2))); + + final Genotype g = new GenotypeBuilder(String.format("%d", i)).alleles(myAlleles).make(); + gc.add(g); + } + + return gc; + } + + // -------------------------------------------------------------------------------- + // + // Test subsetDiploidAlleles + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "subsetDiploidAllelesData") + public Object[][] makesubsetDiploidAllelesData() { + List tests = new ArrayList<>(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + + final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); + + final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); + final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); + final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); + final double[] uninformative = new double[]{0, 0, 0}; + + final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(50).make(); + + // make sure we don't screw up the simple case + final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); + final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); + final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new 
int[]{10,2}).PL(homVarPL).GQ(8).make(); + + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), AC, Arrays.asList(new GenotypeBuilder(aaGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), AC, Arrays.asList(new GenotypeBuilder(acGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), AC, Arrays.asList(new GenotypeBuilder(ccGT).make())}); + + // uninformative test case + final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).PL(uninformative).GQ(0).make(); + final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noPL().noGQ().make(); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), AC, Arrays.asList(emptyGT)}); + + // actually subsetting down from multiple alt values + final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; + final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; + final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; + final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; + final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG + final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homRef3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetRefC3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(AC).PL(new double[]{-10, 0, -20}).GQ(100).make())}); + + tests.add(new Object[]{ + new 
VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homC3AllelesPL).make()).make(), + AC, + Arrays.asList(new GenotypeBuilder(base).alleles(CC).PL(new double[]{-20, -10, 0}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetRefG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(AG).PL(new double[]{-20, 0, -50}).GQ(200).make())}); + + // wow, scary -- bad output but discussed with Eric and we think this is the only thing that can be done + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(hetCG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).GQ(200).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).PL(homG3AllelesPL).make()).make(), + AG, + Arrays.asList(new GenotypeBuilder(base).alleles(GG).PL(new double[]{-20, -40, 0}).GQ(200).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "subsetDiploidAllelesData") + public void testsubsetDiploidAllelesData(final VariantContext inputVC, + final List allelesToUse, + final List expectedGenotypes) { + final GenotypesContext actual = GATKVariantContextUtils.subsetDiploidAlleles(inputVC, allelesToUse, GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); + + Assert.assertEquals(actual.size(), expectedGenotypes.size()); + for ( final Genotype expected : expectedGenotypes ) { + final Genotype actualGT = actual.get(expected.getSampleName()); + Assert.assertNotNull(actualGT); + assertGenotypesAreEqual(actualGT, expected); + } + } + + @DataProvider(name = "UpdateGenotypeAfterSubsettingData") + public Object[][] makeUpdateGenotypeAfterSubsettingData() { + List tests = new ArrayList(); + + final 
Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + final List> allSubsetAlleles = Arrays.asList(AC,AG,ACG); + + final double[] homRefPL = new double[]{0.9, 0.09, 0.01}; + final double[] hetPL = new double[]{0.09, 0.9, 0.01}; + final double[] homVarPL = new double[]{0.01, 0.09, 0.9}; + final double[] uninformative = new double[]{0.33, 0.33, 0.33}; + final List allPLs = Arrays.asList(homRefPL, hetPL, homVarPL, uninformative); + + for ( final List alleles : allSubsetAlleles ) { + for ( final double[] pls : allPLs ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.SET_TO_NO_CALL, pls, AA, alleles, GATKVariantContextUtils.NO_CALL_ALLELES}); + } + } + + for ( final List originalGT : Arrays.asList(AA, AC, CC, AG, CG, GG) ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homRefPL, originalGT, AC, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, hetPL, originalGT, AC, AC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, homVarPL, originalGT, AC, CC}); +// tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, uninformative, AA, AC, GATKVariantContextUtils.NO_CALL_ALLELES}); + } + + for ( final double[] pls : allPLs ) { + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AC, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AC, AC}); + tests.add(new 
Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AC, CC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AC, AC}); + + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, AG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, AG, AG}); + + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AA, ACG, AA}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AC, ACG, AC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CC, ACG, CC}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, AG, ACG, AG}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, CG, ACG, CG}); + tests.add(new Object[]{GATKVariantContextUtils.GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, pls, GG, ACG, GG}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = !DEBUG, dataProvider = "UpdateGenotypeAfterSubsettingData") + public void testUpdateGenotypeAfterSubsetting(final GATKVariantContextUtils.GenotypeAssignmentMethod mode, + final double[] likelihoods, + final List originalGT, + final List allelesToUse, + final List expectedAlleles) { + final GenotypeBuilder gb = new GenotypeBuilder("test"); + final double[] log10Likelhoods = MathUtils.normalizeFromLog10(likelihoods, true, false); + GATKVariantContextUtils.updateGenotypeAfterSubsetting(originalGT, gb, mode, log10Likelhoods, 
allelesToUse); + final Genotype g = gb.make(); + Assert.assertEquals(new HashSet<>(g.getAlleles()), new HashSet<>(expectedAlleles)); + } + + @Test(enabled = !DEBUG) + public void testSubsetToRef() { + final Map tests = new LinkedHashMap<>(); + + for ( final List alleles : Arrays.asList(Arrays.asList(Aref), Arrays.asList(C), Arrays.asList(Aref, C), Arrays.asList(Aref, C, C) ) ) { + for ( final String name : Arrays.asList("test1", "test2") ) { + final GenotypeBuilder builder = new GenotypeBuilder(name, alleles); + builder.DP(10); + builder.GQ(30); + builder.AD(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1, 2} : new int[]{1, 2, 3})); + builder.PL(alleles.size() == 1 ? new int[]{1} : (alleles.size() == 2 ? new int[]{1,2} : new int[]{1,2,3})); + final List refs = Collections.nCopies(alleles.size(), Aref); + tests.put(builder.make(), builder.alleles(refs).noAD().noPL().make()); + } + } + + for ( final int n : Arrays.asList(1, 2, 3) ) { + for ( final List genotypes : Utils.makePermutations(new ArrayList<>(tests.keySet()), n, false) ) { + final VariantContext vc = new VariantContextBuilder("test", "20", 1, 1, Arrays.asList(Aref, C)).genotypes(genotypes).make(); + final GenotypesContext gc = GATKVariantContextUtils.subsetToRefOnly(vc, 2); + + Assert.assertEquals(gc.size(), genotypes.size()); + for ( int i = 0; i < genotypes.size(); i++ ) { +// logger.warn("Testing " + genotypes.get(i) + " => " + gc.get(i) + " " + tests.get(genotypes.get(i))); + assertGenotypesAreEqual(gc.get(i), tests.get(genotypes.get(i))); + } + } + } + } + + // -------------------------------------------------------------------------------- + // + // Test updatePLsAndAD + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "updatePLsAndADData") + public Object[][] makeUpdatePLsAndADData() { + List tests = new ArrayList<>(); + + final Allele A = Allele.create("A", true); + final Allele C = Allele.create("C"); + final 
Allele G = Allele.create("G"); + + final List AA = Arrays.asList(A,A); + final List AC = Arrays.asList(A,C); + final List CC = Arrays.asList(C,C); + final List AG = Arrays.asList(A,G); + final List CG = Arrays.asList(C,G); + final List GG = Arrays.asList(G,G); + final List ACG = Arrays.asList(A,C,G); + + final VariantContext vcBase = new VariantContextBuilder("test", "20", 10, 10, AC).make(); + + final double[] homRefPL = MathUtils.normalizeFromRealSpace(new double[]{0.9, 0.09, 0.01}); + final double[] hetPL = MathUtils.normalizeFromRealSpace(new double[]{0.09, 0.9, 0.01}); + final double[] homVarPL = MathUtils.normalizeFromRealSpace(new double[]{0.01, 0.09, 0.9}); + final double[] uninformative = new double[]{0, 0, 0}; + + final Genotype base = new GenotypeBuilder("NA12878").DP(10).GQ(100).make(); + + // make sure we don't screw up the simple case where no selection happens + final Genotype aaGT = new GenotypeBuilder(base).alleles(AA).AD(new int[]{10,2}).PL(homRefPL).GQ(8).make(); + final Genotype acGT = new GenotypeBuilder(base).alleles(AC).AD(new int[]{10,2}).PL(hetPL).GQ(8).make(); + final Genotype ccGT = new GenotypeBuilder(base).alleles(CC).AD(new int[]{10,2}).PL(homVarPL).GQ(8).make(); + + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(aaGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(aaGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(acGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(acGT).make())}); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(ccGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(new GenotypeBuilder(ccGT).make())}); + + // uninformative test cases + final Genotype uninformativeGT = new GenotypeBuilder(base).alleles(CC).noAD().PL(uninformative).GQ(0).make(); + tests.add(new Object[]{new 
VariantContextBuilder(vcBase).genotypes(uninformativeGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(uninformativeGT)}); + final Genotype emptyGT = new GenotypeBuilder(base).alleles(GATKVariantContextUtils.NO_CALL_ALLELES).noAD().noPL().noGQ().make(); + tests.add(new Object[]{new VariantContextBuilder(vcBase).genotypes(emptyGT).make(), new VariantContextBuilder(vcBase).alleles(AC).make(), Arrays.asList(emptyGT)}); + + // actually subsetting down from multiple alt values + final double[] homRef3AllelesPL = new double[]{0, -10, -20, -30, -40, -50}; + final double[] hetRefC3AllelesPL = new double[]{-10, 0, -20, -30, -40, -50}; + final double[] homC3AllelesPL = new double[]{-20, -10, 0, -30, -40, -50}; + final double[] hetRefG3AllelesPL = new double[]{-20, -10, -30, 0, -40, -50}; + final double[] hetCG3AllelesPL = new double[]{-20, -10, -30, -40, 0, -50}; // AA, AC, CC, AG, CG, GG + final double[] homG3AllelesPL = new double[]{-20, -10, -30, -40, -50, 0}; // AA, AC, CC, AG, CG, GG + + final int[] homRef3AllelesAD = new int[]{20, 0, 1}; + final int[] hetRefC3AllelesAD = new int[]{10, 10, 1}; + final int[] homC3AllelesAD = new int[]{0, 20, 1}; + final int[] hetRefG3AllelesAD = new int[]{10, 0, 11}; + final int[] hetCG3AllelesAD = new int[]{0, 12, 11}; // AA, AC, CC, AG, CG, GG + final int[] homG3AllelesAD = new int[]{0, 1, 21}; // AA, AC, CC, AG, CG, GG + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homRef3AllelesAD).PL(homRef3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -10, -20}).AD(new int[]{20, 0}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefC3AllelesAD).PL(hetRefC3AllelesPL).make()).make(), + new 
VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-10, 0, -20}).AD(new int[]{10, 10}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homC3AllelesAD).PL(homC3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AC).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -10, 0}).AD(new int[]{0, 20}).GQ(100).make())}); + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetRefG3AllelesAD).PL(hetRefG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, 0, -50}).AD(new int[]{10, 11}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(hetCG3AllelesAD).PL(hetCG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{0, -20, -30}).AD(new int[]{0, 11}).GQ(100).make())}); + + tests.add(new Object[]{ + new VariantContextBuilder(vcBase).alleles(ACG).genotypes(new GenotypeBuilder(base).alleles(AA).AD(homG3AllelesAD).PL(homG3AllelesPL).make()).make(), + new VariantContextBuilder(vcBase).alleles(AG).make(), + Arrays.asList(new GenotypeBuilder(base).alleles(AA).PL(new double[]{-20, -40, 0}).AD(new int[]{0, 21}).GQ(100).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "updatePLsAndADData") + public void testUpdatePLsAndADData(final VariantContext originalVC, + final VariantContext selectedVC, + final List expectedGenotypes) { + final VariantContext selectedVCwithGTs = new VariantContextBuilder(selectedVC).genotypes(originalVC.getGenotypes()).make(); + final 
GenotypesContext actual = GATKVariantContextUtils.updatePLsAndAD(selectedVCwithGTs, originalVC); + + Assert.assertEquals(actual.size(), expectedGenotypes.size()); + for ( final Genotype expected : expectedGenotypes ) { + final Genotype actualGT = actual.get(expected.getSampleName()); + Assert.assertNotNull(actualGT); + assertGenotypesAreEqual(actualGT, expected); + } + } + + // -------------------------------------------------------------------------------- + // + // Test methods for merging reference confidence VCs + // + // -------------------------------------------------------------------------------- + + + @Test(dataProvider = "indexOfAlleleData") + public void testIndexOfAllele(final Allele reference, final List altAlleles, final List otherAlleles) { + final List alleles = new ArrayList<>(altAlleles.size() + 1); + alleles.add(reference); + alleles.addAll(altAlleles); + final VariantContext vc = makeVC("Source", alleles); + + for (int i = 0; i < alleles.size(); i++) { + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,true,false),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,true,false),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i),true),true,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i),true),true,true,false),-1); + if (i == 0) { + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),true,false,false),-1); + 
Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,alleles.get(i),false,false,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i).getBases(),true),false,true,true),i); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,Allele.create(alleles.get(i).getBases(),false),false,true,true),-1); + } else { + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,alleles.get(i),true),i - 1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,alleles.get(i),false), i - 1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,Allele.create(alleles.get(i),true),true),i-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAltAllele(vc,Allele.create(alleles.get(i),true),false),-1); + } + } + + for (final Allele other : otherAlleles) { + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc, other, true, true, true), -1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,true,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,true,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,true,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,false,false,true),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc,other,true,false,false),-1); + Assert.assertEquals(GATKVariantContextUtils.indexOfAllele(vc, other, false, false, false),-1); + } + } + + @DataProvider(name = "indexOfAlleleData") + public Iterator indexOfAlleleData() { + + final Allele[] ALTERNATIVE_ALLELES = new Allele[] { T, C, G, ATC, ATCATC}; + + final int lastMask = 0x1F; + + return new Iterator() { + + int nextMask = 0; + + @Override + public boolean hasNext() { + return nextMask <= lastMask; + } + + @Override + public Object[] next() { + + int mask = nextMask++; + final List 
includedAlleles = new ArrayList<>(5); + final List excludedAlleles = new ArrayList<>(5); + for (int i = 0; i < ALTERNATIVE_ALLELES.length; i++) { + ((mask & 1) == 1 ? includedAlleles : excludedAlleles).add(ALTERNATIVE_ALLELES[i]); + mask >>= 1; + } + return new Object[] { Aref , includedAlleles, excludedAlleles}; + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @Test(dataProvider="overlapWithData") + public void testOverlapsWith(final VariantContext vc, final GenomeLoc genomeLoc) { + final boolean expected; + + if (genomeLoc.isUnmapped()) + expected = false; + else if (vc.getStart() > genomeLoc.getStop()) + expected = false; + else if (vc.getEnd() < genomeLoc.getStart()) + expected = false; + else if (!vc.getChr().equals(genomeLoc.getContig())) + expected = false; + else + expected = true; + + Assert.assertEquals(GATKVariantContextUtils.overlapsRegion(vc, genomeLoc), expected); + } + + + private final String[] OVERLAP_WITH_CHROMOSOMES = { "chr1", "chr20" }; + private final int[] OVERLAP_WITH_EVENT_SIZES = { -10, -1, 0, 1, 10 }; // 0 == SNP , -X xbp deletion, +X xbp insertion. 
+ private final int[] OVERLAP_WITH_EVENT_STARTS = { 10000000, 10000001, + 10000005, 10000010, + 10000009, 10000011, + 20000000 }; + + @DataProvider(name="overlapWithData") + public Object[][] overlapWithData() { + + final int totalLocations = OVERLAP_WITH_CHROMOSOMES.length * OVERLAP_WITH_EVENT_SIZES.length * OVERLAP_WITH_EVENT_STARTS.length + 1; + final int totalEvents = OVERLAP_WITH_CHROMOSOMES.length * OVERLAP_WITH_EVENT_SIZES.length * OVERLAP_WITH_EVENT_STARTS.length; + final GenomeLoc[] locs = new GenomeLoc[totalLocations]; + final VariantContext[] events = new VariantContext[totalEvents]; + + generateAllLocationsAndVariantContextCombinations(OVERLAP_WITH_CHROMOSOMES, OVERLAP_WITH_EVENT_SIZES, + OVERLAP_WITH_EVENT_STARTS, locs, events); + + return generateAllParameterCombinationsForOverlapWithData(locs, events); + } + + private Object[][] generateAllParameterCombinationsForOverlapWithData(GenomeLoc[] locs, VariantContext[] events) { + final List result = new LinkedList<>(); + for (final GenomeLoc loc : locs) + for (final VariantContext event : events) + result.add(new Object[] { event , loc }); + + return result.toArray(new Object[result.size()][]); + } + + private void generateAllLocationsAndVariantContextCombinations(final String[] chrs, final int[] eventSizes, + final int[] eventStarts, final GenomeLoc[] locs, + final VariantContext[] events) { + int nextIndex = 0; + for (final String chr : chrs ) + for (final int size : eventSizes ) + for (final int starts : eventStarts ) { + locs[nextIndex] = genomeLocParser.createGenomeLoc(chr,starts,starts + Math.max(0,size)); + events[nextIndex++] = new VariantContextBuilder().source("test").loc(chr,starts,starts + Math.max(0,size)).alleles(Arrays.asList( + Allele.create(randomBases(size <= 0 ? 1 : size + 1, true), true), Allele.create(randomBases(size < 0 ? 
-size + 1 : 1, false), false))).make(); + } + + locs[nextIndex++] = GenomeLoc.UNMAPPED; + } + + @Test(dataProvider = "totalPloidyData") + public void testTotalPloidy(final int[] ploidies, final int defaultPloidy, final int expected) { + final Genotype[] genotypes = new Genotype[ploidies.length]; + final List vcAlleles = Arrays.asList(Aref,C); + for (int i = 0; i < genotypes.length; i++) + genotypes[i] = new GenotypeBuilder().alleles(GATKVariantContextUtils.noCallAlleles(ploidies[i])).make(); + final VariantContext vc = new VariantContextBuilder().chr("seq1").genotypes(genotypes).alleles(vcAlleles).make(); + Assert.assertEquals(GATKVariantContextUtils.totalPloidy(vc,defaultPloidy),expected," " + defaultPloidy + " " + Arrays.toString(ploidies)); + } + + @DataProvider(name="totalPloidyData") + public Object[][] totalPloidyData() { + final Random rdn = Utils.getRandomGenerator(); + final List resultList = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + final int sampleCount = rdn.nextInt(10); + + int expected = 0; + final int defaultPloidy = rdn.nextInt(10) + 1; + final int[] plodies = new int[sampleCount]; + for (int j = 0; j < sampleCount; j++) { + plodies[j] = rdn.nextInt(10); + expected += plodies[j] == 0 ? defaultPloidy : plodies[j]; + } + resultList.add(new Object[] { plodies, defaultPloidy, expected }); + } + return resultList.toArray(new Object[100][]); + } + + private byte[] randomBases(final int length, final boolean reference) { + final byte[] bases = new byte[length]; + bases[0] = (byte) (reference ? 
'A' : 'C'); + BaseUtils.fillWithRandomBases(bases, 1, bases.length); + return bases; + } +} + diff --git a/public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/VariantContextBenchmark.java b/public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/VariantContextBenchmark.java similarity index 100% rename from public/gatk-tools-public/src/test/java/org/broadinstitute/gatk/utils/variant/VariantContextBenchmark.java rename to public/gatk-utils/src/test/java/org/broadinstitute/gatk/utils/variant/VariantContextBenchmark.java diff --git a/public/gatk-engine/src/test/resources/exampleBAM.bam b/public/gatk-utils/src/test/resources/exampleBAM.bam similarity index 100% rename from public/gatk-engine/src/test/resources/exampleBAM.bam rename to public/gatk-utils/src/test/resources/exampleBAM.bam diff --git a/public/gatk-engine/src/test/resources/exampleBAM.bam.bai b/public/gatk-utils/src/test/resources/exampleBAM.bam.bai similarity index 100% rename from public/gatk-engine/src/test/resources/exampleBAM.bam.bai rename to public/gatk-utils/src/test/resources/exampleBAM.bam.bai diff --git a/public/gatk-engine/src/test/resources/exampleBAM.simple.bai b/public/gatk-utils/src/test/resources/exampleBAM.simple.bai similarity index 100% rename from public/gatk-engine/src/test/resources/exampleBAM.simple.bai rename to public/gatk-utils/src/test/resources/exampleBAM.simple.bai diff --git a/public/gatk-engine/src/test/resources/exampleBAM.simple.bam b/public/gatk-utils/src/test/resources/exampleBAM.simple.bam similarity index 100% rename from public/gatk-engine/src/test/resources/exampleBAM.simple.bam rename to public/gatk-utils/src/test/resources/exampleBAM.simple.bam diff --git a/public/gatk-engine/src/test/resources/exampleDBSNP.vcf b/public/gatk-utils/src/test/resources/exampleDBSNP.vcf similarity index 100% rename from public/gatk-engine/src/test/resources/exampleDBSNP.vcf rename to public/gatk-utils/src/test/resources/exampleDBSNP.vcf diff --git 
a/public/gatk-engine/src/test/resources/exampleDBSNP.vcf.idx b/public/gatk-utils/src/test/resources/exampleDBSNP.vcf.idx similarity index 100% rename from public/gatk-engine/src/test/resources/exampleDBSNP.vcf.idx rename to public/gatk-utils/src/test/resources/exampleDBSNP.vcf.idx diff --git a/public/gatk-engine/src/test/resources/exampleFASTA-3contigs.fasta b/public/gatk-utils/src/test/resources/exampleFASTA-3contigs.fasta similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA-3contigs.fasta rename to public/gatk-utils/src/test/resources/exampleFASTA-3contigs.fasta diff --git a/public/gatk-engine/src/test/resources/exampleFASTA-combined.fasta b/public/gatk-utils/src/test/resources/exampleFASTA-combined.fasta similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA-combined.fasta rename to public/gatk-utils/src/test/resources/exampleFASTA-combined.fasta diff --git a/public/gatk-engine/src/test/resources/exampleFASTA-windows.fasta b/public/gatk-utils/src/test/resources/exampleFASTA-windows.fasta similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA-windows.fasta rename to public/gatk-utils/src/test/resources/exampleFASTA-windows.fasta diff --git a/public/gatk-engine/src/test/resources/exampleFASTA.dict b/public/gatk-utils/src/test/resources/exampleFASTA.dict similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA.dict rename to public/gatk-utils/src/test/resources/exampleFASTA.dict diff --git a/public/gatk-engine/src/test/resources/exampleFASTA.fasta b/public/gatk-utils/src/test/resources/exampleFASTA.fasta similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA.fasta rename to public/gatk-utils/src/test/resources/exampleFASTA.fasta diff --git a/public/gatk-engine/src/test/resources/exampleFASTA.fasta.amb b/public/gatk-utils/src/test/resources/exampleFASTA.fasta.amb similarity index 100% rename from 
public/gatk-engine/src/test/resources/exampleFASTA.fasta.amb rename to public/gatk-utils/src/test/resources/exampleFASTA.fasta.amb diff --git a/public/gatk-engine/src/test/resources/exampleFASTA.fasta.ann b/public/gatk-utils/src/test/resources/exampleFASTA.fasta.ann similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA.fasta.ann rename to public/gatk-utils/src/test/resources/exampleFASTA.fasta.ann diff --git a/public/gatk-engine/src/test/resources/exampleFASTA.fasta.bwt b/public/gatk-utils/src/test/resources/exampleFASTA.fasta.bwt similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA.fasta.bwt rename to public/gatk-utils/src/test/resources/exampleFASTA.fasta.bwt diff --git a/public/gatk-engine/src/test/resources/exampleFASTA.fasta.fai b/public/gatk-utils/src/test/resources/exampleFASTA.fasta.fai similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA.fasta.fai rename to public/gatk-utils/src/test/resources/exampleFASTA.fasta.fai diff --git a/public/gatk-engine/src/test/resources/exampleFASTA.fasta.pac b/public/gatk-utils/src/test/resources/exampleFASTA.fasta.pac similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA.fasta.pac rename to public/gatk-utils/src/test/resources/exampleFASTA.fasta.pac diff --git a/public/gatk-engine/src/test/resources/exampleFASTA.fasta.rbwt b/public/gatk-utils/src/test/resources/exampleFASTA.fasta.rbwt similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA.fasta.rbwt rename to public/gatk-utils/src/test/resources/exampleFASTA.fasta.rbwt diff --git a/public/gatk-engine/src/test/resources/exampleFASTA.fasta.rpac b/public/gatk-utils/src/test/resources/exampleFASTA.fasta.rpac similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA.fasta.rpac rename to public/gatk-utils/src/test/resources/exampleFASTA.fasta.rpac diff --git 
a/public/gatk-engine/src/test/resources/exampleFASTA.fasta.rsa b/public/gatk-utils/src/test/resources/exampleFASTA.fasta.rsa similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA.fasta.rsa rename to public/gatk-utils/src/test/resources/exampleFASTA.fasta.rsa diff --git a/public/gatk-engine/src/test/resources/exampleFASTA.fasta.sa b/public/gatk-utils/src/test/resources/exampleFASTA.fasta.sa similarity index 100% rename from public/gatk-engine/src/test/resources/exampleFASTA.fasta.sa rename to public/gatk-utils/src/test/resources/exampleFASTA.fasta.sa diff --git a/public/gatk-engine/src/test/resources/exampleGATKReport.eval b/public/gatk-utils/src/test/resources/exampleGATKReport.eval similarity index 100% rename from public/gatk-engine/src/test/resources/exampleGATKReport.eval rename to public/gatk-utils/src/test/resources/exampleGATKReport.eval diff --git a/public/gatk-engine/src/test/resources/exampleGATKReportv1.tbl b/public/gatk-utils/src/test/resources/exampleGATKReportv1.tbl similarity index 100% rename from public/gatk-engine/src/test/resources/exampleGATKReportv1.tbl rename to public/gatk-utils/src/test/resources/exampleGATKReportv1.tbl diff --git a/public/gatk-engine/src/test/resources/exampleGATKReportv2.tbl b/public/gatk-utils/src/test/resources/exampleGATKReportv2.tbl similarity index 100% rename from public/gatk-engine/src/test/resources/exampleGATKReportv2.tbl rename to public/gatk-utils/src/test/resources/exampleGATKReportv2.tbl diff --git a/public/gatk-engine/src/test/resources/exampleGRP.grp b/public/gatk-utils/src/test/resources/exampleGRP.grp similarity index 100% rename from public/gatk-engine/src/test/resources/exampleGRP.grp rename to public/gatk-utils/src/test/resources/exampleGRP.grp diff --git a/public/gatk-engine/src/test/resources/exampleINTERVAL.intervals b/public/gatk-utils/src/test/resources/exampleINTERVAL.intervals similarity index 100% rename from 
public/gatk-engine/src/test/resources/exampleINTERVAL.intervals rename to public/gatk-utils/src/test/resources/exampleINTERVAL.intervals diff --git a/public/gatk-engine/src/test/resources/exampleNORG.bam b/public/gatk-utils/src/test/resources/exampleNORG.bam similarity index 100% rename from public/gatk-engine/src/test/resources/exampleNORG.bam rename to public/gatk-utils/src/test/resources/exampleNORG.bam diff --git a/public/gatk-engine/src/test/resources/exampleNORG.bam.bai b/public/gatk-utils/src/test/resources/exampleNORG.bam.bai similarity index 100% rename from public/gatk-engine/src/test/resources/exampleNORG.bam.bai rename to public/gatk-utils/src/test/resources/exampleNORG.bam.bai diff --git a/public/gatk-engine/src/test/resources/forAlleleFractionSimulation.vcf b/public/gatk-utils/src/test/resources/forAlleleFractionSimulation.vcf similarity index 100% rename from public/gatk-engine/src/test/resources/forAlleleFractionSimulation.vcf rename to public/gatk-utils/src/test/resources/forAlleleFractionSimulation.vcf diff --git a/public/gatk-engine/src/test/resources/forAlleleFractionSimulation.vcf.idx b/public/gatk-utils/src/test/resources/forAlleleFractionSimulation.vcf.idx similarity index 100% rename from public/gatk-engine/src/test/resources/forAlleleFractionSimulation.vcf.idx rename to public/gatk-utils/src/test/resources/forAlleleFractionSimulation.vcf.idx diff --git a/public/gatk-engine/src/test/resources/forLongInsert.vcf b/public/gatk-utils/src/test/resources/forLongInsert.vcf similarity index 100% rename from public/gatk-engine/src/test/resources/forLongInsert.vcf rename to public/gatk-utils/src/test/resources/forLongInsert.vcf diff --git a/public/gatk-engine/src/test/resources/forLongInsert.vcf.idx b/public/gatk-utils/src/test/resources/forLongInsert.vcf.idx similarity index 100% rename from public/gatk-engine/src/test/resources/forLongInsert.vcf.idx rename to public/gatk-utils/src/test/resources/forLongInsert.vcf.idx diff --git 
a/public/gatk-engine/src/test/resources/forSimulation.vcf b/public/gatk-utils/src/test/resources/forSimulation.vcf similarity index 100% rename from public/gatk-engine/src/test/resources/forSimulation.vcf rename to public/gatk-utils/src/test/resources/forSimulation.vcf diff --git a/public/gatk-engine/src/test/resources/forSimulation.vcf.idx b/public/gatk-utils/src/test/resources/forSimulation.vcf.idx similarity index 100% rename from public/gatk-engine/src/test/resources/forSimulation.vcf.idx rename to public/gatk-utils/src/test/resources/forSimulation.vcf.idx diff --git a/public/gatk-tools-public/src/test/resources/testProperties.properties b/public/gatk-utils/src/test/resources/testProperties.properties similarity index 100% rename from public/gatk-tools-public/src/test/resources/testProperties.properties rename to public/gatk-utils/src/test/resources/testProperties.properties diff --git a/public/gatk-engine/src/test/resources/testfile.sam b/public/gatk-utils/src/test/resources/testfile.sam similarity index 100% rename from public/gatk-engine/src/test/resources/testfile.sam rename to public/gatk-utils/src/test/resources/testfile.sam diff --git a/public/package-tests/pom.xml b/public/package-tests/pom.xml index b4c5491b6..a4ce50873 100644 --- a/public/package-tests/pom.xml +++ b/public/package-tests/pom.xml @@ -35,7 +35,7 @@ - ${gatk.basedir}/public/gatk-tools-public/target/gatk-tools-public-${project.version}-tests.jar + ${gatk.basedir}/public/gatk-utils/target/gatk-utils-${project.version}-tests.jar + ${gatk.basedir}/public/gatk-engine/target/gatk-engine-${project.version}-tests.jar ${gatk.basedir}/public/gatk-queue/target/gatk-queue-${project.version}-tests.jar @@ -143,7 +144,8 @@ ${gatk.packagetests.testClasses} - ${gatk.basedir}/public/gatk-tools-public/target/gatk-tools-public-${project.version}-tests.jar + ${gatk.basedir}/public/gatk-utils/target/gatk-utils-${project.version}-tests.jar + 
${gatk.basedir}/public/gatk-engine/target/gatk-engine-${project.version}-tests.jar ${gatk.basedir}/public/gatk-queue/target/gatk-queue-${project.version}-tests.jar diff --git a/public/src/main/scripts/shell/check_utils_engine_tools.sh b/public/src/main/scripts/shell/check_utils_engine_tools.sh new file mode 100755 index 000000000..97a723a6a --- /dev/null +++ b/public/src/main/scripts/shell/check_utils_engine_tools.sh @@ -0,0 +1,25 @@ +#!/bin/sh + +# Exit with an error if: +# - utils contains a reference to engine or tools +# - engine contains a reference to tools + +sh -c \ + "grep -Rn \ + -e 'org.broadinstitute.gatk.tools' \ + -e 'org.broadinstitute.gatk.engine' \ + */*/src/*/*/org/broadinstitute/gatk/utils | \ + grep -v dependencyanalyzer ; \ + grep -Rn \ + -e 'org.broadinstitute.gatk.tools' \ + */*/src/*/*/org/broadinstitute/gatk/engine" | \ + sed -e 's/:/:'$'\x1B\x5B\x35\x6d''/2' -e 's/$/'$'\x1B\x5B\x6d''/' | \ + grep gatk + +RESULT=$? +if [ "${RESULT}" -eq 0 ]; then + echo "Fix the above errors. Do not import tools nor engine into the utils, and do not import tools into the engine." >&2 + exit 1 +else + exit 0 +fi