From c163e6d0d2b57863facd1dd280cfaeca18e8fae1 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Thu, 27 Feb 2014 01:56:00 +0800 Subject: [PATCH 01/18] Separate failsafe directories for each of the integration test types [#66515572] --- pom.xml | 4 ---- public/package-tests/pom.xml | 2 -- public/sting-root/pom.xml | 11 +++++++++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index 3abf73151..96074d881 100644 --- a/pom.xml +++ b/pom.xml @@ -609,7 +609,6 @@ true ${sting.packageintegrationtests.skipped} - ${project.build.directory}/failsafe-reports/integration/failsafe-summary-${it.test}.xml @@ -628,7 +627,6 @@ true ${sting.packagepipelinetests.skipped} - ${project.build.directory}/failsafe-reports/pipeline/failsafe-summary-${it.test}.xml @@ -647,7 +645,6 @@ true ${sting.packagelargescaletests.skipped} - ${project.build.directory}/failsafe-reports/largescale/failsafe-summary-${it.test}.xml @@ -666,7 +663,6 @@ true ${sting.packageknowledgebasetests.skipped} - ${project.build.directory}/failsafe-reports/knowledgebase/failsafe-summary-${it.test}.xml diff --git a/public/package-tests/pom.xml b/public/package-tests/pom.xml index 817cfecdb..6d7e356cb 100644 --- a/public/package-tests/pom.xml +++ b/public/package-tests/pom.xml @@ -24,7 +24,6 @@ true true true - ${project.build.directory}/failsafe-reports/failsafe-summary.xml @@ -149,7 +148,6 @@ ${sting.packagetests.basedir} ${project.build.outputDirectory}/ignored_by_package_test ${sting.packagetests.testClasses} - ${failsafe.summaryFile} diff --git a/public/sting-root/pom.xml b/public/sting-root/pom.xml index 84edd9be5..279897f8e 100644 --- a/public/sting-root/pom.xml +++ b/public/sting-root/pom.xml @@ -353,7 +353,6 @@ usedefaultlisteners false - @@ -376,6 +375,7 @@ ${sting.unittests.skipped} + ${project.build.directory}/surefire-reports/unit/${test} **/*UnitTest.class @@ -399,7 +399,6 @@ usedefaultlisteners false - @@ -421,6 +420,8 @@ ${sting.integrationtests.skipped} + 
${project.build.directory}/failsafe-reports/integration/${it.test} + ${project.build.directory}/failsafe-reports/integration/failsafe-summary-${it.test}.xml **/*IntegrationTest.class @@ -436,6 +437,8 @@ ${sting.pipelinetests.skipped} + ${project.build.directory}/failsafe-reports/pipeline/${it.test} + ${project.build.directory}/failsafe-reports/pipeline/failsafe-summary-${it.test}.xml **/*PipelineTest.class @@ -450,6 +453,8 @@ ${sting.largescaletests.skipped} + ${project.build.directory}/failsafe-reports/largescale/${it.test} + ${project.build.directory}/failsafe-reports/largescale/failsafe-summary-${it.test}.xml **/*LargeScaleTest.class @@ -464,6 +469,8 @@ ${sting.knowledgebasetests.skipped} + ${project.build.directory}/failsafe-reports/knowledgebasetests/${it.test} + ${project.build.directory}/failsafe-reports/knowledgebasetests/failsafe-summary-${it.test}.xml **/*KnowledgeBaseTest.class From da587d48ed6dd78171cd3130215b48ee3dad7d98 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Thu, 27 Feb 2014 04:43:29 +0800 Subject: [PATCH 02/18] Using absolute paths in generated diff commands, to ease running them from any directory. 
--- .../src/test/java/org/broadinstitute/sting/MD5DB.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5DB.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5DB.java index 7bd6f7bc4..4c0f8b11a 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5DB.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/MD5DB.java @@ -120,7 +120,7 @@ public class MD5DB { for ( String dir : Arrays.asList(GLOBAL_MD5_DB_DIR, LOCAL_MD5_DB_DIR)) { File f = getFileForMD5(md5, dir); if ( f.exists() && f.canRead() ) - return f.getPath(); + return f.getAbsolutePath(); } return valueIfNotFound; From e61ba8b3408eeedb9b3c7e5498f7ac5b8d9314bb Mon Sep 17 00:00:00 2001 From: Chris Whelan Date: Wed, 26 Feb 2014 11:40:01 -0500 Subject: [PATCH 03/18] Added command line checks for duplicate files in ROD lists -- Keep a list of processed files in ArgumentTypeDescriptor.getRodBindingsCollection -- Throw user exception if a file name duplicates one that was previously parsed -- Throw user exception if the ROD list is empty -- Added two unit tests to RodBindingCollectionUnitTest --- .../commandline/ArgumentTypeDescriptor.java | 54 ++++++++++++++++++- .../RodBindingCollectionUnitTest.java | 22 ++++++++ 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index 8f0abe360..9ab317251 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -455,6 +455,8 @@ public abstract class ArgumentTypeDescriptor { /** * Retrieve and parse a collection of RodBindings from the given file. 
* + * If the file contains duplicate entries or is empty, an exception will be thrown. + * * @param file the source file * @param parsingEngine the engine responsible for parsing * @param parameterType the Tribble Feature parameter type @@ -471,6 +473,9 @@ public abstract class ArgumentTypeDescriptor { final String fieldName) throws IOException { final List bindings = new ArrayList<>(); + // Keep track of the files in this list so that we can check for duplicates and empty files + final Set fileValues = new HashSet<>(); + // parse each line separately using the given Tags if none are provided on each line for ( final String line: FileUtils.readLines(file) ) { final String[] tokens = line.split("\\s+"); @@ -481,14 +486,15 @@ public abstract class ArgumentTypeDescriptor { } // use the default tags if none are provided for this binding else if ( tokens.length == 1 ) { - final ArgumentMatchValue value = new ArgumentMatchStringValue(tokens[0]); + final ArgumentMatchValue value = parseAndValidateArgumentMatchValue(tokens[0], fileValues, fieldName, file.getName()); binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, defaultTags, fieldName); parsingEngine.addTags(binding, defaultTags); + } // use the new tags if provided else if ( tokens.length == 2 ) { final Tags tags = ParsingMethod.parseTags(fieldName, tokens[0]); - final ArgumentMatchValue value = new ArgumentMatchStringValue(tokens[1]); + final ArgumentMatchValue value = parseAndValidateArgumentMatchValue(tokens[1], fileValues, fieldName, file.getName()); binding = (RodBinding)parseBinding(value, parameterType, RodBinding.class, bindingName, tags, fieldName); parsingEngine.addTags(binding, tags); } else { @@ -499,8 +505,52 @@ public abstract class ArgumentTypeDescriptor { parsingEngine.addRodBinding(binding); } + if (fileValues.isEmpty()) { + throw new UserException.BadArgumentValue(fieldName, "The input list " + file.getName() + " is empty."); + } + return 
RodBindingCollection.createRodBindingCollectionOfType(parameterType, bindings); } + + /** + * Validates the resource file name and constructs an ArgumentMatchValue from it. + * + * If the list name has already been processed in the current list, throws a UserException, otherwise + * creates an ArgumentMatchValue to represent the list. + * + * @param token Name of the ROD resource file. + * @param fileValues Set of names of ROD files that have already been processed. + * @param fieldName Name of the argument field being populated. + * @param listFileName Name of the list file being processed. + * @return + */ + private static ArgumentMatchValue parseAndValidateArgumentMatchValue(final String token, final Set fileValues, final String fieldName, + final String listFileName) { + checkForDuplicateFileName(token, fileValues, fieldName, listFileName); + return new ArgumentMatchStringValue(token); + } + + /** + * Checks to make sure that the current file name to be processed has not already been processed. + * + * Checks the name of the current file against the names that have already been processed, throwing + * an informative BadArgumentValue exception if it has already been seen. As a side effect adds the + * current file name to the set of filenames that have already been processed. + * + * @param currentFile Name of the current file to process + * @param processedFiles Set of file names that have already been processed + * @param fieldName Name of the argument that is being populated + * @param listName Filename of the list that is being processed + */ + protected static void checkForDuplicateFileName(final String currentFile, final Set processedFiles, + final String fieldName, final String listName) { + if (processedFiles.contains(currentFile)) { + throw new UserException.BadArgumentValue(fieldName, "The input list " + listName + " contains file " + currentFile + + " multiple times, which isn't allowed. 
If you are intentionally trying to " + + "include the same file more than once, you will need to specify it in separate file lists."); + } + processedFiles.add(currentFile); + } } /** diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java index 29d38ec19..853c51543 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/commandline/RodBindingCollectionUnitTest.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.commandline; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.variant.variantcontext.VariantContext; import org.testng.Assert; import org.testng.annotations.BeforeMethod; @@ -105,6 +106,27 @@ public class RodBindingCollectionUnitTest extends BaseTest { Assert.assertEquals(parsingEngine.getTags(binding), mytags); } + @Test(expectedExceptions = UserException.BadArgumentValue.class) + public void testDuplicateEntriesInFile() throws IOException { + + final File testFile = File.createTempFile("RodBindingCollectionUnitTest.variantListWithDuplicates", ".list"); + testFile.deleteOnExit(); + final FileWriter writer = new FileWriter(testFile); + writer.write(testVCFFileName + "\n"); + writer.write(testVCFFileName + "\n"); + writer.close(); + + ArgumentTypeDescriptor.getRodBindingsCollection(testFile, parsingEngine, VariantContext.class, "foo", mytags, "input"); + } + + @Test(expectedExceptions = UserException.BadArgumentValue.class) + public void testValidateEmptyFile() throws IOException { + final File testFile = File.createTempFile("RodBindingCollectionUnitTest.emptyVCFList", ".list"); + testFile.deleteOnExit(); + + ArgumentTypeDescriptor.getRodBindingsCollection(testFile, 
parsingEngine, VariantContext.class, "foo", mytags, "input"); + } + @Test public void testOverrideTagsInFile() throws IOException { final File testFile = File.createTempFile("RodBindingCollectionUnitTest.overrideTags", ".list"); From 387188e5bbcf35a14ed8d5f1d8f3c6c062ecd250 Mon Sep 17 00:00:00 2001 From: Khalid Shakir Date: Sat, 1 Mar 2014 15:13:58 +0800 Subject: [PATCH 04/18] Attempting to limit gc during Maven tests, using defaults found in JavaCommandLineFunction --- public/sting-root/pom.xml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/public/sting-root/pom.xml b/public/sting-root/pom.xml index 279897f8e..171eb7620 100644 --- a/public/sting-root/pom.xml +++ b/public/sting-root/pom.xml @@ -37,7 +37,10 @@ false 1g 4g - -Xmx${test.maxmemory} + 4 + 50 + 10 + -Xmx${test.maxmemory} -XX:+UseParallelOldGC -XX:ParallelGCThreads=${java.gc.threads} -XX:GCTimeLimit=${java.gc.timeLimit} -XX:GCHeapFreeLimit=${java.gc.heapFreeLimit} 1.107.1683 From 22ad18b919fa972288a51a2f5038687bf00816d8 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 28 Feb 2014 02:05:02 -0500 Subject: [PATCH 06/18] Moving Reduce Reads to the archive. The GATK now fails with a user error if you try to run with a reduced bam. 
(I added a unit test for that; everything else here is just the removal of all traces of RR) --- .../gatk/walkers/annotator/Coverage.java | 8 +- .../annotator/DepthPerAlleleBySample.java | 6 +- .../walkers/annotator/DepthPerSampleHC.java | 5 +- .../gatk/walkers/annotator/FisherStrand.java | 26 +- .../walkers/annotator/RMSMappingQuality.java | 12 +- .../gatk/walkers/annotator/RankSumTest.java | 6 +- .../walkers/annotator/SpanningDeletions.java | 6 +- .../gatk/walkers/bqsr/BaseRecalibrator.java | 3 - .../reducereads/BaseAndQualsCounts.java | 207 --- .../compression/reducereads/BaseCounts.java | 411 ------ .../compression/reducereads/BaseIndex.java | 136 -- .../reducereads/CompressionStash.java | 107 -- .../compression/reducereads/Compressor.java | 108 -- .../reducereads/FinishedGenomeLoc.java | 82 -- .../reducereads/HeaderElement.java | 393 ------ .../reducereads/MultiSampleCompressor.java | 163 --- .../compression/reducereads/ReduceReads.java | 782 ------------ .../reducereads/ReduceReadsStash.java | 160 --- .../reducereads/SingleSampleCompressor.java | 153 --- .../reducereads/SlidingWindow.java | 1110 ----------------- .../reducereads/SyntheticRead.java | 369 ------ .../DiploidSNPGenotypeLikelihoods.java | 12 - .../gatk/walkers/genotyper/ErrorModel.java | 3 +- ...GeneralPloidyIndelGenotypeLikelihoods.java | 3 +- .../GenotypeLikelihoodsCalculationModel.java | 2 +- ...elGenotypeLikelihoodsCalculationModel.java | 2 +- .../genotyper/UnifiedGenotyperEngine.java | 2 +- .../PairHMMLikelihoodCalculationEngine.java | 4 +- .../haplotypecaller/ReadErrorCorrector.java | 4 - .../ReferenceConfidenceModel.java | 12 +- .../readthreading/HaplotypeGraph.java | 4 +- .../readthreading/ReadThreadingAssembler.java | 6 +- .../readthreading/ReadThreadingGraph.java | 47 +- .../readthreading/SequenceForKmers.java | 21 +- .../indels/PairHMMIndelErrorModel.java | 15 +- .../walkers/qc/AssessReducedCoverage.java | 175 --- .../gatk/walkers/qc/AssessReducedQuals.java | 208 --- 
.../sting/utils/recalibration/RecalUtils.java | 18 - .../walkers/annotator/RankSumUnitTest.java | 12 +- .../reducereads/BaseCountsUnitTest.java | 201 --- .../reducereads/HeaderElementUnitTest.java | 214 ---- .../ReduceReadsIntegrationTest.java | 347 ------ .../reducereads/ReduceReadsUnitTest.java | 214 ---- .../reducereads/SlidingWindowUnitTest.java | 964 -------------- .../BiasedDownsamplingIntegrationTest.java | 162 --- ...dGenotyperReducedReadsIntegrationTest.java | 87 -- .../AssemblyResultSetUnitTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 22 - .../ReferenceConfidenceModelUnitTest.java | 20 - .../ReadThreadingGraphUnitTest.java | 12 +- .../SequenceForKmersUnitTest.java | 18 +- .../pairhmm/ActiveRegionTestDataSet.java | 4 +- .../gatk/datasources/reads/SAMDataSource.java | 15 +- .../AlleleBiasedDownsamplingUtils.java | 39 +- .../sting/gatk/downsampling/Downsampler.java | 11 - .../gatk/walkers/coverage/CallableLoci.java | 7 +- .../gatk/walkers/coverage/CoverageUtils.java | 6 +- .../sting/utils/DeprecatedToolChecks.java | 1 + .../sting/utils/clipping/ClippingOp.java | 6 - .../genotyper/PerReadAlleleLikelihoodMap.java | 14 +- .../locusiterator/AlignmentStateMachine.java | 9 - .../sting/utils/pileup/PileupElement.java | 39 - .../utils/pileup/ReadBackedPileupImpl.java | 5 +- .../sting/utils/sam/ArtificialSAMUtils.java | 25 - .../sting/utils/sam/GATKSAMRecord.java | 206 +-- .../sting/utils/sam/ReadUtils.java | 12 +- .../reads/SAMDataSourceUnitTest.java | 17 + ...AlleleBiasedDownsamplingUtilsUnitTest.java | 34 +- .../FractionalDownsamplerUnitTest.java | 33 - .../LevelingDownsamplerUnitTest.java | 41 - .../ReservoirDownsamplerUnitTest.java | 43 - .../SimplePositionalDownsamplerUnitTest.java | 44 - .../TAROrderedReadCacheUnitTest.java | 44 - .../coverage/CallableLociIntegrationTest.java | 9 - .../utils/clipping/ReadClipperUnitTest.java | 16 - .../utils/pileup/PileupElementUnitTest.java | 2 - .../utils/sam/GATKSAMRecordUnitTest.java | 156 +-- 77 files 
changed, 142 insertions(+), 7762 deletions(-) delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java 
delete mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java delete mode 100644 protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java delete mode 100644 protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java delete mode 100644 protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java delete mode 100644 protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java delete mode 100644 protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java delete mode 100644 protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java delete mode 100644 protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java index 5c48417ac..29cee9e15 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/Coverage.java @@ -57,9 +57,6 @@ import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.variant.vcf.VCFStandardHeaderLines; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; 
-import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; import java.util.Arrays; @@ -98,10 +95,7 @@ public class Coverage extends InfoFieldAnnotation implements StandardAnnotation, return null; for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) { - for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { - final GATKSAMRecord read = el.getKey(); - depth += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); - } + depth += maps.getLikelihoodReadMap().size(); } } else diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 0da865a85..52b09d251 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -60,7 +60,6 @@ import org.broadinstitute.variant.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.Genotype; import org.broadinstitute.variant.variantcontext.GenotypeBuilder; @@ -119,7 +118,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa final ReadBackedPileup pileup = stratifiedContext.getBasePileup(); for ( final PileupElement p : pileup ) { if ( alleleCounts.containsKey(p.getBase()) ) - alleleCounts.put(p.getBase(), 
alleleCounts.get(p.getBase())+p.getRepresentativeCount()); + alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1); } // we need to add counts in the correct order @@ -146,8 +145,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa if (! a.isInformative() ) continue; // read is non-informative final GATKSAMRecord read = el.getKey(); final int prevCount = alleleCounts.get(a.getMostLikelyAllele()); - final int incCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; - alleleCounts.put(a.getMostLikelyAllele(), prevCount + incCount); + alleleCounts.put(a.getMostLikelyAllele(), prevCount + 1); } final int[] counts = new int[alleleCounts.size()]; diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java index 21325e6f1..8e5ca83e0 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerSampleHC.java @@ -54,7 +54,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnota import org.broadinstitute.sting.utils.genotyper.MostLikelyAllele; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.Genotype; import org.broadinstitute.variant.variantcontext.GenotypeBuilder; @@ -109,9 +108,7 @@ public class DepthPerSampleHC extends GenotypeAnnotation { for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) 
{ final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue(), alleles); if ( a.isInformative() ) { - final GATKSAMRecord read = el.getKey(); - final int incCount = read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; - dp += incCount; + dp++; } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index a04815e62..a90f555a1 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -64,7 +64,6 @@ import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -418,8 +417,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); final GATKSAMRecord read = el.getKey(); - final int representativeCount = read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1; - updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt, representativeCount); + updateTable(myTable, mostLikelyAllele.getAlleleIfInformative(), read, ref, alt); } if ( passesMinimumThreshold(myTable) ) copyToMainTable(myTable, table); @@ -464,7 +462,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) continue; - updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, alt, p.getRepresentativeCount()); + updateTable(myTable, Allele.create(p.getBase(), false), p.getRead(), ref, alt); } if ( passesMinimumThreshold(myTable) ) copyToMainTable(myTable, table); @@ -487,7 +485,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); } - private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt, final int representativeCount) { + private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final Allele alt) { final boolean matchesRef = allele.equals(ref, true); final boolean matchesAlt = allele.equals(alt, true); @@ -496,21 +494,15 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat final int offset = matchesRef ? 0 : 2; if ( read.isStrandless() ) { - - // ignore strandless reduced reads because they are always on the forward strand! 
- if ( !read.isReducedRead() ) { - - // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 - // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even - // if the read is only seen once, because it's a merged read or other) - final int toAdd = Math.max(representativeCount / 2, 1); - table[offset] += toAdd; - table[offset + 1] += toAdd; - } + // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1 + // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even + // if the read is only seen once, because it's a merged read or other) + table[offset]++; + table[offset + 1]++; } else { // a normal read with an actual strand final boolean isFW = !read.getReadNegativeStrandFlag(); - table[offset + (isFW ? 0 : 1)] += representativeCount; + table[offset + (isFW ? 0 : 1)]++; } } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index d9bc5966c..44e44c63b 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -56,7 +56,6 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnota import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.variant.vcf.VCFStandardHeaderLines; @@ -87,7 +86,7 @@ public class RMSMappingQuality 
extends InfoFieldAnnotation implements StandardAn for ( final Map.Entry sample : stratifiedContexts.entrySet() ) { final AlignmentContext context = sample.getValue(); for ( final PileupElement p : context.getBasePileup() ) - fillMappingQualitiesFromPileup(p.getRead().getMappingQuality(), p.getRepresentativeCount(), qualities); + fillMappingQualitiesFromPileup(p.getRead().getMappingQuality(), qualities); } } else if (perReadAlleleLikelihoodMap != null) { @@ -96,7 +95,7 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn for ( final PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) { for ( final GATKSAMRecord read : perReadLikelihoods.getStoredElements() ) - fillMappingQualitiesFromPileup(read.getMappingQuality(), (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1), qualities); + fillMappingQualitiesFromPileup(read.getMappingQuality(), qualities); } } else @@ -106,12 +105,9 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.2f", rms)); } - private static void fillMappingQualitiesFromPileup(final int mq, final int representativeCount, final List qualities) { + private static void fillMappingQualitiesFromPileup(final int mq, final List qualities) { if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) { - if ( representativeCount == 1 ) - qualities.add(mq); - else - qualities.addAll(Collections.nCopies(representativeCount, mq)); + qualities.add(mq); } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index ab5a40145..13211c44c 100644 --- 
a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -236,8 +236,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR return !(p.isDeletion() || p.getMappingQual() == 0 || p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || - ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE || // need the unBAQed quality score here - p.getRead().isReducedRead() ); + ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here } /** @@ -249,8 +248,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR */ protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) { return !( read.getMappingQuality() == 0 || - read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || - read.isReducedRead() ); + read.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE ); } /** diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index dd57c8ac6..417f3b595 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -56,7 +56,6 @@ import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.variant.variantcontext.VariantContext; import java.util.Arrays; 
@@ -90,10 +89,9 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn int depth = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { for ( final PileupElement p : sample.getValue().getBasePileup() ) { - final int actualSampleDepth = p.getRepresentativeCount(); - depth += actualSampleDepth; + depth++; if ( p.isDeletion() ) - deletions += actualSampleDepth; + deletions++; } } Map map = new HashMap(); diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 3882b70fa..3da04ef86 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -61,7 +61,6 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; @@ -74,7 +73,6 @@ import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; -import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; @@ -216,7 +214,6 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche } initializeRecalibrationEngine(); - RecalUtils.checkForInvalidRecalBams(getToolkit().getSAMFileHeaders(), getToolkit().getArguments().ALLOW_BQSR_ON_REDUCED_BAMS); minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; 
referenceReader = getToolkit().getReferenceDataSource().getReference(); } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java deleted file mode 100644 index 28a48c212..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java +++ /dev/null @@ -1,207 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -/** - * An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base. 
- * - * @author Mauricio Carneiro - * @since 6/15/12 - */ -public class BaseAndQualsCounts extends BaseCounts { - - private long sumInsertionQual_A = 0; - private long sumDeletionQual_A = 0; - private long sumInsertionQual_C = 0; - private long sumDeletionQual_C = 0; - private long sumInsertionQual_G = 0; - private long sumDeletionQual_G = 0; - private long sumInsertionQual_T = 0; - private long sumDeletionQual_T = 0; - private long sumInsertionQual_D = 0; - private long sumDeletionQual_D = 0; - private long sumInsertionQual_I = 0; - private long sumDeletionQual_I = 0; - private long sumInsertionQual_N = 0; - private long sumDeletionQual_N = 0; - - /* - * Increments the count - * - * @param base the base - * @param baseQual the base quality - * @param insQual the insertion quality - * @param delQual the deletion quality - * @param baseMappingQual the mapping quality - * @param isLowQualBase true if the base is low quality - */ - public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) { - incr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false); - } - - /* - * Increments the count - * - * @param base the base - * @param baseQual the base quality - * @param insQual the insertion quality - * @param delQual the deletion quality - * @param baseMappingQual the mapping quality - * @param isLowQualBase true if the base is low quality - * @param isSoftClip true if is soft-clipped - */ - public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) { - // if we already have high quality bases, ignore low quality ones - if ( isLowQualBase && !isLowQuality() ) - return; - - // if this is a high quality base then remove any low quality bases and start from scratch - if ( !isLowQualBase && isLowQuality() ) { - if ( totalCount() > 0 ) - clear(); - 
setLowQuality(false); - } - - final BaseIndex i = BaseIndex.byteToBase(base); - super.incr(i, baseQual, baseMappingQual, isSoftClip); - switch (i) { - case A: sumInsertionQual_A += insQual; sumDeletionQual_A += delQual; break; - case C: sumInsertionQual_C += insQual; sumDeletionQual_C += delQual; break; - case G: sumInsertionQual_G += insQual; sumDeletionQual_G += delQual; break; - case T: sumInsertionQual_T += insQual; sumDeletionQual_T += delQual; break; - case D: sumInsertionQual_D += insQual; sumDeletionQual_D += delQual; break; - case I: sumInsertionQual_I += insQual; sumDeletionQual_I += delQual; break; - case N: sumInsertionQual_N += insQual; sumDeletionQual_N += delQual; break; - } - } - - /* - * Decrements the count - * - * @param base the base - * @param baseQual the base quality - * @param insQual the insertion quality - * @param delQual the deletion quality - * @param baseMappingQual the mapping quality - * @param isLowQualBase true if the base is low quality - */ - public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) { - decr(base, baseQual, insQual, delQual, baseMappingQual, isLowQualBase, false); - } - - /* - * Decrements the count - * - * @param base the base - * @param baseQual the base quality - * @param insQual the insertion quality - * @param delQual the deletion quality - * @param baseMappingQual the mapping quality - * @param isLowQualBase true if the base is low quality - * @param isSoftClip true if is soft-clipped - */ - public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase, final boolean isSoftClip) { - // if this is not the right type of base, ignore it - if ( isLowQualBase != isLowQuality() ) - return; - - final BaseIndex i = BaseIndex.byteToBase(base); - super.decr(i, baseQual, baseMappingQual, isSoftClip); - switch (i) { - case A: 
sumInsertionQual_A -= insQual; sumDeletionQual_A -= delQual; break; - case C: sumInsertionQual_C -= insQual; sumDeletionQual_C -= delQual; break; - case G: sumInsertionQual_G -= insQual; sumDeletionQual_G -= delQual; break; - case T: sumInsertionQual_T -= insQual; sumDeletionQual_T -= delQual; break; - case D: sumInsertionQual_D -= insQual; sumDeletionQual_D -= delQual; break; - case I: sumInsertionQual_I -= insQual; sumDeletionQual_I -= delQual; break; - case N: sumInsertionQual_N -= insQual; sumDeletionQual_N -= delQual; break; - } - } - - public byte averageInsertionQualsOfBase(final BaseIndex base) { - return (byte) (getInsertionQual(base) / countOfBase(base)); - } - - public byte averageDeletionQualsOfBase(final BaseIndex base) { - return (byte) (getDeletionQual(base) / countOfBase(base)); - } - - private long getInsertionQual(final BaseIndex base) { - switch (base) { - case A: return sumInsertionQual_A; - case C: return sumInsertionQual_C; - case G: return sumInsertionQual_G; - case T: return sumInsertionQual_T; - case D: return sumInsertionQual_D; - case I: return sumInsertionQual_I; - case N: return sumInsertionQual_N; - default: throw new IllegalArgumentException(base.name()); - } - } - - private long getDeletionQual(final BaseIndex base) { - switch (base) { - case A: return sumDeletionQual_A; - case C: return sumDeletionQual_C; - case G: return sumDeletionQual_G; - case T: return sumDeletionQual_T; - case D: return sumDeletionQual_D; - case I: return sumDeletionQual_I; - case N: return sumDeletionQual_N; - default: throw new IllegalArgumentException(base.name()); - } - } - - /** - * Clears out all stored data in this object - */ - public void clear() { - super.clear(); - sumInsertionQual_A = sumInsertionQual_C = sumInsertionQual_G = sumInsertionQual_T = sumInsertionQual_D = sumInsertionQual_I = sumInsertionQual_N = 0; - sumDeletionQual_A = sumDeletionQual_C = sumDeletionQual_G = sumDeletionQual_T = sumDeletionQual_D = sumDeletionQual_I = sumDeletionQual_N 
= 0; - } -} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java deleted file mode 100644 index e1329db3b..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ /dev/null @@ -1,411 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import it.unimi.dsi.fastutil.ints.IntArrayList; -import org.broadinstitute.sting.utils.MathUtils; - - -/** - * An object to keep track of the number of occurrences of each base and it's quality. 
- * - * User: depristo - * Date: 4/8/11 - * Time: 2:55 PM - */ - - public class BaseCounts { - public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N; - public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte(); - - - private int count_A = 0; // keeps track of the base counts - private int sumQual_A = 0; // keeps track of the quals of each base - private int count_C = 0; - private int sumQual_C = 0; - private int count_G = 0; - private int sumQual_G = 0; - private int count_T = 0; - private int sumQual_T = 0; - private int count_D = 0; - private int sumQual_D = 0; - private int count_I = 0; - private int sumQual_I = 0; - private int count_N = 0; - private int sumQual_N = 0; - private int totalCount = 0; // keeps track of total count since this is requested so often - private int nSoftClippedBases = 0; - private final IntArrayList mappingQualities = new IntArrayList(); // keeps the mapping quality of each read that contributed to this - private boolean isLowQuality = true; // this object represents low quality bases unless we are told otherwise - - - public static BaseCounts createWithCounts(int[] countsACGT) { - BaseCounts baseCounts = new BaseCounts(); - baseCounts.count_A = countsACGT[0]; - baseCounts.count_C = countsACGT[1]; - baseCounts.count_G = countsACGT[2]; - baseCounts.count_T = countsACGT[3]; - baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3]; - return baseCounts; - } - - @Requires("other != null") - public void add(final BaseCounts other) { - this.count_A += other.count_A; - this.count_C += other.count_C; - this.count_G += other.count_G; - this.count_T += other.count_T; - this.count_D += other.count_D; - this.count_I += other.count_I; - this.count_N += other.count_N; - this.totalCount += other.totalCount; - this.nSoftClippedBases = other.nSoftClippedBases; - this.mappingQualities.addAll(other.mappingQualities); - } - - @Requires("other != null") - public void 
sub(final BaseCounts other) { - this.count_A -= other.count_A; - this.count_C -= other.count_C; - this.count_G -= other.count_G; - this.count_T -= other.count_T; - this.count_D -= other.count_D; - this.count_I -= other.count_I; - this.count_N -= other.count_N; - this.totalCount -= other.totalCount; - this.nSoftClippedBases -= other.nSoftClippedBases; - this.mappingQualities.removeAll(other.mappingQualities); - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(final byte base) { - add(BaseIndex.byteToBase(base), 1); - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) { - switch (base) { - case A: ++count_A; sumQual_A += qual; break; - case C: ++count_C; sumQual_C += qual; break; - case G: ++count_G; sumQual_G += qual; break; - case T: ++count_T; sumQual_T += qual; break; - case D: ++count_D; sumQual_D += qual; break; - case I: ++count_I; sumQual_I += qual; break; - case N: ++count_N; sumQual_N += qual; break; - } - ++totalCount; - nSoftClippedBases += isSoftclip ? 
1 : 0; - mappingQualities.add(mappingQuality); - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(final byte base) { - add(BaseIndex.byteToBase(base), -1); - } - - private void add(final BaseIndex base, int amount) { - switch(base) { - case A: count_A += amount; break; - case C: count_C += amount; break; - case G: count_G += amount; break; - case T: count_T += amount; break; - case D: count_D += amount; break; - case I: count_I += amount; break; - case N: count_N += amount; break; - } - totalCount += amount; - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(final BaseIndex base, final byte qual, final int mappingQuality, final boolean isSoftclip) { - switch (base) { - case A: --count_A; sumQual_A -= qual; break; - case C: --count_C; sumQual_C -= qual; break; - case G: --count_G; sumQual_G -= qual; break; - case T: --count_T; sumQual_T -= qual; break; - case D: --count_D; sumQual_D -= qual; break; - case I: --count_I; sumQual_I -= qual; break; - case N: --count_N; sumQual_N -= qual; break; - } - --totalCount; - nSoftClippedBases -= isSoftclip ? 
1 : 0; - mappingQualities.remove((Integer) mappingQuality); - } - - @Ensures("result >= 0") - public long getSumQuals(final byte base) { - return getSumQuals(BaseIndex.byteToBase(base)); - } - - @Ensures("result >= 0") - public long getSumQuals(final BaseIndex base) { - switch (base) { - case A: return sumQual_A; - case C: return sumQual_C; - case G: return sumQual_G; - case T: return sumQual_T; - case D: return sumQual_D; - case I: return sumQual_I; - case N: return sumQual_N; - default: throw new IllegalArgumentException(base.name()); - } - } - - @Ensures("result >= 0") - public byte averageQuals(final byte base) { - return averageQuals(BaseIndex.byteToBase(base)); - } - - @Ensures("result >= 0") - public byte averageQuals(final BaseIndex base) { - return (byte) (getSumQuals(base) / countOfBase(base)); - } - - @Ensures("result >= 0") - public int countOfBase(final byte base) { - return countOfBase(BaseIndex.byteToBase(base)); - } - - @Ensures("result >= 0") - public int countOfBase(final BaseIndex base) { - switch (base) { - case A: return count_A; - case C: return count_C; - case G: return count_G; - case T: return count_T; - case D: return count_D; - case I: return count_I; - case N: return count_N; - default: throw new IllegalArgumentException(base.name()); - } - } - - @Ensures("result >= 0") - public long sumQualsOfBase(final BaseIndex base) { - return getSumQuals(base); - } - - @Ensures("result >= 0") - public byte averageQualsOfBase(final BaseIndex base) { - return (byte) (sumQualsOfBase(base) / countOfBase(base)); - } - - @Ensures("result >= 0") - public int nSoftclips() { - return nSoftClippedBases; - } - - @Ensures("result >= 0") - public int totalCount() { - return totalCount; - } - - /** - * The RMS of the mapping qualities of all reads that contributed to this object - * - * @return the RMS of the mapping qualities of all reads that contributed to this object - */ - public double getRMS() { - return MathUtils.rms(mappingQualities); - } - - /** - * 
Given a base , it returns the proportional count of this base compared to all other bases - * - * @param base base - * @return the proportion of this base over all other bases - */ - @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportion(final byte base) { - return baseCountProportion(BaseIndex.byteToBase(base)); - } - - /** - * Given a base , it returns the proportional count of this base compared to all other bases - * - * @param baseIndex base - * @return the proportion of this base over all other bases - */ - @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportion(final BaseIndex baseIndex) { - return (totalCount == 0) ? 0.0 : (double)countOfBase(baseIndex) / (double)totalCount; - } - - @Ensures("result != null") - public String toString() { - StringBuilder b = new StringBuilder(); - for (final BaseIndex i : BaseIndex.values()) { - b.append(i.toString()).append("=").append(countOfBase(i)).append(","); - } - return b.toString(); - } - - public byte baseWithMostCounts() { - return baseIndexWithMostCounts().getByte(); - } - - /** - * @return the base index for which the count is highest, including indel indexes - */ - @Ensures("result != null") - public BaseIndex baseIndexWithMostCounts() { - return baseIndexWithMostCounts(true); - } - - /** - * @return the base index for which the count is highest, excluding indel indexes - */ - @Ensures("result != null") - public BaseIndex baseIndexWithMostCountsWithoutIndels() { - return baseIndexWithMostCounts(false); - } - - /** - * Finds the base index with the most counts - * - * @param allowIndels should we allow base indexes representing indels? 
- * @return non-null base index - */ - @Ensures("result != null") - protected BaseIndex baseIndexWithMostCounts(final boolean allowIndels) { - BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - int maxCount = countOfBase(maxI); - - for (final BaseIndex i : BaseIndex.values()) { - if ( !allowIndels && !i.isNucleotide() ) - continue; - - final int myCount = countOfBase(i); - if (myCount > maxCount) { - maxI = i; - maxCount = myCount; - } - } - return maxI; - } - - public byte baseWithMostProbability() { - return baseIndexWithMostProbability().getByte(); - } - - @Ensures("result != null") - public BaseIndex baseIndexWithMostProbability() { - return baseIndexWithMostProbability(true); - } - - @Ensures("result != null") - public BaseIndex baseIndexWithMostProbabilityWithoutIndels() { - return baseIndexWithMostProbability(false); - } - - /** - * Finds the base index with the most probability - * - * @param allowIndels should we allow base indexes representing indels? - * @return non-null base index - */ - @Ensures("result != null") - public BaseIndex baseIndexWithMostProbability(final boolean allowIndels) { - BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - long maxSum = getSumQuals(maxI); - - for (final BaseIndex i : BaseIndex.values()) { - if ( !allowIndels && !i.isNucleotide() ) - continue; - - final long mySum = getSumQuals(i); - if (mySum > maxSum) { - maxI = i; - maxSum = mySum; - } - } - return (maxSum > 0L ? 
maxI : baseIndexWithMostCounts(allowIndels)); - } - - @Ensures("result >=0") - public int totalCountWithoutIndels() { - return totalCount - countOfBase(BaseIndex.D) - countOfBase(BaseIndex.I); - } - - /** - * Calculates the proportional count of a base compared to all other bases except indels (I and D) - * - * @param base base - * @return the proportion of this base over all other bases except indels - */ - @Requires("base.isNucleotide()") - @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportionWithoutIndels(final BaseIndex base) { - final int total = totalCountWithoutIndels(); - return (total == 0) ? 0.0 : (double)countOfBase(base) / (double)total; - } - - /** - * @return true if this instance represents low quality bases - */ - public boolean isLowQuality() { return isLowQuality; } - - /** - * Sets the low quality value - * - * @param value true if this instance represents low quality bases false otherwise - */ - public void setLowQuality(final boolean value) { isLowQuality = value; } - - /** - * Clears out all stored data in this object - */ - public void clear() { - count_A = count_C = count_G = count_T = count_D = count_I = count_N = 0; - sumQual_A = sumQual_C = sumQual_G = sumQual_T = sumQual_D = sumQual_I = sumQual_N = 0; - totalCount = 0; - nSoftClippedBases = 0; - mappingQualities.clear(); - } -} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java deleted file mode 100644 index 665e3e7ce..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java +++ /dev/null @@ -1,136 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This 
Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -/** - * Simple byte / base index conversions - * - * - * @author carneiro - * @since 8/26/11 - */ -public enum BaseIndex { - A ( 'A', 0 ), - C ( 'C', 1 ), - G ( 'G', 2 ), - T ( 'T', 3 ), - D ( 'D', 4 ), - I ( 'I', 5 ), // insertion to the right of the base - N ( 'N', 6 ); - - final byte b; - final int index; - - public byte getByte() { return b; } - - /** - * Ordinal is stored in SyntheticRead rather than enum to save object reference, and store as byte for compactness. - * It is stored as byte, and this method merely eliminates a cast. 
- */ - public byte getOrdinalByte() { return (byte)ordinal(); } - - private BaseIndex(char base, int index) { - this.b = (byte)base; - this.index = index; - } - - /** - * Converts a byte representation of a base to BaseIndex - * - * @param base the byte representation of the base - * @return the BaseIndex representation of the base; - */ - public static BaseIndex byteToBase(final byte base) { - switch (base) { - case 'A': - case 'a': - return A; - case 'C': - case 'c': - return C; - case 'G': - case 'g': - return G; - case 'T': - case 't': - return T; - case 'D': - case 'd': - case '-': - return D; - case 'I': - case 'i': - return I; - case 'N': - case 'n': - return N; - default: throw new ReviewedStingException("Tried to create a byte index for an impossible base " + base); - } - } - - /** - * Definition of a nucleotide for the BaseIndex is anything that has been read as a base - * by the machine (A,C,G,T), even if it couldn't tell which base it was, but it knows - * there is a base there (N). 
- * - * @return whether or not it is a nucleotide, given the definition above - */ - public final boolean isNucleotide() { - return !isIndel(); - } - - /** - * Whether or not this base is an insertion or a deletion - * - * @return true for I or D, false otherwise - */ - public final boolean isIndel() { - return this == D || this == I; - } -} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java deleted file mode 100644 index 22ea78521..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/CompressionStash.java +++ /dev/null @@ -1,107 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet; -import it.unimi.dsi.fastutil.objects.ObjectSortedSet; -import org.broadinstitute.sting.utils.*; - -import java.util.Collection; - - -/** - * A stash of regions that must be kept uncompressed in all samples - * - * In general, these are regions that were kept uncompressed by a tumor sample and we want to force - * all other samples (normals and/or tumors) to also keep these regions uncompressed - * - * User: carneiro - * Date: 10/15/12 - * Time: 4:08 PM - */ -public class CompressionStash extends ObjectAVLTreeSet { - public CompressionStash() { - super(); - } - - /** - * Adds a UnvalidatingGenomeLoc to the stash and merges it with any overlapping (and contiguous) existing loc - * in the stash. - * - * @param insertLoc the new loc to be inserted - * @return true if the loc, or it's merged version, wasn't present in the list before. - */ - @Override - public boolean add(final FinishedGenomeLoc insertLoc) { - ObjectSortedSet removedLocs = new ObjectAVLTreeSet(); - for (FinishedGenomeLoc existingLoc : this) { - if (existingLoc.isPast(insertLoc)) { - break; // if we're past the loc we're done looking for overlaps. - } - if (existingLoc.equals(insertLoc)) { - return false; // if this loc was already present in the stash, we don't need to insert it. 
- } - if (existingLoc.contiguousP(insertLoc)) { - removedLocs.add(existingLoc); // list the original loc for merging - } - } - - this.removeAll(removedLocs); // remove all locs that will be merged - removedLocs.add(insertLoc); // add the new loc to the list of locs that will be merged - - return super.add(new FinishedGenomeLoc(GenomeLoc.merge(removedLocs), insertLoc.isFinished())); - } - - @Override - public boolean addAll(Collection locs) { - boolean result = false; - for (final FinishedGenomeLoc loc : locs) { - result |= this.add(loc); - } - return result; - } -} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java deleted file mode 100644 index 1c0336ebf..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/Compressor.java +++ /dev/null @@ -1,108 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: depristo - * Date: 4/10/11 - * Time: 8:49 AM - * - * A general interface for ReadCompressors. Read compressors have the following semantics: - * - * The accept a stream of reads, in order, and after each added read returns a compressed stream - * of reads for emission. This stream of reads is a "reduced" representation of the total stream - * of reads. The actual compression approach is left up to the implementing class. - */ -public interface Compressor { - /** - * Adds the read to the compressor. 
The returned iteratable collection of - * reads represents the incremental compressed output. - * @param read the next uncompressed read in the input stream to the compressor - * @return an iterator over the incrementally available compressed reads - */ - @Requires("read != null") - @Ensures("result != null") - Iterable addAlignment(GATKSAMRecord read); - - /** - * Must be called after the last read has been added to finalize the compressor state - * and return the last compressed reads from the compressor. - * @return an iterator over the final compressed reads of this compressor - */ - @Ensures("result != null") - Iterable close(); -} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java deleted file mode 100644 index 13010f905..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/FinishedGenomeLoc.java +++ /dev/null @@ -1,82 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; - -/** - * GenomeLocs are very useful objects to keep track of genomic locations and perform set operations - * with them. - * - * However, GenomeLocs are bound to strict validation through the GenomeLocParser and cannot - * be created easily for small tasks that do not require the rigors of the GenomeLocParser validation - * - * UnvalidatingGenomeLoc is a simple utility to create GenomeLocs without going through the parser. Should - * only be used outside of the engine. - * - * User: carneiro - * Date: 10/16/12 - * Time: 2:07 PM - */ -public class FinishedGenomeLoc extends UnvalidatingGenomeLoc { - private boolean finished; - - public FinishedGenomeLoc(final String contigName, final int contigIndex, final int start, final int stop, final boolean finished) { - super(contigName, contigIndex, start, stop); - this.finished = finished; - } - - public FinishedGenomeLoc(final GenomeLoc loc, final boolean finished) { - super(loc.getContig(), loc.getContigIndex(), loc.getStart(), loc.getStop()); - this.finished = finished; - } - - public boolean isFinished() { - return finished; - } -} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java deleted file mode 100644 index 5e84076fd..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ /dev/null @@ -1,393 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - - -/** - * The element that describes the header of the sliding window. - * - * Each site has a header element containing the counts of each base, it's reference based location and whether or - * not the site has insertions (to it's right). It also contains information about the bases that have been filtered - * out due to mapping or base quality. - */ -public class HeaderElement { - private BaseAndQualsCounts positiveConsensusBaseCounts; // How many A,C,G,T (and D's) are in this site. - private BaseAndQualsCounts negativeConsensusBaseCounts; // How many A,C,G,T (and D's) are in this site. - private BaseAndQualsCounts filteredBaseCounts; // How many A,C,G,T (and D's) were filtered out in this site. 
- private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right - private int location; // Genome location of this site (the sliding window knows which contig we're at - - protected static final int MIN_COUNT_FOR_USING_PVALUE = 2; - - public int getLocation() { - return location; - } - - /** - * Get the base counts object for the consensus type - * - * @param consensusType the type to use - * @return non-null base counts - */ - public BaseAndQualsCounts getBaseCounts(final SlidingWindow.ConsensusType consensusType) { - if ( consensusType == SlidingWindow.ConsensusType.POSITIVE_CONSENSUS ) - return positiveConsensusBaseCounts; - if ( consensusType == SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS ) - return negativeConsensusBaseCounts; - return filteredBaseCounts; - } - - /** - * Creates a new HeaderElement with the following default values: - empty consensusBaseCounts - empty - * filteredBaseCounts - 0 insertions to the right - empty mappingQuality list - * - * @param location the reference location for the new element - */ - public HeaderElement(final int location) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, location); - } - - /** - * Creates a new HeaderElement with the following default values: - empty consensusBaseCounts - empty - * filteredBaseCounts - empty mappingQuality list - * - * @param location the reference location for the new element - */ - public HeaderElement(final int location, final int insertionsToTheRight) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, location); - } - - /** - * Creates a new HeaderElement with all given parameters - * - * @param positiveConsensusBaseCounts the BaseCounts object for the running positive consensus synthetic read - * @param negativeConsensusBaseCounts the BaseCounts object for the running negative consensus synthetic read - * @param filteredBaseCounts the BaseCounts 
object for the filtered data synthetic read - * @param insertionsToTheRight number of insertions to the right of this HeaderElement - * @param location the reference location of this reference element - * HeaderElement - */ - public HeaderElement(final BaseAndQualsCounts positiveConsensusBaseCounts, final BaseAndQualsCounts negativeConsensusBaseCounts, final BaseAndQualsCounts filteredBaseCounts, final int insertionsToTheRight, final int location) { - this.positiveConsensusBaseCounts = positiveConsensusBaseCounts; - this.negativeConsensusBaseCounts = negativeConsensusBaseCounts; - this.filteredBaseCounts = filteredBaseCounts; - this.insertionsToTheRight = insertionsToTheRight; - this.location = location; - } - - /** - * Whether or not the site represented by this HeaderElement is variant according to the definitions of variant - * by insertion, deletion and mismatches. - * - * @param minVariantPvalue min p-value for deciding that a position is or is not variable due to mismatches - * @param minVariantProportion min proportion for deciding that a position is or is not variable due to mismatches - * @param minIndelProportion min proportion for deciding that a position is or is not variable due to indels - * @return true if site is variant by any definition. False otherwise. 
- */ - public boolean isVariant(final double minVariantPvalue, final double minVariantProportion, final double minIndelProportion) { - return ( hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) || hasConsensusData(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS) ) - && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue, minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips()); - } - - /** - * Adds a new base to the HeaderElement updating all counts accordingly - * - * @param base the base to add - * @param baseQual the base quality - * @param insQual the base insertion quality - * @param delQual the base deletion quality - * @param baseMappingQuality the mapping quality of the read this base belongs to - * @param minBaseQual the minimum base qual allowed to be a good base - * @param minMappingQual the minimum mapping qual allowed to be a good read - * @param isSoftClipped true if the base is soft-clipped in the original read - * @param isNegativeStrand true if the base comes from a read on the negative strand - */ - public void addBase(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQuality, final int minBaseQual, final int minMappingQual, final boolean isSoftClipped, final boolean isNegativeStrand) { - // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts - if ( baseMappingQuality >= minMappingQual ) { - if ( isNegativeStrand ) - negativeConsensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); - else - positiveConsensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); - } else { - filteredBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); - } - } - - /** - * Adds a new base to the 
HeaderElement updating all counts accordingly - * - * @param base the base to add - * @param baseQual the base quality - * @param insQual the base insertion quality - * @param delQual the base deletion quality - * @param baseMappingQuality the mapping quality of the read this base belongs to - * @param minBaseQual the minimum base qual allowed to be a good base - * @param minMappingQual the minimum mapping qual allowed to be a good read - * @param isSoftClipped true if the base is soft-clipped in the original read - * @param isNegativeStrand true if the base comes from a read on the negative strand - */ - public void removeBase(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQuality, final int minBaseQual, final int minMappingQual, final boolean isSoftClipped, final boolean isNegativeStrand) { - // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts - if ( baseMappingQuality >= minMappingQual ) { - if ( isNegativeStrand ) - negativeConsensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); - else - positiveConsensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual, isSoftClipped); - } else { - filteredBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); - } - } - - /** - * Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions - * should be added to the right of the element. - */ - public void addInsertionToTheRight() { - insertionsToTheRight++; - } - - /** - * Does this HeaderElement contain consensus data? 
- * - * @param consensusType the type to use - * @return whether or not this HeaderElement contains consensus data - */ - public boolean hasConsensusData(final SlidingWindow.ConsensusType consensusType) { - return getBaseCounts(consensusType).totalCount() > 0; - } - - /** - * A HeaderElement is empty if it has no consensus or filtered data - * - * @return whether or not this HeaderElement has no data - */ - public boolean isEmpty() { - return !hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) && !hasConsensusData(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS) && !hasConsensusData(SlidingWindow.ConsensusType.FILTERED); - } - - /** - * removes an insertion from this element (if you removed a read that had an insertion) - */ - public void removeInsertionToTheRight() { - this.insertionsToTheRight--; - if (insertionsToTheRight < 0) - throw new ReviewedStingException("Removed too many insertions, header is now negative at position " + location); - } - - public boolean hasInsertionToTheRight() { - return insertionsToTheRight > 0; - } - - public int numInsertionsToTheRight() { - return insertionsToTheRight; - } - - /** - * Whether or not the HeaderElement is variant due to excess insertions - * - * @return whether or not the HeaderElement is variant due to excess insertions - */ - private boolean isVariantFromInsertions(double minIndelProportion) { - final int numberOfBases = totalCountForBothStrands(); - if (numberOfBases == 0) - return (insertionsToTheRight > 0); // do we only have insertions? 
- - // if we have bases and insertions, check the ratio - return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion; - } - - private int totalCountForBothStrands() { - return positiveConsensusBaseCounts.totalCount() + negativeConsensusBaseCounts.totalCount(); - } - - /** - * Whether or not the HeaderElement is variant due to excess deletions - * - * @return whether or not the HeaderElement is variant due to excess deletions - */ - private boolean isVariantFromDeletions(double minIndelProportion) { - return positiveConsensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || positiveConsensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion - || negativeConsensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || negativeConsensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion; - } - - /** - * Whether or not the HeaderElement is variant due to excess mismatches - * - * @param minVariantPvalue the minimum pvalue to call a site variant (used with low coverage). - * @param minVariantProportion the minimum proportion to call a site variant (used with high coverage). - * @return whether or not the HeaderElement is variant due to excess mismatches - */ - protected boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion) { - return isVariantFromMismatches(minVariantPvalue, minVariantProportion, SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) || - isVariantFromMismatches(minVariantPvalue, minVariantProportion, SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS); - } - - /** - * Whether or not the HeaderElement is variant due to excess mismatches - * - * @param minVariantPvalue the minimum pvalue to call a site variant (used with low coverage). - * @param minVariantProportion the minimum proportion to call a site variant (used with high coverage). 
- * @param consensusType the consensus type to use - * @return whether or not the HeaderElement is variant due to excess mismatches - */ - private boolean isVariantFromMismatches(final double minVariantPvalue, final double minVariantProportion, final SlidingWindow.ConsensusType consensusType) { - final BaseAndQualsCounts baseAndQualsCounts = getBaseCounts(consensusType); - final int totalCount = baseAndQualsCounts.totalCountWithoutIndels(); - final BaseIndex mostCommon = baseAndQualsCounts.baseIndexWithMostProbabilityWithoutIndels(); - final int countOfOtherBases = totalCount - baseAndQualsCounts.countOfBase(mostCommon); - return hasSignificantCount(countOfOtherBases, totalCount, minVariantPvalue, minVariantProportion); - } - - /** - * This handles the special case where we have more bases that came from soft clips than bases that came from - * normal bases by forcing it to become a variant region. We don't want a consensus based on too little information. - * - * @return true if we had more soft clipped bases contributing to this site than matches/mismatches. - */ - protected boolean isVariantFromSoftClips() { - return isVariantFromSoftClips(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) || isVariantFromSoftClips(SlidingWindow.ConsensusType.NEGATIVE_CONSENSUS); - } - - /** - * This handles the special case where we have more bases that came from soft clips than bases that came from - * normal bases by forcing it to become a variant region. We don't want a consensus based on too little information. - * - * @param consensusType the consensus type to use - * @return true if we had more soft clipped bases contributing to this site than matches/mismatches. 
- */ - private boolean isVariantFromSoftClips(final SlidingWindow.ConsensusType consensusType) { - final BaseAndQualsCounts baseAndQualsCounts = getBaseCounts(consensusType); - final int nSoftClippedBases = baseAndQualsCounts.nSoftclips(); - return nSoftClippedBases > 0 && nSoftClippedBases >= (baseAndQualsCounts.totalCount() - nSoftClippedBases); - } - - /** - * Calculates the number of alleles necessary to represent this site. - * - * @param minVariantPvalue the minimum pvalue to call a site variant. - * @param minVariantProportion the minimum proportion to call a site variant. - * @return the number of alleles necessary to represent this site or -1 if there are too many indels - */ - public int getNumberOfBaseAlleles(final double minVariantPvalue, final double minVariantProportion) { - final ObjectArrayList alleles = getAlleles(minVariantPvalue, minVariantProportion); - return alleles == null ? -1 : alleles.size(); - } - - /** - * Calculates the alleles necessary to represent this site. - * - * @param minVariantPvalue the minimum pvalue to call a site variant. - * @param minVariantProportion the minimum proportion to call a site variant. 
- * @return the list of alleles necessary to represent this site or null if there are too many indels - */ - public ObjectArrayList getAlleles(final double minVariantPvalue, final double minVariantProportion) { - // make sure we have bases at all - final int totalBaseCount = totalCountForBothStrands(); - if ( totalBaseCount == 0 ) - return new ObjectArrayList<>(0); - - // next, check for insertions; technically, the insertion count can be greater than totalBaseCount - // (because of the way insertions are counted), so we need to account for that - if ( hasSignificantCount(Math.min(totalBaseCount, insertionsToTheRight), totalBaseCount, minVariantPvalue, minVariantProportion) ) - return null; - - // finally, check for the bases themselves (including deletions) - final ObjectArrayList alleles = new ObjectArrayList<>(4); - for ( final BaseIndex base : BaseIndex.values() ) { - final int baseCount = positiveConsensusBaseCounts.countOfBase(base) + negativeConsensusBaseCounts.countOfBase(base); - if ( baseCount == 0 ) - continue; - - if ( hasSignificantCount(baseCount, totalBaseCount, minVariantPvalue, minVariantProportion) ) { - if ( base == BaseIndex.D ) - return null; - alleles.add(base); - } - } - return alleles; - } - - /* - * Checks whether there are a significant number of softclips. - * - * @param minVariantPvalue the minimum pvalue to call a site variant. - * @param minVariantProportion the minimum proportion to call a site variant. - * @return true if there are significant softclips, false otherwise - */ - public boolean hasSignificantSoftclips(final double minVariantPvalue, final double minVariantProportion) { - return hasSignificantCount(positiveConsensusBaseCounts.nSoftclips() + negativeConsensusBaseCounts.nSoftclips(), totalCountForBothStrands(), minVariantPvalue, minVariantProportion); - } - - /* - * Checks whether there are a significant number of count. 
- * - * @param count the count (k) to test against - * @param total the total (n) to test against - * @param minVariantPvalue the minimum pvalue to call a site variant. - * @param minVariantProportion the minimum proportion to call a site variant. - * @return true if there is a significant count given the provided pvalue, false otherwise - */ - private boolean hasSignificantCount(final int count, final int total, final double minVariantPvalue, final double minVariantProportion) { - if ( count == 0 || total == 0 ) - return false; - - // use p-values for low counts of k - if ( count <= MIN_COUNT_FOR_USING_PVALUE ) { - final double pvalue = MathUtils.binomialCumulativeProbability(total, 0, count); - return pvalue > minVariantPvalue; - } - - // otherwise, use straight proportions - final int minBaseCountForSignificance = (int)(minVariantProportion * total); - return count >= minBaseCountForSignificance; - } -} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java deleted file mode 100644 index bdd407fba..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ /dev/null @@ -1,163 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Ensures; -import it.unimi.dsi.fastutil.objects.*; -import net.sf.samtools.SAMFileHeader; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * - * @author depristo - */ -public class MultiSampleCompressor { - protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class); - - protected Object2ObjectMap compressorsPerSample = new Object2ObjectOpenHashMap(); - - public MultiSampleCompressor(SAMFileHeader header, - final int contextSize, - final int downsampleCoverage, - final int minMappingQuality, - final double minAltPValueToTriggerVariant, - final double minAltProportionToTriggerVariant, - final double minIndelProportionToTriggerVariant, - final int minBaseQual, - final ReduceReads.DownsampleStrategy downsampleStrategy) { - for ( String name : SampleUtils.getSAMFileSamples(header) ) { - compressorsPerSample.put(name, - new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); - } - } - - /** - * Add an alignment to the compressor - * - * @param read the read to be added - * @param knownSnpPositions the set of known SNP positions - * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) - */ - public ObjectSet addAlignment(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) { - String sampleName = read.getReadGroup().getSample(); - SingleSampleCompressor compressor = compressorsPerSample.get(sampleName); - if ( compressor == null ) - throw new 
ReviewedStingException("No compressor for sample " + sampleName); - Pair, CompressionStash> readsAndStash = compressor.addAlignment(read, knownSnpPositions); - ObjectSet reads = readsAndStash.getFirst(); - CompressionStash regions = readsAndStash.getSecond(); - - reads.addAll(closeVariantRegionsInAllSamples(regions, knownSnpPositions)); - - return reads; - } - - /** - * Properly closes the compressor. - * - * @param knownSnpPositions the set of known SNP positions - * @return A non-null set/list of all reads generated - */ - @Ensures("result != null") - public ObjectSet close(final ObjectSortedSet knownSnpPositions) { - ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); - for ( SingleSampleCompressor sample : compressorsPerSample.values() ) { - Pair, CompressionStash> readsAndStash = sample.close(knownSnpPositions); - reads.addAll(readsAndStash.getFirst()); - } - return reads; - } - - /** - * Finalizes current variant regions. - * - * @param knownSnpPositions the set of known SNP positions - * @return A non-null set/list of all reads generated - */ - private ObjectSet closeVariantRegionsInAllSamples(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { - ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); - if (!regions.isEmpty()) { - for (SingleSampleCompressor sample : compressorsPerSample.values()) { - reads.addAll(sample.closeVariantRegions(regions, knownSnpPositions)); - } - } - return reads; - } -} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java deleted file mode 100644 index 383ba5ee9..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ /dev/null @@ -1,782 +0,0 @@ -/* -* By downloading the 
PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; -import it.unimi.dsi.fastutil.objects.ObjectAVLTreeSet; -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import it.unimi.dsi.fastutil.objects.ObjectSortedSet; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMProgramRecord; -import net.sf.samtools.util.SequenceUtil; -import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.clipping.ReadClipper; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import 
org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpConstants; -import org.broadinstitute.sting.utils.sam.BySampleSAMFileWriter; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.broadinstitute.variant.variantcontext.VariantContext; - -import java.util.Collections; -import java.util.List; - - -/** - * Reduces the BAM file using read based compression that keeps only essential information for variant calling - * - *

- * This tool will generate reduced versions of the BAM files that still follow the BAM specification - * and contain all the information necessary to call variants according to the GATK Best Practices recommendations. - * Some options allow you to tune how much compression you want to achieve. The default values have been - * shown to reduce a typical whole exome BAM file by 100x. The higher the coverage, the bigger the - * savings in file size and performance of the downstream tools. - * - *

Input

- *

- * The BAM file to be compressed - *

- * - *

Output

- *

- * The compressed (reduced) BAM file. - * - *

- *

Examples

- *
- * java -Xmx4g -jar GenomeAnalysisTK.jar \
- *   -R ref.fasta \
- *   -T ReduceReads \
- *   -I myData.bam \
- *   -o myData.reduced.bam
- * 
- */ - -@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) -@PartitionBy(PartitionType.CONTIG) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) -@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=40) -public class ReduceReads extends ReadWalker, ReduceReadsStash> { - - @Output(required = false, defaultToStdout = false) - private StingSAMFileWriter out = null; - private SAMFileWriter writerToUse = null; - - /** - * - */ - @Argument(fullName = "context_size", shortName = "cs", doc = "The number of bases to keep around mismatches (potential variation)", required = false) - public int contextSize = 10; - - /** - * Reads that have - * mapping quality below this threshold will not be counted towards consensus, but are still counted - * towards variable regions. - */ - @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "The minimum mapping quality to be considered for the consensus synthetic read", required = false) - public int minMappingQuality = 20; - - /** - * Reads that have - * base quality below this threshold will not be counted towards consensus, but are still counted - * towards variable regions. - */ - @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "The minimum base quality to be considered for the consensus synthetic read", required = false) - public byte minBaseQual = 15; - - /** - * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases at the tails with - * quality at or lower than this threshold will be hard clipped off before entering the reduce reads algorithm. 
- */ - @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false) - public byte minTailQuality = 2; - - /** - * Any number of VCF files representing known SNPs to be used for the polyploid-based reduction. - * Could be e.g. dbSNP and/or official 1000 Genomes SNP calls. Non-SNP variants in these files will be ignored. - * If provided, the polyploid ("het") compression will work only when a single SNP from the known set is present - * in a consensus window (otherwise there will be no reduction); if not provided then polyploid compression will - * be triggered anywhere there is a single SNP present in a consensus window. - */ - @Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false) - public List> known = Collections.emptyList(); - - /** - * This strips away all extra information of the read -- anything other than bases, quals - * and read group. - */ - @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "Do not simplify read", required = false) - public boolean DONT_SIMPLIFY_READS = false; - - /** - * Note that it is not necessary to turn this on for reads that are not mate paired. - * The program will behave correctly by default in those cases. - */ - @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "Do not hard clip adaptor sequences", required = false) - public boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; - - /** - * This option overrides the argument of minimum tail - * quality. 
- */ - @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "Do not hard clip the low quality tails of the reads", required = false) - public boolean DONT_CLIP_LOW_QUAL_TAILS = false; - - /** - * By default, ReduceReads will hard clip away any low quality soft clipped - * base left by the aligner and use the high quality soft clipped bases in it's traversal algorithm to identify variant - * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual) - */ - @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "Do not use high quality soft-clipped bases", required = false) - public boolean DONT_USE_SOFTCLIPPED_BASES = false; - - /** - * By default, ReduceReads will compress read names to numbers and guarantee - * uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather - * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing. - */ - @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "Do not compress read names", required = false) - public boolean DONT_COMPRESS_READ_NAMES = false; - - /** - * The hard clips will happen exactly at the interval border. - */ - @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "Hard clip all incoming reads to the desired intervals", required = false) - public boolean HARD_CLIP_TO_INTERVAL = false; - - /** - * Anything below this will be - * considered consensus and reduced (otherwise we will try to trigger polyploid compression). Note that - * this value is used only regions with high coverage. 
- */ - @Advanced - @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "Minimum proportion of mismatches in a site to trigger a variant region", required = false) - public double minAltProportionToTriggerVariant = 0.05; - - /** - * Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to - * trigger polyploid compression). Note that this value is used only regions with low coverage. - */ - @Advanced - @Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region", required = false) - public double minAltPValueToTriggerVariant = 0.01; - - /** - * Anything below this will be considered consensus. - */ - @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "Minimum proportion of indels in a site to trigger a variant region", required = false) - public double minIndelProportionToTriggerVariant = 0.05; - - /** - * This level of downsampling only happens after the region has been evaluated, therefore it can - * be combined with the engine level downsampling. - * A value of 0 turns downsampling off. - */ - @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "Downsample the number of reads emitted per sample in a variant region for better compression", required = false) - public int downsampleCoverage = 250; - - /** - * Generally, this tool is not meant to be run for more than 1 sample at a time. The one valid exception - * brought to our attention by colleagues is the specific case of tumor/normal pairs in cancer analysis. - * To prevent users from unintentionally running the tool in a less than ideal manner, we require them - * to explicitly enable multi-sample analysis with this argument. 
- */ - @Argument(fullName = "cancer_mode", shortName = "cancer_mode", doc = "Enable multi-sample reduction for cancer analysis", required = false) - public boolean ALLOW_MULTIPLE_SAMPLES = false; - - @Hidden - @Argument(fullName = "nwayout", shortName = "nw", doc = "Generate separate output files per input file", required = false) - public boolean nwayout = false; - - @Hidden - @Argument(fullName = "", shortName = "dl", doc = "Debug level", required = false) - public int debugLevel = 0; - - @Hidden - @Argument(fullName = "", shortName = "dr", doc = "Debug read", required = false) - public String debugRead = ""; - - @Hidden - @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "Downsampling strategy", required = false) - public DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; - - @Hidden - @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="Discard program tags", required = false) - public boolean NO_PG_TAG = false; - - public enum DownsampleStrategy { - Normal, - Adaptive - } - - int nCompressedReads = 0; - - private static int READ_NAME_HASH_DEFAULT_SIZE = 1000; - Long nextReadNumber = 1L; // The next number to use for the compressed read name. - Object2LongOpenHashMap readNameHash; // This hash will keep the name of the original read the new compressed name (a number). - - ObjectSortedSet intervalList; - - ObjectSortedSet knownSnpPositions; - - // IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER - public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag - private static final String PROGRAM_FILENAME_EXTENSION = ".reduced.bam"; - - /** - * Basic generic initialization of the readNameHash and the intervalList. 
Output initialization - * is done at the reduceInit method - */ - @Override - public void initialize() { - super.initialize(); - - if ( !nwayout && out == null ) - throw new UserException.MissingArgument("out", "the output must be provided and is optional only for certain debugging modes"); - - if ( nwayout && out != null ) - throw new UserException.CommandLineException("--out and --nwayout cannot be used simultaneously; please use one or the other"); - - if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 ) - throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); - - if ( minAltProportionToTriggerVariant < 0.0 || minAltProportionToTriggerVariant > 1.0 ) - throw new UserException.BadArgumentValue("--minimum_alt_proportion_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); - - if ( SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()).size() > 1 && !ALLOW_MULTIPLE_SAMPLES ) - throw new UserException.BadInput("Reduce Reads is not meant to be run for more than 1 sample at a time except for the specific case of tumor/normal pairs in cancer analysis. If that is what you want to do, use the -cancer_mode flag."); - - if ( known.isEmpty() ) - knownSnpPositions = null; - else - knownSnpPositions = new ObjectAVLTreeSet(); - - GenomeAnalysisEngine toolkit = getToolkit(); - this.resetReadNameHash(); // prepare the read name hash to keep track of what reads have had their read names compressed - intervalList = new ObjectAVLTreeSet(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode - - if (toolkit.getIntervals() != null) - intervalList.addAll(toolkit.getIntervals()); - - final boolean indexOnTheFly = true; - final SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.coordinate; - if (nwayout) { - SAMProgramRecord programRecord = NO_PG_TAG ? 
null : Utils.createProgramRecord(toolkit, this, PROGRAM_RECORD_NAME); - writerToUse = new BySampleSAMFileWriter(toolkit, PROGRAM_FILENAME_EXTENSION, sortOrder, false, indexOnTheFly, NO_PG_TAG, programRecord, true); - } - else { - writerToUse = out; - out.setPresorted(false); - if (!NO_PG_TAG) { - Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME); - } - } - } - - /** Initializer for {@link #readNameHash}. */ - private void resetReadNameHash() { - // If the hash grows large, subsequent clear operations can be very expensive, so trim the hash down if it grows beyond its default. - if (readNameHash == null || readNameHash.size() > READ_NAME_HASH_DEFAULT_SIZE) { - readNameHash = new Object2LongOpenHashMap(READ_NAME_HASH_DEFAULT_SIZE); - } else { - readNameHash.clear(); - } - } - - /** - * Takes in a read and prepares it for the SlidingWindow machinery by performing the - * following optional clipping operations: - * 1. Hard clip adaptor sequences - * 2. Hard clip low quality tails - * 3. Hard clip all remaining soft clipped bases - * 4. 
Hard clip read to the intervals in the interval list (this step may produce multiple reads) - * - * @param ref default map parameter - * @param read default map parameter - * @param metaDataTracker default map parameter - * @return a linked list with all the reads produced by the clipping operations - */ - @Override - public ObjectArrayList map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { - ObjectArrayList mappedReads; - if (!debugRead.isEmpty() && read.getReadName().contains(debugRead)) - System.out.println("Found debug read!"); - - if (debugLevel == 1) - System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd()); - - // we write the actual alignment starts to their respective alignment shift tags in the temporary - // attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file - read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart()); - read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd()); - - // Check if the read goes beyond the boundaries of the chromosome, and hard clip those boundaries. - int chromosomeLength = ref.getGenomeLocParser().getContigInfo(read.getReferenceName()).getSequenceLength(); - if (read.getSoftStart() < 0) - read = ReadClipper.hardClipByReadCoordinates(read, 0, -read.getSoftStart()); - if (read.getSoftEnd() > chromosomeLength) - read = ReadClipper.hardClipByReadCoordinates(read, chromosomeLength - read.getSoftStart() + 1, read.getReadLength() - 1); - - if (!DONT_SIMPLIFY_READS) - read.simplify(); // Clear all unnecessary attributes - if (!DONT_CLIP_ADAPTOR_SEQUENCES) - read = ReadClipper.hardClipAdaptorSequence(read); // Strip away adaptor sequences, if any. 
- if (!DONT_CLIP_LOW_QUAL_TAILS) - read = ReadClipper.hardClipLowQualEnds(read, minTailQuality); // Clip low quality tails - if (!isWholeGenome()) { - if (HARD_CLIP_TO_INTERVAL) - mappedReads = hardClipReadToInterval(read); // Hard clip the remainder of the read to the desired interval - else { - mappedReads = new ObjectArrayList(); - mappedReads.add(read); - } - } - else { - mappedReads = new ObjectArrayList(); - if (!read.isEmpty()) - mappedReads.add(read); - } - - if (!mappedReads.isEmpty() && !DONT_USE_SOFTCLIPPED_BASES) { - ObjectArrayList tempList = new ObjectArrayList(); - for (GATKSAMRecord mRead : mappedReads) { - GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualitySoftClips(mRead, minBaseQual); - if (!clippedRead.isEmpty()) - tempList.add(clippedRead); - } - mappedReads = tempList; - } - - if (debugLevel == 1) - for (GATKSAMRecord mappedRead : mappedReads) - System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd()); - - // add the SNPs to the list of known positions - populateKnownSNPs(metaDataTracker); - - return mappedReads; - } - - /* - * Add the positions of known SNPs to the set so that we can keep track of it - * - * @param metaDataTracker the ref meta data tracker - */ - protected void populateKnownSNPs(final RefMetaDataTracker metaDataTracker) { - for ( final VariantContext vc : metaDataTracker.getValues(known) ) { - if ( vc.isSNP() ) - knownSnpPositions.add(getToolkit().getGenomeLocParser().createGenomeLoc(vc)); - } - } - - /** - * Initializes the ReduceReadsStash that keeps track of all reads that are waiting to - * enter the SlidingWindow machinery. The stash makes sure reads are served in order - * even though map() may generate reads that are only supposed to enter the machinery - * in the future. 
- * - * @return the empty stash - */ - @Override - public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); - } - - /** - * Takes the list of reads produced by map(), adds them to the stash (which keeps them sorted) and process - * all reads that come before the original read (the read that was passed to map) including the original - * read. This is where we send reads, in order, to the SlidingWindow machinery. - * - * @param mappedReads the list of reads sent by map - * @param stash the stash that keeps the reads in order for processing - * @return the stash with all reads that have not been processed yet - */ - public ReduceReadsStash reduce(ObjectArrayList mappedReads, ReduceReadsStash stash) { - if (debugLevel == 1) - stash.print(); - - boolean firstRead = true; - for (GATKSAMRecord read : mappedReads) { - boolean originalRead = firstRead && isOriginalRead(mappedReads, read); - - if (read.getReadLength() == 0) - throw new ReviewedStingException("Empty read sent to reduce, this should never happen! 
" + read.getReadName() + " -- " + read.getCigar() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd()); - - if (originalRead) { - ObjectArrayList readsReady = new ObjectArrayList(); - readsReady.addAll(stash.getAllReadsBefore(read)); - readsReady.add(read); - - for (GATKSAMRecord readReady : readsReady) { - if (debugLevel == 1) - System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd()); - - for (GATKSAMRecord compressedRead : stash.compress(readReady, knownSnpPositions)) - outputRead(compressedRead); - - // We only care about maintaining the link between read pairs if they are in the same variant - // region. Since an entire variant region's worth of reads is returned in a single call to - // stash.compress(), the readNameHash can be cleared after the for() loop above. - // The advantage of clearing the hash is that otherwise it holds all reads that have been encountered, - // which can use a lot of memory and cause RR to slow to a crawl and/or run out of memory. - this.resetReadNameHash(); - - } - } else - stash.add(read); - - firstRead = false; - } - - // reduce memory requirements by removing old positions - if ( !mappedReads.isEmpty() ) - clearStaleKnownPositions(mappedReads.get(0)); - - return stash; - } - - /** - * Now that now more reads will come, we process all the remaining reads in the stash, in order. - * - * @param stash the ReduceReadsStash with all unprocessed reads (from reduce) - */ - @Override - public void onTraversalDone(ReduceReadsStash stash) { - - // output any remaining reads in the compressor - for (GATKSAMRecord read : stash.close(knownSnpPositions)) - outputRead(read); - - if (nwayout) - writerToUse.close(); - } - - /** - * Removes known positions that are no longer relevant for use with het compression. 
- * - * @param read the current read, used for checking whether there are stale positions we can remove - */ - protected void clearStaleKnownPositions(final GATKSAMRecord read) { - // nothing to clear if not used or empty - if ( knownSnpPositions == null || knownSnpPositions.isEmpty() ) - return; - - // not ready to be cleared until we encounter a read from a different contig - final int contigIndexOfRead = read.getReferenceIndex(); - if ( knownSnpPositions.first().getContigIndex() == contigIndexOfRead ) - return; - - // because we expect most elements to be stale, it's not going to be efficient to remove them one at a time - final ObjectAVLTreeSet goodLocs = new ObjectAVLTreeSet(); - for ( final GenomeLoc loc : knownSnpPositions ) { - if ( loc.getContigIndex() == contigIndexOfRead ) - goodLocs.add(loc); - } - knownSnpPositions.clear(); - knownSnpPositions.addAll(goodLocs); - } - - /** - * Hard clips away all parts of the read that doesn't agree with the intervals selected. - * - * Note: If read overlaps more than one interval, it will be hard clipped to all - * the intervals it overlaps with - * - * @param read the read to be hard clipped to the interval. 
- * @return a shallow copy of the read hard clipped to the interval - */ - private ObjectArrayList hardClipReadToInterval(GATKSAMRecord read) { - ObjectArrayList clippedReads = new ObjectArrayList(); - - GenomeLoc intervalOverlapped = null; // marks the interval to which the original read overlapped (so we can cut all previous intervals from the list) - - boolean originalRead = true; // false if this is the right tail of the original read - boolean overlap; // keeps track of the interval that overlapped the original read - boolean doneClipping; // triggers an early exit if we are done clipping this read - - if (isWholeGenome()) - clippedReads.add(read); // if we don't have intervals (wgs) the read goes in unchanged - - for (GenomeLoc interval : intervalList) { - - if (read.isEmpty()) // nothing to do with an empty read (could have been fully clipped before) - break; - - GATKSAMRecord clippedRead = null; // this will hold the read clipped to the interval to be added in the end of the switch - - switch (ReadUtils.getReadAndIntervalOverlapType(read, interval)) { - case NO_OVERLAP_RIGHT: // no reads on this interval, check the next interval if this is the original read - if (!originalRead) // something went wrong if this is the tail of the read - throw new ReviewedStingException("tail of the read should never NO_OVERLAP_RIGHT the following interval. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); - overlap = false; - doneClipping = false; - break; - - - case NO_OVERLAP_HARDCLIPPED_RIGHT: // read used to overlap but got hard clipped and doesn't overlap anymore - if (originalRead) { - overlap = true; // effectively, we have found the read's location and now we are going to try and match it's tail (which happens to be the entire read). 
- clippedRead = GATKSAMRecord.emptyRead(read); - } else - overlap = false; - - doneClipping = false; - break; - - case NO_OVERLAP_CONTIG: // read is in a different contig - if (originalRead) { // the original read can be in a bigger contig, but not on a smaller one. - if (read.getReferenceIndex() < interval.getContigIndex()) - throw new ReviewedStingException("read is behind interval list. (contig) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); - else { - overlap = false; - doneClipping = false; - } - } // tail read CANNOT be in a different contig. - else { - if (read.getReferenceIndex() < interval.getContigIndex()) { - overlap = false; - doneClipping = true; - } else - throw new ReviewedStingException("Tail read is in bigger contig than interval traversal. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); - - } - break; - - case NO_OVERLAP_LEFT: - if (originalRead) // if this is the first read this should never happen. - throw new ReviewedStingException("original read cannot be behind the first interval. (position) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString()); - - overlap = false; - doneClipping = true; - break; - - case NO_OVERLAP_HARDCLIPPED_LEFT: // read used to overlap but got hard clipped and doesn't overlap anymore - overlap = originalRead; // if this is the original read, we should not advance the interval list, the original overlap was here. 
- doneClipping = true; - break; - - case OVERLAP_LEFT: // clip the left tail of the read - clippedRead = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStart() - 1); - - overlap = true; - doneClipping = true; - break; - - case OVERLAP_RIGHT: // clip the right tail of the read and try to match it to the next interval - clippedRead = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, interval.getStop() + 1); - read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop()); - - overlap = true; - doneClipping = false; - break; - - case OVERLAP_LEFT_AND_RIGHT: // clip both left and right ends of the read - clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, interval.getStart() - 1, interval.getStop() + 1); - read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop()); - - overlap = true; - doneClipping = false; - break; - - case OVERLAP_CONTAINED: // don't do anything to the read - clippedRead = read; - - overlap = true; - doneClipping = true; - break; - - default: - throw new ReviewedStingException("interval overlap returned an unknown / unhandled state. 
If new state was added to intervalOverlap, it should be handled by hardClipReadToInterval."); - } - - if (overlap && originalRead) - intervalOverlapped = interval; - - if (clippedRead != null) { - originalRead = false; - - if (!clippedRead.isEmpty()) - clippedReads.add(clippedRead); // if the read overlaps the interval entirely within a deletion, it will be entirely clipped off - } - - if (doneClipping) - break; - } - - if (intervalOverlapped != null) - intervalList = intervalList.tailSet(intervalOverlapped); - - return clippedReads; - } - - /** - * Compresses the read name and adds it to output BAM file (reduced BAM) - * after performing some quality control - * - * @param read any read - */ - private void outputRead(GATKSAMRecord read) { - if (debugLevel == 2) { - checkForHighMismatch(read); - checkCigar(read); - } - - if (read.isReducedRead()) - nCompressedReads++; - else { - int originalAlignmentStart = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT); - int originalAlignmentEnd = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT); - - int startShift = originalAlignmentStart - read.getUnclippedStart(); // we annotate the shifts for better compression - int endShift = read.getUnclippedEnd() - originalAlignmentEnd; // we annotate the shifts for better compression - - if (startShift > 0) - read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (start) - if (endShift > 0) - read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (end) - } - - if (debugLevel == 1) - System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd()); - - if 
(!DONT_COMPRESS_READ_NAMES) - nextReadNumber = compressReadName(readNameHash, read, nextReadNumber); - - writerToUse.addAlignment(read); - } - - /** - * Quality control procedure that checks if the consensus reads contains too many - * mismatches with the reference. This should never happen and is a good trigger for - * errors with the algorithm. - * - * @param read any read - */ - private void checkForHighMismatch(GATKSAMRecord read) { - final int start = read.getAlignmentStart(); - final int stop = read.getAlignmentEnd(); - final byte[] ref = getToolkit().getReferenceDataSource().getReference().getSubsequenceAt(read.getReferenceName(), start, stop).getBases(); - final int nm = SequenceUtil.countMismatches(read, ref, start - 1); - final int readLen = read.getReadLength(); - final double nmFraction = nm / (1.0 * readLen); - if (nmFraction > 0.4 && readLen > 20 && read.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG) != null && read.getReadName().startsWith("Consensus")) - throw new ReviewedStingException("BUG: High mismatch fraction found in read " + read.getReadName() + " position: " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd()); - } - - private void checkCigar (GATKSAMRecord read) { - if (read.getCigar().isValid(null, -1) != null) { - throw new ReviewedStingException("BUG: cigar string is not valid: " + read.getCigarString()); - } - - } - - - /** - * Compresses the read name using the readNameHash if we have already compressed - * this read name before. 
- * - * @param hash the hash table containing the read name to compressed read name map - * @param read any read - * @param nextReadNumber the number to use in the compressed read name in case this is a new read name - * @return the next number to use in the compressed read name - */ - protected static long compressReadName(final Object2LongOpenHashMap hash, final GATKSAMRecord read, final long nextReadNumber) { - final String name = read.getReadName(); - final StringBuilder compressedName = new StringBuilder(); - long result = nextReadNumber; - if (read.isReducedRead()) { - compressedName.append("C"); - } - final Long readNumber = hash.get(name); - if (readNumber != null) { - compressedName.append(readNumber); - } else { - hash.put(name, nextReadNumber); - compressedName.append(nextReadNumber); - result++; - } - read.setReadName(compressedName.toString()); - return result; - } - - /** - * Returns true if the read is the original read that went through map(). - * - * This is important to know so we can decide what reads to pull from the stash. Only reads that came before the original read should be pulled. - * - * @param list the list - * @param read the read - * @return Returns true if the read is the original read that went through map(). - */ - private boolean isOriginalRead(ObjectArrayList list, GATKSAMRecord read) { - return isWholeGenome() || list.get(0).equals(read); - } - - /** - * Checks whether or not the intervalList is empty, meaning we're running in WGS mode. - * - * @return whether or not we're running in WGS mode. 
- */ - private boolean isWholeGenome() { - return intervalList.isEmpty(); - } - -} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java deleted file mode 100644 index 52c5f0903..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsStash.java +++ /dev/null @@ -1,160 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.ObjectSortedSet; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.util.LinkedList; -import java.util.List; -import java.util.SortedSet; -import java.util.TreeSet; - -/** - * This class implements a "read stash" that keeps reads always sorted in alignment order. Useful - * for read walkers that alter the alignment information of the incoming reads, but need to - * maintain the reads sorted for the reduce step. (e.g. ReduceReads) - */ - -public class ReduceReadsStash { - protected MultiSampleCompressor compressor; - SortedSet outOfOrderReads; - - /** - * Creates a stash with the default sorting order (read alignment) - * @param compressor the MultiSampleCompressor object to be used with this stash (for stash.close()) - */ - public ReduceReadsStash(MultiSampleCompressor compressor) { - this.compressor = compressor; - this.outOfOrderReads = new TreeSet(new AlignmentStartWithNoTiesComparator()); - } - - /** - * Get all reads before a given read (for processing) - * - * @param read the original read - * @return all reads that have alignment start before the original read. 
- */ - public List getAllReadsBefore(GATKSAMRecord read) { - List result = new LinkedList(); - GATKSAMRecord newHead = null; - - for (GATKSAMRecord stashedRead : outOfOrderReads) { - if (ReadUtils.compareSAMRecords(stashedRead, read) <= 0) - result.add(stashedRead); - else { - newHead = stashedRead; - break; - } - } - - if (result.size() > 0) { - if (result.size() == outOfOrderReads.size()) - outOfOrderReads.clear(); - else - outOfOrderReads = new TreeSet(outOfOrderReads.tailSet(newHead)); - } - - return result; - } - - /** - * sends the read to the MultiSampleCompressor - * - * @param read the read to be compressed - * @param knownSnpPositions the set of known SNP positions - * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) - */ - public Iterable compress(final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions) { - return compressor.addAlignment(read, knownSnpPositions); - } - - /** - * Add a read to the stash - * - * @param read any read - */ - public void add(GATKSAMRecord read) { - outOfOrderReads.add(read); - } - - /** - * Close the stash, processing all remaining reads in order - * - * @param knownSnpPositions the set of known SNP positions - * @return a list of all the reads produced by the SlidingWindow machinery) - */ - public Iterable close(final ObjectSortedSet knownSnpPositions) { - LinkedList result = new LinkedList(); - - // compress all the stashed reads (in order) - for (GATKSAMRecord read : outOfOrderReads) - for (GATKSAMRecord compressedRead : compressor.addAlignment(read, knownSnpPositions)) - result.add(compressedRead); - - // output any remaining reads from the compressor - for (GATKSAMRecord read : compressor.close(knownSnpPositions)) - result.add(read); - - return result; - } - - /** - * Useful debug functionality, outputs all elements in the stash - */ - public void print() { - int i = 1; - System.out.println("Stash Contents:"); - for (GATKSAMRecord read : 
outOfOrderReads) - System.out.println(String.format("%3d: %s %d %d", i++, read.getCigarString(), read.getAlignmentStart(), read.getAlignmentEnd())); - System.out.println(); - } - -} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java deleted file mode 100644 index 61c34b6a0..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ /dev/null @@ -1,153 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Ensures; -import it.unimi.dsi.fastutil.objects.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -/** - * - * @author carneiro, depristo - * @version 3.0 - */ -public class SingleSampleCompressor { - final private int contextSize; - final private int downsampleCoverage; - final private int minMappingQuality; - final private double minAltPValueToTriggerVariant; - final private double minAltProportionToTriggerVariant; - final private double minIndelProportionToTriggerVariant; - final private int minBaseQual; - final private ReduceReads.DownsampleStrategy downsampleStrategy; - - private SlidingWindow slidingWindow; - private int slidingWindowCounter; - - public static Pair, CompressionStash> emptyPair = new Pair,CompressionStash>(new ObjectAVLTreeSet(), new CompressionStash()); - - public SingleSampleCompressor(final int contextSize, - final int downsampleCoverage, - final int minMappingQuality, - final double minAltPValueToTriggerVariant, - final double minAltProportionToTriggerVariant, - final double minIndelProportionToTriggerVariant, - final int minBaseQual, - final ReduceReads.DownsampleStrategy downsampleStrategy) { - this.contextSize = contextSize; - this.downsampleCoverage = downsampleCoverage; - this.minMappingQuality = minMappingQuality; - this.slidingWindowCounter = 0; - this.minAltPValueToTriggerVariant = minAltPValueToTriggerVariant; - this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant; - this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant; - this.minBaseQual = minBaseQual; - this.downsampleStrategy = downsampleStrategy; - } - - /** - * Add an alignment to the compressor - * - * @param read the read to 
be added - * @param knownSnpPositions the set of known SNP positions - * @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window) - */ - public Pair, CompressionStash> addAlignment( final GATKSAMRecord read, final ObjectSortedSet knownSnpPositions ) { - ObjectSet reads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); - CompressionStash stash = new CompressionStash(); - int readOriginalStart = read.getUnclippedStart(); - - // create a new window if: - if ((slidingWindow != null) && - ( ( read.getReferenceIndex() != slidingWindow.getContigIndex() ) || // this is a brand new contig - (readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window - - // close the current sliding window - Pair, CompressionStash> readsAndStash = slidingWindow.close(knownSnpPositions); - reads = readsAndStash.getFirst(); - stash = readsAndStash.getSecond(); - slidingWindow = null; // so we create a new one on the next if - } - - if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), - slidingWindowCounter, minAltPValueToTriggerVariant, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, - minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); - slidingWindowCounter++; - } - - stash.addAll(slidingWindow.addRead(read)); - return new Pair, CompressionStash>(reads, stash); - } - - /** - * Properly closes the compressor. - * - * @param knownSnpPositions the set of known SNP positions - * @return A non-null set/list of all reads generated - */ - @Ensures("result != null") - public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) { - return (slidingWindow != null) ? 
slidingWindow.close(knownSnpPositions) : emptyPair; - } - - /** - * Finalizes current variant regions. - * - * @param knownSnpPositions the set of known SNP positions - * @return A non-null set/list of all reads generated - */ - @Ensures("result != null") - public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { - return slidingWindow == null ? ObjectSets.EMPTY_SET : slidingWindow.closeVariantRegions(regions, knownSnpPositions); - } - -} - diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java deleted file mode 100644 index d5aa8f944..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ /dev/null @@ -1,1110 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import it.unimi.dsi.fastutil.bytes.Byte2IntArrayMap; -import it.unimi.dsi.fastutil.bytes.Byte2IntMap; -import it.unimi.dsi.fastutil.objects.*; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.util.*; - - -/** - * Created by IntelliJ IDEA. 
- * User: roger - * Date: 8/3/11 - * Time: 2:24 PM - */ -public class SlidingWindow { - - // Sliding Window data - final protected PriorityQueue readsInWindow; - final protected LinkedList windowHeader; - protected int contextSize; // the largest context size (between mismatches and indels) - protected String contig; - protected int contigIndex; - protected SAMFileHeader samHeader; - protected GATKSAMReadGroupRecord readGroupAttribute; - protected int downsampleCoverage; - - // Running consensus data - protected int consensusCounter; - protected String consensusReadName; - - // Filtered Data Consensus data - protected int filteredDataConsensusCounter; - protected String filteredDataReadName; - - // Additional parameters - protected double MIN_ALT_PVALUE_TO_TRIGGER_VARIANT; // pvalue has to be greater than this value to trigger variant region due to mismatches - protected double MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches - protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions - protected int MIN_BASE_QUAL_TO_COUNT; // qual has to be greater than or equal to this value - protected int MIN_MAPPING_QUALITY; - - protected ReduceReads.DownsampleStrategy downsampleStrategy; - private boolean hasIndelQualities; - - private static CompressionStash emptyRegions = new CompressionStash(); - - /** - * The types of synthetic reads - */ - protected enum ConsensusType { - POSITIVE_CONSENSUS, - NEGATIVE_CONSENSUS, - FILTERED - } - - public int getStopLocation() { - return getStopLocation(windowHeader); - } - - private int getStopLocation(final LinkedList header) { - return header.isEmpty() ? 
-1 : header.peekLast().getLocation(); - } - - public String getContig() { - return contig; - } - - public int getContigIndex() { - return contigIndex; - } - - public int getStartLocation(final LinkedList header) { - return header.isEmpty() ? -1 : header.peek().getLocation(); - } - - // for testing only - protected SlidingWindow(final String contig, final int contigIndex, final int startLocation) { - this.contig = contig; - this.contigIndex = contigIndex; - - contextSize = 10; - - this.windowHeader = new LinkedList<>(); - windowHeader.addFirst(new HeaderElement(startLocation)); - this.readsInWindow = new PriorityQueue<>(100, new Comparator() { - @Override - public int compare(GATKSAMRecord read1, GATKSAMRecord read2) { - return read1.getSoftEnd() - read2.getSoftEnd(); - } - }); - } - - public SlidingWindow(final String contig, final int contigIndex, final int contextSize, final SAMFileHeader samHeader, - final GATKSAMReadGroupRecord readGroupAttribute, final int windowNumber, - final double minAltPValueToTriggerVariant, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, - final int minBaseQual, final int minMappingQuality, final int downsampleCoverage, - final ReduceReads.DownsampleStrategy downsampleStrategy, final boolean hasIndelQualities) { - this.contextSize = contextSize; - this.downsampleCoverage = downsampleCoverage; - - this.MIN_ALT_PVALUE_TO_TRIGGER_VARIANT = minAltPValueToTriggerVariant; - this.MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant; - this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant; - this.MIN_BASE_QUAL_TO_COUNT = minBaseQual; - this.MIN_MAPPING_QUALITY = minMappingQuality; - - this.windowHeader = new LinkedList<>(); - this.readsInWindow = new PriorityQueue<>(1000, new Comparator() { - @Override - public int compare(GATKSAMRecord read1, GATKSAMRecord read2) { - return read1.getSoftEnd() - read2.getSoftEnd(); - } - }); - - this.contig = 
contig; - this.contigIndex = contigIndex; - this.samHeader = samHeader; - this.readGroupAttribute = readGroupAttribute; - - this.consensusCounter = 0; - this.consensusReadName = "Consensus-" + windowNumber + "-"; - - this.filteredDataConsensusCounter = 0; - this.filteredDataReadName = "Filtered-" + windowNumber + "-"; - - this.downsampleStrategy = downsampleStrategy; - this.hasIndelQualities = hasIndelQualities; - } - - /** - * Add a read to the sliding window and slides the window accordingly. - * - * Reads are assumed to be in order, therefore, when a read is added the sliding window can - * assume that no more reads will affect read.getUnclippedStart() - contextSizeMismatches. The window - * slides forward to that position and returns all reads that may have been finalized in the - * sliding process. - * - * @param read the read - * @return a non-null list of reads (in the CompressionStash) that have been finished by sliding the window. - */ - @Requires({"read != null"}) - @Ensures("result != null") - public CompressionStash addRead(GATKSAMRecord read) { - addToHeader(windowHeader, read); // update the window header counts - // no need to track low mapping quality reads - if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) - readsInWindow.add(read); // add read to sliding reads - return slideWindow(read.getUnclippedStart()); - } - - /** - * Returns the next complete (or incomplete if closeLastRegion is true) variant region between 'from' (inclusive) and 'to' (exclusive) - * but converted to global coordinates. 
- * - * @param from beginning window header index of the search window (inclusive) in local (to the windowHeader) coordinates - * @param to end window header index of the search window (exclusive) in local (to the windowHeader) coordinates - * @param variantSite boolean array with true marking variant regions - * @param closeLastRegion if the last index is variant (so it's an incomplete region), should we close (and return as an interval) the location or ignore it? - * @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region. All coordinates returned are global. - */ - @Requires({"from >= 0", "from <= to", "to <= variantSite.length"}) - private FinishedGenomeLoc findNextVariantRegion(int from, int to, boolean[] variantSite, boolean closeLastRegion) { - boolean foundStart = false; - final int windowHeaderStart = getStartLocation(windowHeader); - int variantRegionStartIndex = 0; - for (int i=from; i= 0", "from <= to", "to <= variantSite.length"}) - @Ensures("result != null") - protected CompressionStash findVariantRegions(int from, int to, boolean[] variantSite, boolean closeLastRegion) { - final int windowHeaderStart = getStartLocation(windowHeader); - - CompressionStash regions = new CompressionStash(); - int index = from; - while(index < to) { - // returns results in global coordinates - FinishedGenomeLoc result = findNextVariantRegion(index, to, variantSite, closeLastRegion); - if (result == null) - break; - - regions.add(result); - if (!result.isFinished()) - break; - - index = result.getStop() - windowHeaderStart + 1; // go back to local coordinates - } - return regions; - } - - /** - * Determines if the window can be slid given the new incoming read. - * - * We check from the start of the window to the (unclipped) start of the new incoming read if there - * is any variant. - * If there are variant sites, we check if it's time to close the variant region. 
- * - * @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start! - * @return all reads that have fallen to the left of the sliding window after the slide - */ - protected CompressionStash slideWindow(final int incomingReadUnclippedStart) { - final int windowHeaderStartLocation = getStartLocation(windowHeader); - CompressionStash regions = emptyRegions; - boolean forceClose = true; - - if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) { - markSites(incomingReadUnclippedStart); - int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation; - int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) - - regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose); - } - - while (!readsInWindow.isEmpty() && readsInWindow.peek().getSoftEnd() < windowHeaderStartLocation) { - readsInWindow.poll(); - } - - return regions; - } - - - protected final class MarkedSites { - - private boolean[] siteIsVariant = new boolean[0]; - private int startLocation = 0; - - public MarkedSites() {} - - public boolean[] getVariantSiteBitSet() { return siteIsVariant; } - - protected int getStartLocation() { return startLocation; } - - /** - * Updates the variant site bitset given the new startlocation and size of the region to mark. - * - * @param newStartLocation the new start location of the bitset - * @param sizeOfRegion the new size of the region to be represented - * - * @return the end position (newStartLocation + index) of the region marked by this method; the calling method is responsible for the remainder. 
- */ - public int updateRegion(final int newStartLocation, final int sizeOfRegion) { - int lastPositionMarked = sizeOfRegion; - - // if this is the first time we set the array and we can't reuse anything, just create a new array from scratch - if ( newStartLocation >= this.startLocation + siteIsVariant.length || newStartLocation < this.startLocation ) { - siteIsVariant = new boolean[sizeOfRegion]; - lastPositionMarked = 0; - } - // if the dimensions change, copy what we can and continue - else if ( newStartLocation != this.startLocation || sizeOfRegion != siteIsVariant.length ) { - final boolean[] tempArray = new boolean[sizeOfRegion]; - final int differenceInStartPositions = newStartLocation - this.startLocation; - lastPositionMarked = Math.min(siteIsVariant.length - differenceInStartPositions, sizeOfRegion); - System.arraycopy(siteIsVariant, differenceInStartPositions, tempArray, 0, lastPositionMarked); - siteIsVariant = null; // explicitly allow garbage collection - siteIsVariant = tempArray; - } - - this.startLocation = newStartLocation; - - return lastPositionMarked + newStartLocation; - } - } - - private final MarkedSites markedSites = new MarkedSites(); - - /** - * returns the MarkedSites object so that it can be tested after adding data to the Sliding Window - * - * @return the Marked Sites object used by this Sliding Window - */ - protected MarkedSites getMarkedSitesForTesting() { return markedSites; } - - /** - * returns an array marked with variant and non-variant regions (it uses markVariantRegion to make the marks) - * - * @param stop check the window from start to stop (not-inclusive); given in global coordinates - */ - protected void markSites(final int stop) { - - final int windowHeaderStartLocation = getStartLocation(windowHeader); - final int sizeOfMarkedRegion = stop - windowHeaderStartLocation + contextSize + 1; - - // copy over as many bits as we can from the previous calculation. 
Note that we can't trust the - // last (contextSize - 1) worth of bits because we may not have actually looked at variant regions there. - final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1; - final int locationToProcess = Math.max(windowHeaderStartLocation, Math.min(lastPositionMarked, stop - contextSize)); - - final ListIterator headerElementIterator = windowHeader.listIterator(locationToProcess - windowHeaderStartLocation); - - // process a contextSize worth of region from scratch in case there's a variant there - for (int i = locationToProcess; i < stop; i++) { - if (headerElementIterator.hasNext()) { - HeaderElement headerElement = headerElementIterator.next(); - - if (headerElement.isVariant(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) - markVariantRegion(i - windowHeaderStartLocation); - - } else - break; - } - } - - /** - * Marks the sites around the variant site (as true) - * - * @param variantSiteLocation the location where a variant site was found - */ - protected void markVariantRegion(final int variantSiteLocation) { - int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize; - int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? 
markedSites.getVariantSiteBitSet().length - 1 : variantSiteLocation + contextSize; - markRegionAs(from, to, true); - } - - /** - * Marks the sites around the variant site (as true) - * - * @param from the start index (inclusive) to mark - * @param to the end index (inclusive) to mark - * @param isVariant mark the region with this boolean value - */ - private void markRegionAs(final int from, final int to, final boolean isVariant) { - for (int i = from; i <= to; i++) - markedSites.getVariantSiteBitSet()[i] = isVariant; - } - - /** - * Adds bases to the running consensus - * - * If adding a sequence with gaps, it will finalize multiple consensus reads and keep the last running consensus - * - * @param header the header to use - * @param start the first header index to add to consensus - * @param end the first header index NOT TO add to consensus - * @param consensusType the consensus type to use - * @return a non-null list of consensus reads generated by this call. Empty list if no consensus was generated. - */ - @Requires({"start >= 0 && (end >= start || end == 0)"}) - @Ensures("result != null") - protected ObjectArrayList addToSyntheticReads(final LinkedList header, final int start, final int end, final ConsensusType consensusType) { - final ObjectArrayList reads = new ObjectArrayList<>(); - - SyntheticRead consensus = null; - final ListIterator headerElementIterator = header.listIterator(start); - boolean wasInConsensus = false; - - for ( int currentPosition = start; currentPosition < end; currentPosition++ ) { - - if ( ! 
headerElementIterator.hasNext() ) - throw new IllegalStateException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, windowHeader.size(), end)); - final HeaderElement headerElement = headerElementIterator.next(); - - if ( headerElement.hasConsensusData(consensusType) ) { - wasInConsensus = true; - - // add to running consensus - if ( consensus == null ) - consensus = createNewConsensus(consensusType, headerElement.getLocation()); - - genericAddBaseToConsensus(consensus, headerElement.getBaseCounts(consensusType)); - - } else { - - // add any outstanding consensus data - if ( wasInConsensus ) { - reads.addAll(finalizeAndAdd(consensus, consensusType)); - consensus = null; - } - - wasInConsensus = false; - } - } - - // add any outstanding consensus data - reads.addAll(finalizeAndAdd(consensus, consensusType)); - - return reads; - } - - private SyntheticRead createNewConsensus(final ConsensusType consensusType, final int start) { - if ( consensusType == ConsensusType.FILTERED ) - return new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, start, hasIndelQualities, SyntheticRead.StrandType.STRANDLESS); - return new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, start, hasIndelQualities, consensusType == ConsensusType.POSITIVE_CONSENSUS ? SyntheticRead.StrandType.POSITIVE : SyntheticRead.StrandType.NEGATIVE); - } - - /** - * Finalizes a synthetic read. 
- * - * @param consensus the consensus to finalize - * @param type the synthetic reads you want to close - * @return a possibly empty list of GATKSAMRecords generated by finalizing the synthetic reads - */ - private ObjectArrayList finalizeAndAdd(final SyntheticRead consensus, final ConsensusType type) { - - final ObjectArrayList list = new ObjectArrayList<>(); - - final GATKSAMRecord read; - if ( type == ConsensusType.FILTERED ) - read = finalizeFilteredDataConsensus(consensus); - else - read = finalizeRunningConsensus(consensus); - - if ( read != null ) - list.add(read); - - return list; - } - - /** - * Generic accessor to add base and qualities to a synthetic read - * - * @param syntheticRead the synthetic read to add to - * @param baseCounts the base counts object in the header element - */ - private void genericAddBaseToConsensus(final SyntheticRead syntheticRead, final BaseAndQualsCounts baseCounts) { - final BaseIndex base = baseCounts.baseIndexWithMostProbability(); - final int count = baseCounts.countOfBase(base); - final byte qual = baseCounts.averageQualsOfBase(base); - final byte insQual = baseCounts.averageInsertionQualsOfBase(base); - final byte delQual = baseCounts.averageDeletionQualsOfBase(base); - syntheticRead.add(base, count, qual, insQual, delQual, baseCounts.getRMS()); - } - - /** - * Method to compress a variant region and return the associated reduced reads - * - * @param start the first window header index in the variant region (inclusive) - * @param stop the last window header index of the variant region (inclusive) - * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) - * @return a non-null object representing all reads contained in the variant region - */ - @Requires({"start >= 0 && (stop >= start || stop == 0)"}) - @Ensures("result != null") - protected CloseVariantRegionResult compressVariantRegion(final int start, 
final int stop, final ObjectSortedSet knownSnpPositions) { - final CloseVariantRegionResult allReads = new CloseVariantRegionResult(stop); - - // Try to compress into a polyploid consensus - // Optimization: don't bother if there are no known SNPs here - final int hetRefPosition = (knownSnpPositions != null && knownSnpPositions.isEmpty()) ? -1 : findSinglePolyploidCompressiblePosition(start, stop); - - // Note that using the hetRefPosition protects us from trying to compress variant regions that are created by - // insertions (which we don't want because we can't confirm that they represent the same allele). - // Also, we only allow polyploid consensus creation at known sites if provided. - if ( hetRefPosition != -1 && matchesKnownPosition(windowHeader.get(hetRefPosition).getLocation(), knownSnpPositions) ) { - // try to create the polyploid consensus - allReads.reads.addAll(createPolyploidConsensus(hetRefPosition)); - allReads.stopPerformed = hetRefPosition; // we stopped at the het position - } - // if we can't create a polyploid consensus here, return all reads that overlap the variant region and remove them - // from the window header entirely; also remove all reads preceding the variant region (since they will be output - // as consensus right after compression) - else { - final int refStart = windowHeader.get(start).getLocation(); - final int refStop = windowHeader.get(stop).getLocation(); - - final ObjectList toRemoveFromWindow = new ObjectArrayList<>(); - final ObjectList toEmit = new ObjectArrayList<>(); - for ( final GATKSAMRecord read : readsInWindow ) { - if ( read.getSoftStart() <= refStop ) { - if ( read.getAlignmentEnd() >= refStart ) { - toEmit.add(read); - removeFromHeader(windowHeader, read); - } - toRemoveFromWindow.add(read); - } - } - - // remove all used reads - for ( final GATKSAMRecord read : toRemoveFromWindow ) - readsInWindow.remove(read); - - // down-sample the unreduced reads if needed - allReads.reads.addAll(downsampleCoverage > 0 ? 
downsampleVariantRegion(toEmit) : toEmit); - } - - return allReads; - } - - /** - * Determines whether the given position match one of the known sites - * - * @param targetPosition the position of the het site - * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) - * @return true if the targetPosition matches a known SNP position, false otherwise - */ - @Requires({"targetPosition >= 1 && knownSnpPositions != null"}) - protected boolean matchesKnownPosition(final int targetPosition, final ObjectSortedSet knownSnpPositions) { - final GenomeLoc targetLoc = new UnvalidatingGenomeLoc(contig, contigIndex, targetPosition, targetPosition); - return knownSnpPositions == null || knownSnpPositions.contains(targetLoc); - } - - /* - * Finds the het variant position located within start and stop (inclusive) if one exists. - * - * @param start the first header index in the region to check (inclusive) - * @param stop the last header index of the region to check (inclusive) - * @return the window header index of the single het position or -1 if either none or more than one exists - */ - @Requires("start >= 0 && (stop >= start || stop == 0)") - protected int findSinglePolyploidCompressiblePosition(final int start, final int stop) { - int hetRefPosition = -1; - - for ( int i = start; i <= stop; i++ ) { - - final int nAlleles = windowHeader.get(i).getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT); - - // we will only work on diploid non-indel cases because we just don't want to handle/test other scenarios - if ( nAlleles > 2 || nAlleles == -1 ) - return -1; - - if ( nAlleles == 2 ) { - - // make sure that there is only 1 site in the region that contains more than one allele - if ( hetRefPosition != -1 ) - return -1; - - hetRefPosition = i; - } - } - - return hetRefPosition; - } - - /* - * Checks whether there's a position 
in the header with a significant number of softclips or a variant. - * - * @param header the window header to examine - * @param positionToSkip the global position to skip in the examination (use negative number if you don't want to make use of this argument) - * @return true if there exists a position with significant softclips, false otherwise - */ - @Requires("header != null") - protected boolean hasPositionWithSignificantSoftclipsOrVariant(final List header, final int positionToSkip) { - - for ( final HeaderElement headerElement : header ) { - - if ( headerElement.getLocation() == positionToSkip ) - continue; - - if ( headerElement.hasSignificantSoftclips(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) || - headerElement.getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) != 1 ) - return true; - } - - return false; - } - - /** - * Finalizes a variant region, any adjacent synthetic reads. - * - * @param start the first window header index in the variant region (inclusive) - * @param stop the last window header index of the variant region (inclusive) - * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) - * @return a non-null object representing all reads contained in the variant region plus any adjacent synthetic reads - */ - @Requires({"start >= 0 && (stop >= start || stop == 0)"}) - @Ensures("result != null") - protected CloseVariantRegionResult closeVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { - final CloseVariantRegionResult allReads = compressVariantRegion(start, stop, knownSnpPositions); - allReads.reads.addAll(addAllSyntheticReadTypes(0, allReads.stopPerformed + 1)); - return allReads; - } - - /** - * Adds reads for all possible strands (positive, negative, filtered) from the global windowHeader object - * - * @param start 
the start position (inclusive) - * @param end the end position (exclusive) - * @return non-null but possibly empty array list with reduced reads - */ - private ObjectArrayList addAllSyntheticReadTypes(final int start, final int end) { - final ObjectArrayList reads = new ObjectArrayList<>(); - reads.addAll(addToSyntheticReads(windowHeader, start, end, ConsensusType.POSITIVE_CONSENSUS)); - reads.addAll(addToSyntheticReads(windowHeader, start, end, ConsensusType.NEGATIVE_CONSENSUS)); - reads.addAll(addToSyntheticReads(windowHeader, start, end, ConsensusType.FILTERED)); - return reads; - } - - /* - * @see #closeVariantRegions(CompressionStash, ObjectSortedSet, boolean) with forceCloseFullRegions set to false - */ - public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { - return closeVariantRegions(regions, knownSnpPositions, false); - } - - private static final class CloseVariantRegionResult { - final private ObjectList reads = new ObjectArrayList<>(); - private int stopPerformed; - - public CloseVariantRegionResult(final int stopPerformed) { this.stopPerformed = stopPerformed; } - } - - /* - * Finalizes the list of regions requested (and any regions preceding them) - * - * @param regions the list of regions to finalize - * @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere) - * @param forceCloseFullRegions if true, requires this method to make sure all regions are fully closed; otherwise, we may decide not to close up to the very end (e.g. 
during het compression) - * @return a non-null set of reduced reads representing the finalized regions - */ - public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions, final boolean forceCloseFullRegions) { - final ObjectAVLTreeSet allReads = new ObjectAVLTreeSet<>(new AlignmentStartWithNoTiesComparator()); - if ( !regions.isEmpty() ) { - - int windowHeaderStart = getStartLocation(windowHeader); - HeaderElement lastCleanedElement = null; - - for ( final GenomeLoc region : regions ) { - if (((FinishedGenomeLoc)region).isFinished() && region.getContig().equals(contig) && region.getStart() >= windowHeaderStart && region.getStop() < windowHeaderStart + windowHeader.size()) { - final int start = region.getStart() - windowHeaderStart; - int stop = region.getStop() - windowHeaderStart; - - // make sure the bitset is complete given the region (it might not be in multi-sample mode) - if ( region.getStop() > markedSites.getStartLocation() + markedSites.getVariantSiteBitSet().length - 1 ) - markSites(region.getStop()); - - CloseVariantRegionResult closeVariantRegionResult = closeVariantRegion(start, stop, knownSnpPositions); - allReads.addAll(closeVariantRegionResult.reads); - - // check whether we didn't close the whole region that was requested - if ( stop > 0 && closeVariantRegionResult.stopPerformed < stop ) { - // we should update the variant sites bitset because the context size's worth of bases after the variant position are no longer "variant" - markRegionAs(closeVariantRegionResult.stopPerformed + 1, stop, false); - - // if the calling method said that it didn't care then we are okay so update the stop - if ( !forceCloseFullRegions ) { - stop = closeVariantRegionResult.stopPerformed; - } - // otherwise, we need to forcibly push the stop that we originally requested - else { - while ( closeVariantRegionResult.stopPerformed < stop ) { - // first clean up used header elements so they don't get reused - for ( int i = 0; 
i <= closeVariantRegionResult.stopPerformed; i++ ) - windowHeader.remove(); - stop -= (closeVariantRegionResult.stopPerformed + 1); - - closeVariantRegionResult = closeVariantRegion(0, stop, knownSnpPositions); - allReads.addAll(closeVariantRegionResult.reads); - } - } - } - - // We need to clean up the window header elements up until the end of the requested region so that they don't get used for future regions. - // Note that this cleanup used to happen outside the above for-loop, but that was causing an occasional doubling of the reduced reads - // (in the case where there are multiple regions to close we'd reuse the reads for each region). - if ( stop >= 0 ) { - for ( int i = 0; i < stop; i++ ) - windowHeader.remove(); - lastCleanedElement = windowHeader.remove(); - windowHeaderStart = getStartLocation(windowHeader); - } - } - } - - // we need to keep the last element of the last cleaned region in the event that the following element has a read that starts with an insertion. - if ( lastCleanedElement != null && lastCleanedElement.hasInsertionToTheRight() ) - windowHeader.addFirst(new HeaderElement(lastCleanedElement.getLocation(), lastCleanedElement.numInsertionsToTheRight())); - } - - return allReads; - } - - /** - * Downsamples a variant region to the downsample coverage of the sliding window. 
- * - * It will use the downsampling strategy defined by the SlidingWindow - * - * @param allReads a non-null list of reads to select from (all reads that cover the window) - * @return a non-null list of reads selected by the downsampler to cover the window to at least the desired coverage - */ - @Requires({"allReads != null"}) - @Ensures("result != null") - protected ObjectList downsampleVariantRegion(final ObjectList allReads) { - int nReads = allReads.size(); - if (nReads == 0) - return allReads; - - if (downsampleCoverage >= nReads) - return allReads; - - ReservoirDownsampler downsampler = new ReservoirDownsampler<>(downsampleCoverage); - downsampler.submit(allReads); - return new ObjectArrayList<>(downsampler.consumeFinalizedItems()); - } - - - /** - * Properly closes a Sliding Window, finalizing all consensus and variant - * regions that still exist regardless of being able to fulfill the - * context size requirement in the end. - * - * @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere) - * @return A non-null set/list of all reads generated - */ - @Ensures("result != null") - public Pair, CompressionStash> close(final ObjectSortedSet knownSnpPositions) { - // mark variant regions - ObjectSet finalizedReads = new ObjectAVLTreeSet<>(new AlignmentStartWithNoTiesComparator()); - CompressionStash regions = new CompressionStash(); - - if (!windowHeader.isEmpty()) { - markSites(getStopLocation(windowHeader) + 1); - regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), true); - finalizedReads = closeVariantRegions(regions, knownSnpPositions, true); - - if (!windowHeader.isEmpty()) - finalizedReads.addAll(addAllSyntheticReadTypes(0, windowHeader.size())); - } - - return new Pair<>(finalizedReads, regions); - } - - /** - * generates the SAM record for the running consensus read and resets it (to null) - * - * @param runningConsensus the consensus to finalize - * @return the 
read contained in the running consensus or null - */ - protected GATKSAMRecord finalizeRunningConsensus(final SyntheticRead runningConsensus) { - GATKSAMRecord finalizedRead = null; - - if ( runningConsensus != null ) { - if ( runningConsensus.size() > 0 ) - finalizedRead = runningConsensus.close(); - else - consensusCounter--; - } - - return finalizedRead; - } - - /** - * generates the SAM record for the filtered data consensus and resets it (to null) - * - * @param filteredDataConsensus the consensus to finalize - * @return the read contained in the running consensus or null - */ - protected GATKSAMRecord finalizeFilteredDataConsensus(final SyntheticRead filteredDataConsensus) { - GATKSAMRecord finalizedRead = null; - if (filteredDataConsensus != null) { - if (filteredDataConsensus.size() > 0) - finalizedRead = filteredDataConsensus.close(); - else - filteredDataConsensusCounter--; - } - return finalizedRead; - } - - // define this so that we can use Java generics below - private final static class HeaderElementList extends LinkedList {} - - private final static class SingleStrandConsensusData { - final HeaderElementList consensus = new HeaderElementList(); - final ObjectList reads = new ObjectArrayList<>(); - } - - /** - * Finalizes a variant region - and any adjacent synthetic reads - for point mutations (indel sites are not - * supported) with polyploid compression. - * - * @param hetRefPosition window header index of the het site; MUST NOT BE AN INDEL SITE! 
- * @return a non-null list of all reads contained in the variant region as a polyploid consensus - */ - @Requires({"start >= 0 && (stop >= start || stop == 0)"}) - @Ensures({"result != null"}) - protected ObjectList createPolyploidConsensus(final int hetRefPosition) { - // we will create two (positive strand, negative strand) headers for each haplotype - final SingleStrandConsensusData[] headersPosStrand = new SingleStrandConsensusData[2]; - final SingleStrandConsensusData[] headersNegStrand = new SingleStrandConsensusData[2]; - - final int globalHetRefPosition = windowHeader.get(hetRefPosition).getLocation(); - - // initialize the mapping from base (allele) to header - final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2); - alleleHeaderMap.defaultReturnValue(-1); - for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_ALT_PROPORTION_TO_TRIGGER_VARIANT) ) { - final int currentIndex = alleleHeaderMap.size(); - if ( currentIndex > 1 ) - throw new IllegalStateException("There are more than 2 alleles present when creating a diploid consensus"); - - alleleHeaderMap.put(allele.b, currentIndex); - headersPosStrand[currentIndex] = new SingleStrandConsensusData(); - headersNegStrand[currentIndex] = new SingleStrandConsensusData(); - } - - // sanity check that we saw 2 alleles - if ( alleleHeaderMap.size() != 2 ) - throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + alleleHeaderMap.size()); - - final ObjectList readsToRemove = new ObjectArrayList<>(); - - for ( final GATKSAMRecord read : readsInWindow ) { - - // if the read falls after the het position, just skip it for now (we'll get to it later) - if ( read.getSoftStart() > globalHetRefPosition ) - continue; - - // remove all other reads from the read cache since we're going to use them here - readsToRemove.add(read); - - // if the read falls before the het position or has low MQ, we don't need to 
look at it - if ( read.getSoftEnd() < globalHetRefPosition || read.getMappingQuality() < MIN_MAPPING_QUALITY) - continue; - - // remove all spanning reads from the consensus header since we're going to incorporate them into a consensus here instead - removeFromHeader(windowHeader, read); - - // where on the read is the het position? - final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); - - // this is safe because indels are not supported - final byte base = read.getReadBases()[readPosOfHet]; - - // check which allele this read represents - final int allele = alleleHeaderMap.get(base); - - // ignore the read if it represents a base that's not part of the consensus - if ( allele != -1 ) { - // add to the appropriate polyploid header - final SingleStrandConsensusData header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele]; - header.reads.add(read); - addToHeader(header.consensus, read); - } - } - - for ( final GATKSAMRecord read : readsToRemove ) - readsInWindow.remove(read); - - // create the polyploid synthetic reads if we can - final ObjectList hetReads = new ObjectArrayList<>(); - - // sanity check that no new "variant region" exists on just a single consensus strand due to softclips - // or multi-allelic sites now that we've broken everything out into their component parts. if one does - // exist then we need to back out the consensus for that strand only. 
- for ( final SingleStrandConsensusData header : headersPosStrand ) { - if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) ) - hetReads.addAll(header.reads); - else - finalizeHetConsensus(header.consensus, false, hetReads); - } - for ( final SingleStrandConsensusData header : headersNegStrand ) { - if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) ) - hetReads.addAll(header.reads); - else - finalizeHetConsensus(header.consensus, true, hetReads); - } - - return hetReads; - } - - /* - * Finalizes a particular het consensus for the given header representation - * - * @param header the list of header elements representing the header for the consensus - * @param isNegativeStrand does this header represent reads on the negative strand? - * @param result list in which to store results - */ - protected void finalizeHetConsensus(final LinkedList header, final boolean isNegativeStrand, final ObjectList result) { - if ( header.size() > 0 ) { - if ( isNegativeStrand ) - result.addAll(addToSyntheticReads(header, 0, header.size(), ConsensusType.NEGATIVE_CONSENSUS)); - else - result.addAll(addToSyntheticReads(header, 0, header.size(), ConsensusType.POSITIVE_CONSENSUS)); - } - } - - private void addToHeader(LinkedList header, GATKSAMRecord read) { - updateHeaderCounts(header, read, false); - } - - private void removeFromHeader(LinkedList header, GATKSAMRecord read) { - updateHeaderCounts(header, read, true); - } - - /** - * Updates the sliding window's header counts with the incoming read bases, insertions - * and deletions. 
- * - * @param header the sliding window header to use - * @param read the incoming read to be added to the sliding window - * @param removeRead if we are removing the read from the header or adding - */ - protected void updateHeaderCounts(final LinkedList header, final GATKSAMRecord read, final boolean removeRead) { - final int readStart = read.getSoftStart(); - final int headerStart = getStartLocation(header); - int locationIndex = headerStart < 0 ? 0 : readStart - headerStart; - - if ( removeRead && locationIndex < 0 ) - throw new IllegalStateException("Provided read is behind the Sliding Window! Read = " + read + ", readStart = " + readStart + ", cigar = " + read.getCigarString() + ", window = " + headerStart + "-" + getStopLocation(header)); - - // we only need to create new header elements if we are adding the read, not when we're removing it - if ( !removeRead ) - locationIndex = createNewHeaderElements(header, read, locationIndex); - - actuallyUpdateHeaderForRead(header, read, removeRead, locationIndex); - } - - /* - * Creates new header elements if needed for the given read. - * - * @param header the sliding window header to use - * @param read the incoming read to be added to the sliding window - * @param startIndex the start location index into the header for this read - * - * @return an updated index into the modified header - */ - @Requires("header != null && read != null") - protected int createNewHeaderElements(final LinkedList header, final GATKSAMRecord read, final int startIndex) { - - int headerStart = getStartLocation(header); - int locationIndex = startIndex; - - // Do we need to add extra elements before the start of the header? 
This could happen if the previous read was - // clipped and this alignment starts before the beginning of the window - final int readStart = read.getSoftStart(); - if ( startIndex < 0 ) { - for ( int i = 1; i <= -startIndex; i++ ) - header.addFirst(new HeaderElement(headerStart - i)); - - // update the start location accordingly - headerStart = readStart; - locationIndex = 0; - } - - // Do we need to add extra elements to the end of the header? - final int headerStop = getStopLocation(header); - final int readEnd = read.getSoftEnd(); - if ( headerStop < readEnd ) { - final int elementsToAdd = (headerStop < 0) ? readEnd - readStart + 1 : readEnd - headerStop; - for ( int i = elementsToAdd - 1; i >= 0; i-- ) - header.addLast(new HeaderElement(readEnd - i)); - } - - // Special case for leading insertions before the beginning of the sliding read - if ( (readStart == headerStart || headerStart < 0) && ReadUtils.readStartsWithInsertion(read.getCigar(), false) != null ) { - // create a new first element to the window header with no bases added - header.addFirst(new HeaderElement(readStart - 1)); - // this allows the first element (I) to look at locationIndex - 1 when we update the header and do the right thing - locationIndex = 1; - } - - return locationIndex; - } - - /* - * Actually updates the sliding window's header counts with the incoming read bases and quals (including insertion and deletion quals). 
- * - * @param header the sliding window header to use - * @param read the incoming read to be added to the sliding window - * @param removeRead if we are removing the read from the header or adding - * @param startIndex the start location index into the header for this read - */ - @Requires("header != null && read != null && startIndex >= 0") - protected void actuallyUpdateHeaderForRead(final LinkedList header, final GATKSAMRecord read, final boolean removeRead, final int startIndex) { - - final Iterator headerElementIterator = header.listIterator(startIndex); - final int mappingQuality = read.getMappingQuality(); - final boolean isNegativeStrand = read.getReadNegativeStrandFlag(); - - // iterator variables - int locationIndex = startIndex; - int readBaseIndex = 0; - HeaderElement headerElement; - - for ( final CigarElement cigarElement : read.getCigar().getCigarElements() ) { - switch ( cigarElement.getOperator() ) { - case H: - break; - case I: - readBaseIndex += cigarElement.getLength(); - - // special case, if we don't have the previous header element anymore, don't worry about it. - if ( locationIndex == 0 ) - break; - - // insertions are added to the base to the left (previous element) - headerElement = header.get(locationIndex - 1); - - if ( removeRead ) - headerElement.removeInsertionToTheRight(); - else - headerElement.addInsertionToTheRight(); - - break; - case D: - // deletions are added to the baseCounts with the read mapping quality as its quality score - final int nDeletionBases = cigarElement.getLength(); - final byte MQbyte = mappingQuality > Byte.MAX_VALUE ? 
Byte.MAX_VALUE : (byte)mappingQuality; - for ( int i = 0; i < nDeletionBases; i++ ) { - headerElement = headerElementIterator.next(); - if (removeRead) - headerElement.removeBase(BaseUtils.Base.D.base, MQbyte, MQbyte, MQbyte, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false, isNegativeStrand); - else - headerElement.addBase(BaseUtils.Base.D.base, MQbyte, MQbyte, MQbyte, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false, isNegativeStrand); - } - locationIndex += nDeletionBases; - break; - case S: - case M: - case P: - case EQ: - case X: - final int nBasesToAdd = cigarElement.getLength(); - final boolean isSoftClip = cigarElement.getOperator() == CigarOperator.S; - final byte[] readBases = read.getReadBases(); - final byte[] readQuals = read.getBaseQualities(); - final boolean readHasIndelQuals = read.hasBaseIndelQualities(); - final byte[] insertionQuals = readHasIndelQuals ? read.getBaseInsertionQualities() : null; - final byte[] deletionQuals = readHasIndelQuals ? read.getBaseDeletionQualities() : null; - - for ( int i = 0; i < nBasesToAdd; i++ ) { - headerElement = headerElementIterator.next(); - final byte insertionQuality = readHasIndelQuals ? insertionQuals[readBaseIndex] : -1; - final byte deletionQuality = readHasIndelQuals ? 
deletionQuals[readBaseIndex] : -1; - - if ( removeRead ) - headerElement.removeBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip, isNegativeStrand); - else - headerElement.addBase(readBases[readBaseIndex], readQuals[readBaseIndex], insertionQuality, deletionQuality, mappingQuality, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, isSoftClip, isNegativeStrand); - - readBaseIndex++; - } - locationIndex += nBasesToAdd; - break; - default: - break; - } - } - } -} - diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java deleted file mode 100644 index 9d16ea06f..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java +++ /dev/null @@ -1,369 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. 
NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import com.google.java.contract.Requires; -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Iterator; - - -/** - * Running Consensus is a read that is compressed as a sliding window travels over the reads - * and keeps track of all the bases that are outside of variant regions. - * - * Consensus reads have qual fields that correspond to the number of reads that had the base - * and passed the minimum quality threshold. - * - * The mapping quality of a consensus read is the average RMS of the mapping qualities of all reads - * that compose the consensus - * - * @author Mauricio Carneiro - * @since 8/26/11 - */ -public class SyntheticRead { - - /** - * The types of strandedness for synthetic reads - */ - public enum StrandType { - POSITIVE, - NEGATIVE, - STRANDLESS - } - - // Rather than storing a separate list for each attribute in SingleBaseInfo, store one list to reduce memory footprint. - private static class SingleBaseInfo { - byte baseIndexOrdinal; // enum BaseIndex.ordinal - int count; - byte qual; - byte insertionQual; - byte deletionQual; - - SingleBaseInfo(byte baseIndexOrdinal, int count, byte qual, byte insertionQual, byte deletionQual) { - this.baseIndexOrdinal = baseIndexOrdinal; - this.count = count; - this.qual = qual; - this.insertionQual = insertionQual; - this.deletionQual = deletionQual; - } - } - - // This class is merely sharing of code for convertVariableGivenBases(). 
- private abstract class SingleBaseInfoIterator implements Iterator { - final Iterator it; - - SingleBaseInfoIterator() { - this.it = basesCountsQuals.iterator(); - } - - public boolean hasNext() { - return it.hasNext(); - } - - public void remove() { - throw new UnsupportedOperationException(); - } - } - - - // Map from ordinal to enum value. - private static final BaseIndex[] BaseIndexByOrdinal = new BaseIndex[BaseIndex.values().length]; - static - { - for (final BaseIndex baseIndex : BaseIndex.values()) { - BaseIndexByOrdinal[baseIndex.ordinal()] = baseIndex; - } - } - - - private final ObjectArrayList basesCountsQuals; - private double mappingQuality; - - // Information to produce a GATKSAMRecord - private SAMFileHeader header; - private GATKSAMReadGroupRecord readGroupRecord; - private String contig; - private int contigIndex; - private String readName; - private int refStart; - private boolean hasIndelQualities = false; - private StrandType strandType = StrandType.STRANDLESS; - - /** - * Full initialization of the running consensus if you have all the information and are ready to - * start adding to the running consensus. 
- * - * @param header GATKSAMRecord file header - * @param readGroupRecord Read Group for the GATKSAMRecord - * @param contig the read's contig name - * @param contigIndex the read's contig index - * @param readName the read's name - * @param refStart the alignment start (reference based) - */ - public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, StrandType strandType) { - final int initialCapacity = 10000; - basesCountsQuals = new ObjectArrayList(initialCapacity); - mappingQuality = 0.0; - - this.header = header; - this.readGroupRecord = readGroupRecord; - this.contig = contig; - this.contigIndex = contigIndex; - this.readName = readName; - this.refStart = refStart; - this.hasIndelQualities = hasIndelQualities; - this.strandType = strandType; - } - - /** - * Easy access to keep adding to a running consensus that has already been - * initialized with the correct read name and refStart - * - * @param base the base to add - * @param count number of reads with this base - */ - @Requires("count <= Byte.MAX_VALUE") - public void add(BaseIndex base, int count, byte qual, byte insQual, byte delQual, double mappingQuality) { - basesCountsQuals.add(new SingleBaseInfo(base.getOrdinalByte(), count, qual, insQual, delQual)); - this.mappingQuality += mappingQuality; - } - - public BaseIndex getBase(final int readCoordinate) { - return BaseIndexByOrdinal[basesCountsQuals.get(readCoordinate).baseIndexOrdinal]; - } - - public int getRefStart() { - return refStart; - } - - /** - * Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid. 
- * - * Invalid reads are : - * - exclusively composed of deletions - * - * @return a GATKSAMRecord or null - */ - public GATKSAMRecord close () { - if (isAllDeletions()) - return null; - - GATKSAMRecord read = new GATKSAMRecord(header); - read.setReferenceName(contig); - read.setReferenceIndex(contigIndex); - read.setReadPairedFlag(false); - read.setReadUnmappedFlag(false); - if ( strandType != StrandType.STRANDLESS ) { - read.setAttribute(GATKSAMRecord.REDUCED_READ_STRANDED_TAG, '1'); // must come before next line - read.setReadNegativeStrandFlag(strandType == StrandType.NEGATIVE); - } - read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions) - read.setAlignmentStart(refStart); - read.setReadName(readName); - read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION); - read.setReadBases(convertReadBases()); - read.setMappingQuality((int) Math.ceil(mappingQuality / basesCountsQuals.size())); - read.setReadGroup(readGroupRecord); - read.setReducedReadCountsTag(convertBaseCounts()); - - if (hasIndelQualities) { - read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION); - read.setBaseQualities(convertDeletionQualities(), EventType.BASE_DELETION); - } - - return read; - } - - /** - * Checks if the synthetic read is composed exclusively of deletions - * - * @return true if it is, false if it isn't. 
- */ - private boolean isAllDeletions() { - for (SingleBaseInfo b : basesCountsQuals) - if (b.baseIndexOrdinal != BaseIndex.D.getOrdinalByte()) - return false; - return true; - } - - public int size () { - return basesCountsQuals.size(); - } - - private byte [] convertBaseQualities() { - return convertVariableGivenBases(new SingleBaseInfoIterator() { - public Byte next() { - return it.next().qual; - } - }); - } - - private byte [] convertInsertionQualities() { - return convertVariableGivenBases(new SingleBaseInfoIterator() { - public Byte next() { - return it.next().insertionQual; - } - }); - } - - private byte [] convertDeletionQualities() { - return convertVariableGivenBases(new SingleBaseInfoIterator() { - public Byte next() { - return it.next().deletionQual; - } - }); - } - - protected int[] convertBaseCounts() { - int[] variableArray = new int[getReadLengthWithNoDeletions()]; - int i = 0; - for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) { - if (singleBaseInfo.baseIndexOrdinal != BaseIndex.D.getOrdinalByte()) - variableArray[i++] = singleBaseInfo.count; - } - return variableArray; - } - - private byte [] convertReadBases() { - byte [] readArray = new byte[getReadLengthWithNoDeletions()]; - int i = 0; - for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) { - final BaseIndex baseIndex = BaseIndexByOrdinal[singleBaseInfo.baseIndexOrdinal]; - if (baseIndex != BaseIndex.D) - readArray[i++] = baseIndex.getByte(); - } - - return readArray; - } - - /** - * Builds the cigar string for the synthetic read - * - * Warning: if the synthetic read has leading deletions, it will shift the refStart (alignment start) of the read. 
- * - * @return the cigar string for the synthetic read - */ - private Cigar buildCigar() { - ObjectArrayList cigarElements = new ObjectArrayList(); - CigarOperator cigarOperator = null; - int length = 0; - for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) { - final BaseIndex b = BaseIndexByOrdinal[singleBaseInfo.baseIndexOrdinal]; - CigarOperator op; - switch (b) { - case D: - op = CigarOperator.DELETION; - break; - case I: - throw new ReviewedStingException("Trying to create an insertion in a synthetic read. This operation is currently unsupported."); - default: - op = CigarOperator.MATCH_OR_MISMATCH; - break; - } - if (cigarOperator == null) { - if (op == CigarOperator.D) // read cannot start with a deletion - refStart++; // if it does, we need to move the reference start forward - else - cigarOperator = op; - } - else if (cigarOperator != op) { // if this is a new operator, we need to close the previous one - cigarElements.add(new CigarElement(length, cigarOperator)); // close previous operator - cigarOperator = op; - length = 0; - } - - if (cigarOperator != null) // only increment the length of the cigar element if we really added it to the read (no leading deletions) - length++; - } - if (length > 0 && cigarOperator != CigarOperator.D) // read cannot end with a deletion - cigarElements.add(new CigarElement(length, cigarOperator)); // add the last cigar element - - return new Cigar(cigarElements); - } - - /** - * Shared functionality for all conversion utilities - * - * @param variableIterator the list to convert - * @return a converted variable given the bases and skipping deletions - */ - - private byte [] convertVariableGivenBases (Iterator variableIterator) { - byte [] variableArray = new byte[getReadLengthWithNoDeletions()]; - int i = 0; - for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) { - byte count = variableIterator.next(); - if (singleBaseInfo.baseIndexOrdinal != BaseIndex.D.getOrdinalByte()) - variableArray[i++] = count; - } 
- return variableArray; - } - - /** - * Shared functionality for all conversion utilities - * - * @return the length of the read with no deletions - */ - private int getReadLengthWithNoDeletions() { - int readLength = basesCountsQuals.size(); - for (final SingleBaseInfo singleBaseInfo : basesCountsQuals) - if (singleBaseInfo.baseIndexOrdinal == BaseIndex.D.getOrdinalByte()) - readLength--; - return readLength; - } - - -} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index f3b26f295..77c51f88b 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -187,18 +187,6 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { if ( qual == 0 ) return 0; - if ( elt.getRead().isReducedRead() ) { - // reduced read representation - if ( BaseUtils.isRegularBase( obsBase )) { - int representativeCount = elt.getRepresentativeCount(); - add(obsBase, qual, (byte)0, (byte)0, representativeCount); // fast calculation of n identical likelihoods - return representativeCount; // we added nObs bases here - } - - // odd bases or deletions => don't use them - return 0; - } - return add(obsBase, qual, (byte)0, (byte)0, 1); } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index 7ce736b0c..a57502bc0 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -140,11 +140,10 @@ public class ErrorModel { Allele refAllele = refSampleVC.getReference(); if ( refSampleVC.isIndel()) { - final int readCounts[] = new int[refSamplePileup.getNumberOfElements()]; //perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()]; final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles()); if (!haplotypeMap.isEmpty()) - perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap, readCounts); + perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap); } int idx = 0; for (PileupElement refPileupElement : refSamplePileup) { diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java index 2f2a93fa4..530ba3ef8 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -193,8 +193,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype if (!hasReferenceSampleData) { - final int readCounts[] = new int[pileup.getNumberOfElements()]; - readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap, readCounts); + readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, 
perReadAlleleLikelihoodMap); n = readHaplotypeLikelihoods.length; } else { Allele refAllele = null; diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index f48ae81cf..95d3fb78b 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -132,7 +132,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { int count = 0; for ( PileupElement p : pileup ) { if ( BaseUtils.isRegularBase( p.getBase() ) ) - count += p.getRepresentativeCount(); + count++; } return count; diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 4a3231b3e..ae2ea427b 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -253,7 +253,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood int count = 0; for (PileupElement p : pileup) { if (p.isDeletion() || BaseUtils.isRegularBase(p.getBase())) - count += p.getRepresentativeCount(); + count++; } return count; diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index aa334f680..c5070a76f 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -608,7 +608,7 @@ public class UnifiedGenotyperEngine { int numDeletions = 0; for ( final PileupElement p : rawContext.getBasePileup() ) { if ( p.isDeletion() ) - numDeletions += p.getRepresentativeCount(); + numDeletions++; } if ( ((double) numDeletions) / ((double) rawContext.getBasePileup().depthOfCoverage()) > UAC.MAX_DELETION_FRACTION ) { return null; diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java index b1db23d74..55a1c5dba 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -59,7 +59,6 @@ import org.broadinstitute.sting.utils.pairhmm.*; import org.broadinstitute.sting.utils.recalibration.covariates.RepeatCovariate; import org.broadinstitute.sting.utils.recalibration.covariates.RepeatLengthCovariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.*; import java.io.File; @@ -419,8 +418,7 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation for( final Map.Entry> entry : stratifiedReadMap.get(sample).getLikelihoodReadMap().entrySet() ) { // Compute log10(10^x1/2 + 10^x2/2) 
= log10(10^x1+10^x2)-log10(2) // First term is approximated by Jacobian log with table lookup. - haplotypeLikelihood += ReadUtils.getMeanRepresentativeReadCount( entry.getKey() ) * - ( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF ); + haplotypeLikelihood += ( MathUtils.approximateLog10SumLog10(entry.getValue().get(iii_allele), entry.getValue().get(jjj_allele)) + MathUtils.LOG_ONE_HALF ); } } haplotypeLikelihoodMatrix[iii][jjj] = haplotypeLikelihood; diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java index e1471ab33..a48ac9ee0 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReadErrorCorrector.java @@ -233,10 +233,6 @@ public class ReadErrorCorrector { */ @Requires("inputRead != null") private GATKSAMRecord correctRead(final GATKSAMRecord inputRead) { - // no support for reduced reads (which shouldn't need to be error-corrected anyway!) 
- if (inputRead.isReducedRead()) - return inputRead; - // do actual correction boolean corrected = false; final byte[] correctedBases = inputRead.getReadBases(); diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java index 7f7e65817..5ef310498 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModel.java @@ -297,13 +297,13 @@ public class ReferenceConfidenceModel { if( hqSoftClips != null && p.isNextToSoftClip() ) { hqSoftClips.add(AlignmentUtils.calcNumHighQualitySoftClips(p.getRead(), (byte) 28)); } - result.AD_Ref_Any[1] += p.getRepresentativeCount(); + result.AD_Ref_Any[1]++; } else { - result.AD_Ref_Any[0] += p.getRepresentativeCount(); + result.AD_Ref_Any[0]++; } - result.genotypeLikelihoods[AA] += p.getRepresentativeCount() * QualityUtils.qualToProbLog10(qual); - result.genotypeLikelihoods[AB] += p.getRepresentativeCount() * MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF ); - result.genotypeLikelihoods[BB] += p.getRepresentativeCount() * QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD; + result.genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual); + result.genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + MathUtils.LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD + MathUtils.LOG_ONE_HALF ); + result.genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + MathUtils.LOG_ONE_THIRD; } } @@ -484,7 +484,7 
@@ public class ReferenceConfidenceModel { // todo -- this code really should handle CIGARs directly instead of relying on the above tests if ( isReadInformativeAboutIndelsOfSize(read.getReadBases(), read.getBaseQualities(), offset, ref, pileupOffsetIntoRef, maxIndelSize) ) { - nInformative += p.getRepresentativeCount(); + nInformative++; if( nInformative > MAX_N_INDEL_INFORMATIVE_READS ) { return MAX_N_INDEL_INFORMATIVE_READS; } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java index 55ff2f978..6574e8295 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java @@ -239,14 +239,14 @@ public class HaplotypeGraph extends ReadThreadingGraph { super(kmerSize); referenceHaplotype = findReferenceHaplotypeOrFail(haplotypes); this.haplotypes = new LinkedHashSet<>(haplotypes); - addSequence("anonymous", referenceHaplotype.getBases(), null, true); + addSequence("anonymous", referenceHaplotype.getBases(), true); for (final Haplotype h : haplotypes) { if (h.isReference()) continue; if (h.length() < kmerSize) { Utils.warnUser(logger, "haplotype shorter than kmerSize " + h.length() + " < " + kmerSize + " will be dropped"); } else - addSequence("anonymous", h.getBases(), null, false); + addSequence("anonymous", h.getBases(), false); } buildGraphIfNecessary(); diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index 
f33a4883f..e158ef613 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -151,14 +151,12 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, minBaseQualityToUseInAssembly, numPruningSamples); // add the reference sequence to the graph - rtgraph.addSequence("ref", refHaplotype.getBases(), null, true); + rtgraph.addSequence("ref", refHaplotype.getBases(), true); // add the artificial GGA haplotypes to the graph int hapCount = 0; for ( final Haplotype h : activeAlleleHaplotypes ) { - final int[] counts = new int[h.length()]; - Arrays.fill(counts, GGA_MODE_ARTIFICIAL_COUNTS); - rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), counts, false); + rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), GGA_MODE_ARTIFICIAL_COUNTS, false); } // Next pull kmers out of every read and throw them on the graph diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java index dc057294e..7fdfa4301 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -199,16 +199,25 @@ public class ReadThreadingGraph extends BaseGraph start + kmerSize in seqForKmers - */ - private int getCountGivenKmerStart(final SequenceForKmers seqForKmers, final int kmerStart) { - return 
seqForKmers.getCount(kmerStart + kmerSize - 1); + sampleSequences.add(new SequenceForKmers(seqName, sequence, start, stop, count, isRef)); } /** @@ -276,9 +272,7 @@ public class ReadThreadingGraph extends BaseGraph %s via %s at %d with suffix %s vs. %s", // prev, vertex, edge, offset, (char)suffix, (char)seqBase)); if ( suffix == seqBase && (increaseCountsThroughBranches || inDegreeOf(vertex) == 1) ) { - edge.incMultiplicity(seqForKmers.getCount(offset)); + edge.incMultiplicity(seqForKmers.count); increaseCountsInMatchedKmers(seqForKmers, prev, originalKmer, offset-1); } } @@ -780,7 +774,7 @@ public class ReadThreadingGraph extends BaseGraph 1 for reduced reads) + * @param count the number of observations of this kmer in graph (can be > 1 for GGA) * @param isRef is this the reference sequence? * @return a non-null vertex connecting prevVertex to in the graph based on sequence */ @@ -819,7 +813,6 @@ public class ReadThreadingGraph extends BaseGraph= kmerSize ) { // if the sequence is long enough to get some value out of, add it to the graph final String name = read.getReadName() + "_" + start + "_" + end; - addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, end, reducedReadCounts, false); + addSequence(name, read.getReadGroup().getSample(), read.getReadBases(), start, end, 1, false); } lastGood = -1; // reset the last good base diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java index a4bc0c1c8..e55772657 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmers.java @@ -58,36 +58,23 @@ final class 
SequenceForKmers { final String name; final byte[] sequence; final int start, stop; - final private int[] counts; + final int count; final boolean isRef; /** * Create a new sequence for creating kmers */ - SequenceForKmers(final String name, byte[] sequence, int start, int stop, int[] counts, boolean ref) { + SequenceForKmers(final String name, byte[] sequence, int start, int stop, int count, boolean ref) { if ( start < 0 ) throw new IllegalArgumentException("Invalid start " + start); if ( stop < start ) throw new IllegalArgumentException("Invalid stop " + stop); if ( sequence == null ) throw new IllegalArgumentException("Sequence is null "); - if ( counts != null && counts.length != sequence.length ) throw new IllegalArgumentException("Sequence and counts don't have the same length " + sequence.length + " vs " + counts.length); + if ( count < 1 ) throw new IllegalArgumentException("Invalid count " + count); this.name = name; this.sequence = sequence; this.start = start; this.stop = stop; + this.count = count; this.isRef = ref; - this.counts = counts; - } - - /** - * Get the number of observations of the kmer starting at i in this sequence - * - * Can we > 1 because sequence may be a reduced read and therefore count as N observations - * - * @param i the offset into sequence for the start of the kmer - * @return a count >= 1 that indicates the number of observations of kmer starting at i in this sequence. - */ - public int getCount(final int i) { - if ( i < 0 || i > sequence.length ) throw new ArrayIndexOutOfBoundsException("i must be >= 0 and <= " + sequence.length + " but got " + i); - return counts == null ? 
1 : counts[i]; } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 318779cd2..aa8b46312 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -261,10 +261,9 @@ public class PairHMMIndelErrorModel { final double downsamplingFraction) { final int numHaplotypes = haplotypeMap.size(); - final int readCounts[] = new int[pileup.getNumberOfElements()]; - final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap, readCounts); + final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap); perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction); - return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); + return getDiploidHaplotypeLikelihoods(numHaplotypes, readLikelihoods); } @@ -295,16 +294,13 @@ public class PairHMMIndelErrorModel { final LinkedHashMap haplotypeMap, final ReferenceContext ref, final int eventLength, - final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, - final int[] readCounts) { + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; final LinkedList readList = new LinkedList<>(); final Map readGCPArrayMap = new LinkedHashMap<>(); int readIdx=0; for (PileupElement p: pileup) { - // > 1 when the read is a consensus read representing multiple independent observations - readCounts[readIdx] = p.getRepresentativeCount(); // check if we've already computed 
likelihoods for this pileup element (i.e. for this read at this location) if (perReadAlleleLikelihoodMap.containsPileupElement(p)) { @@ -499,7 +495,7 @@ public class PairHMMIndelErrorModel { // return b1.length; // } - private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { + private static double[] getDiploidHaplotypeLikelihoods(final int numHaplotypes, final double readLikelihoods[][]) { final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix @@ -515,8 +511,7 @@ public class PairHMMIndelErrorModel { continue; final double li = readLikelihoods[readIdx][i]; final double lj = readLikelihoods[readIdx][j]; - final int readCount = readCounts[readIdx]; - haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + MathUtils.LOG_ONE_HALF); + haplotypeLikehoodMatrix[i][j] += MathUtils.approximateLog10SumLog10(li, lj) + MathUtils.LOG_ONE_HALF; } } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java deleted file mode 100644 index 2f8295008..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java +++ /dev/null @@ -1,175 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.ReadFilters; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.HashSet; -import java.util.Set; - -/** - * Emits intervals present in either the original or reduced bam but not the other. - * - *

Input

- *

- * The original and reduced BAM files. - *

- * - *

Output

- *

- * A list of intervals present in one bam but not the other. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -I:original original.bam \
- *   -I:reduced reduced.bam \
- *   -R ref.fasta \
- *   -T AssessReducedCoverage \
- *   -o output.intervals
- * 
- * - * @author ebanks - */ -@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) -@Hidden -public class AssessReducedCoverage extends LocusWalker implements TreeReducible { - - private static final String original = "original"; - private static final String reduced = "reduced"; - - @Output - protected PrintStream out; - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - @Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false) - public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false; - - public void initialize() {} - - public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - if ( tracker == null ) - return null; - - final Set tags = getAllTags(context.getBasePileup()); - return (tags.contains(original) && !tags.contains(reduced)) || - (OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? 
ref.getLocus() : null; - } - - private Set getAllTags(final ReadBackedPileup pileup) { - - final Set tags = new HashSet(10); - - for ( final PileupElement p : pileup ) { - if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() ) - tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags()); - } - - return tags; - } - - public void onTraversalDone(GenomeLoc sum) { - if ( sum != null ) - out.println(sum); - } - - public GenomeLoc reduceInit() { - return null; - } - - public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { - if ( lhs == null ) - return rhs; - - if ( rhs == null ) - return lhs; - - // if contiguous, just merge them - if ( lhs.contiguousP(rhs) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); - - // otherwise, print the lhs and start over with the rhs - out.println(lhs); - return rhs; - } - - public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { - if ( value == null ) - return sum; - - if ( sum == null ) - return value; - - // if contiguous, just merge them - if ( sum.contiguousP(value) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); - - // otherwise, print the sum and start over with the value - out.println(sum); - return value; - } -} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java deleted file mode 100644 index 25f6f874d..000000000 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java +++ /dev/null @@ -1,208 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made 
between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.LocusWalker; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; - -import java.io.PrintStream; -import java.util.List; - -/** - * Emits intervals in which the differences between the original and reduced bam quals are bigger epsilon (unless the quals of - * the reduced bam are above sufficient threshold) - * - *

Input

- *

- * The original and reduced BAM files. - *

- * - *

Output

- *

- * A list of intervals in which the differences between the original and reduced bam quals are bigger epsilon. - *

- * - *

Examples

- *
- * java -Xmx2g -jar GenomeAnalysisTK.jar \
- *   -I:original original.bam \
- *   -I:reduced reduced.bam \
- *   -R ref.fasta \
- *   -T AssessReducedQuals \
- *   -o output.intervals
- * 
- * - * @author ami - */ -@Hidden -public class AssessReducedQuals extends LocusWalker implements TreeReducible { - - private static final String reduced = "reduced"; - private static final int originalQualsIndex = 0; - private static final int reducedQualsIndex = 1; - - @Argument(fullName = "sufficientQualSum", shortName = "sufficientQualSum", doc = "When a reduced bam qual sum is above this threshold, it passes even without comparing to the non-reduced bam ", required = false) - public int sufficientQualSum = 600; - - @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > (epsilon * Quals_original_bam) we output this interval", required = false) - public double qual_epsilon = 0.10; - - @Argument(fullName = "exclude_low_mq", shortName = "excludeMQ", doc = "ignore reads with mapping quality below this number", required = false) - public int excludeMQ = 0; - - @Output - protected PrintStream out; - - public void initialize() { - if ( qual_epsilon < 0.0 || qual_epsilon > 1.0 ) - throw new UserException.BadArgumentValue("qual_epsilon", "must be a number between 0 and 1"); - } - - @Override - public boolean includeReadsWithDeletionAtLoci() { return true; } - - @Override - public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) - return null; - - boolean reportLocus; - final int[] quals = getPileupQuals(context.getBasePileup()); - final int epsilon = MathUtils.fastRound(quals[originalQualsIndex] * qual_epsilon); - final int calcOriginalQuals = Math.min(quals[originalQualsIndex], sufficientQualSum); - final int calcReducedQuals = Math.min(quals[reducedQualsIndex], sufficientQualSum); - final int originalReducedQualDiff = calcOriginalQuals - calcReducedQuals; - reportLocus = originalReducedQualDiff > epsilon || originalReducedQualDiff < -1 * epsilon; - - return reportLocus ? 
ref.getLocus() : null; - } - - /** - * Get the quals separated by version and strand - * @param readPileup the pileup - * @return 2x2 array with sum of quals separated by version in 1st dimension and strand in the 2nd - */ - private int[] getPileupQuals(final ReadBackedPileup readPileup) { - - final int[] quals = new int[2]; - - for ( final PileupElement p : readPileup ) { - final List tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags(); - if ( isGoodRead(p) ) { - final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount(); - final int tagIndex = getTagIndex(tags); - quals[tagIndex] += tempQual; - } - } - - return quals; - } - - private boolean isGoodRead(final PileupElement p) { - return !p.isDeletion() && (int)p.getQual() >= 15 && p.getMappingQual() >= excludeMQ; - } - - private int getTagIndex(final List tags) { - return tags.contains(reduced) ? 1 : 0; - } - - @Override - public void onTraversalDone(GenomeLoc sum) { - if ( sum != null ) - out.println(sum); - } - - @Override - public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { - if ( lhs == null ) - return rhs; - - if ( rhs == null ) - return lhs; - - // if contiguous, just merge them - if ( lhs.contiguousP(rhs) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); - - // otherwise, print the lhs and start over with the rhs - out.println(lhs); - return rhs; - } - - @Override - public GenomeLoc reduceInit() { - return null; - } - - @Override - public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { - if ( value == null ) - return sum; - - if ( sum == null ) - return value; - - // if contiguous, just merge them - if ( sum.contiguousP(value) ) - return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); - - // otherwise, print the sum and start over with the value - out.println(sum); - return value; - } -} diff --git 
a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 56f7e8257..325237d05 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -48,12 +48,10 @@ package org.broadinstitute.sting.utils.recalibration; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.compression.reducereads.ReduceReads; import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.recalibration.covariates.*; import org.broadinstitute.sting.utils.BaseUtils; @@ -1063,20 +1061,4 @@ public class RecalUtils { private static RecalDatum createDatumObject(final byte reportedQual, final double isError) { return new RecalDatum(1, isError, reportedQual); } - - /** - * Checks for invalid BAMs that are being used with BQSR and fails with a UserException if it finds one - * - * @param headers sam file headers being passed into the GATK engine - * @param allowBqsrOnReducedBams should we allow BQSR on reduced bams? 
- */ - public static void checkForInvalidRecalBams(final List headers, final boolean allowBqsrOnReducedBams) { - // for now, the only check we make is against reduced bams - if ( !allowBqsrOnReducedBams ) { - for ( final SAMFileHeader header : headers ) { - if ( header.getProgramRecord(ReduceReads.PROGRAM_RECORD_NAME) != null ) - throw new UserException.BadInput("base quality score recalibration should absolutely not be run on reduced BAM files! Please run ReduceReads only after BQSR has been performed"); - } - } - } } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java index fec83e1a8..b1c280748 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/annotator/RankSumUnitTest.java @@ -46,8 +46,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator; -import org.broadinstitute.sting.gatk.walkers.compression.reducereads.*; -import org.broadinstitute.sting.gatk.walkers.compression.reducereads.BaseCounts; import org.broadinstitute.sting.utils.MannWhitneyU; import org.testng.Assert; import org.testng.annotations.BeforeClass; @@ -122,13 +120,15 @@ public class RankSumUnitTest { final List dist2 = new ArrayList<>(distribution2); if ( numToReduceIn2 > 0 ) { - final org.broadinstitute.sting.gatk.walkers.compression.reducereads.BaseCounts counts = new BaseCounts(); + int counts = 0; + int quals = 0; + for ( int i = 0; i < numToReduceIn2; i++ ) { - final int value = dist2.remove(0); - counts.incr(BaseIndex.A, (byte)value, 0, false); + counts++; + quals += dist2.remove(0); } - final int qual = (int)counts.averageQualsOfBase(BaseIndex.A); + final int qual = quals / counts; for ( int i = 0; i < numToReduceIn2; i++ ) dist2.add(qual); } diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java deleted file mode 100644 index f988471a0..000000000 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java +++ /dev/null @@ -1,201 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - - -import org.broadinstitute.sting.BaseTest; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * Basic unit test for BaseCounts in reduced reads - */ -public class BaseCountsUnitTest extends BaseTest { - - private class BaseCountsTest { - public String bases; - public byte mostCountBase; - public int mostCommonCount; - - private BaseCountsTest(String bases, char mostCountBase, int mostCommonCount) { - this.mostCommonCount = mostCommonCount; - this.mostCountBase = (byte)mostCountBase; - this.bases = bases; - } - } - - @DataProvider(name = "counting") - public Object[][] createCountingData() { - List params = new ArrayList(); - - params.add(new BaseCountsTest("A", 'A', 1 )); - params.add(new BaseCountsTest("AA", 'A', 2 )); - params.add(new BaseCountsTest("AC", 'A', 1 )); - params.add(new BaseCountsTest("AAC", 'A', 2 )); - params.add(new BaseCountsTest("AAA", 'A', 3 )); - params.add(new BaseCountsTest("AAAN", 'A', 3 )); - params.add(new BaseCountsTest("AAANNNN", 'N', 4 )); - params.add(new BaseCountsTest("AACTG", 'A', 2 )); - params.add(new BaseCountsTest("D", 'D', 1 )); - params.add(new BaseCountsTest("DDAAD", 'D', 3)); - params.add(new BaseCountsTest("", (char)BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0 )); - params.add(new BaseCountsTest("AAIIIAI", 'I', 4 )); - - List params2 = new ArrayList(); - for ( BaseCountsTest x : params ) params2.add(new Object[]{x}); - return params2.toArray(new Object[][]{}); - } - - @Test(dataProvider = "counting", enabled = true) - public void testCounting(BaseCountsTest params) { - BaseCounts counts = new BaseCounts(); - - for ( byte base : params.bases.getBytes() ) - counts.incr(base); - - String name = String.format("Test-%s", params.bases); - Assert.assertEquals(counts.totalCount(), params.bases.length(), name); - 
Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name); - Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name); - - // test the static creation - final int[] countsArray = new int[] { counts.countOfBase(BaseIndex.A), counts.countOfBase(BaseIndex.C), - counts.countOfBase(BaseIndex.G), counts.countOfBase(BaseIndex.T)}; - final BaseCounts countsFromArray = BaseCounts.createWithCounts(countsArray); - Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A)); - Assert.assertEquals(counts.countOfBase(BaseIndex.C), countsFromArray.countOfBase(BaseIndex.C)); - Assert.assertEquals(counts.countOfBase(BaseIndex.G), countsFromArray.countOfBase(BaseIndex.G)); - Assert.assertEquals(counts.countOfBase(BaseIndex.T), countsFromArray.countOfBase(BaseIndex.T)); - Assert.assertEquals(ACGTcounts(counts), countsFromArray.totalCount()); - - // test addition - counts.add(countsFromArray); - Assert.assertEquals(counts.countOfBase(BaseIndex.A), 2 * countsFromArray.countOfBase(BaseIndex.A)); - Assert.assertEquals(counts.countOfBase(BaseIndex.C), 2 * countsFromArray.countOfBase(BaseIndex.C)); - Assert.assertEquals(counts.countOfBase(BaseIndex.G), 2 * countsFromArray.countOfBase(BaseIndex.G)); - Assert.assertEquals(counts.countOfBase(BaseIndex.T), 2 * countsFromArray.countOfBase(BaseIndex.T)); - Assert.assertEquals(ACGTcounts(counts), 2 * countsFromArray.totalCount()); - - // test subtraction - counts.sub(countsFromArray); - Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A)); - Assert.assertEquals(counts.countOfBase(BaseIndex.C), countsFromArray.countOfBase(BaseIndex.C)); - Assert.assertEquals(counts.countOfBase(BaseIndex.G), countsFromArray.countOfBase(BaseIndex.G)); - Assert.assertEquals(counts.countOfBase(BaseIndex.T), countsFromArray.countOfBase(BaseIndex.T)); - Assert.assertEquals(ACGTcounts(counts), 
countsFromArray.totalCount()); - - // test decrementing - if ( counts.countOfBase(BaseIndex.A) > 0 ) { - counts.decr((byte)'A'); - Assert.assertEquals(counts.countOfBase(BaseIndex.A), countsFromArray.countOfBase(BaseIndex.A) - 1); - } - } - - private static int ACGTcounts(final BaseCounts baseCounts) { - return baseCounts.totalCountWithoutIndels() - baseCounts.countOfBase(BaseIndex.N); - } - - - ////////////////////////////////// - // TEST FOR QUALS IN BASECOUNTS // - ////////////////////////////////// - - private class BaseCountsQualsTest { - public final List quals; - - private BaseCountsQualsTest(final List quals) { - this.quals = quals; - } - } - - @DataProvider(name = "quals") - public Object[][] createQualsData() { - List tests = new ArrayList(); - - final int[] quals = new int[]{ 0, 5, 10, 15, 20, 30, 40, 50 }; - - for ( final int qual1 : quals ) { - for ( final int qual2 : quals ) { - for ( final int qual3 : quals ) { - tests.add(new Object[]{new BaseCountsQualsTest(Arrays.asList(qual1, qual2, qual3))}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "quals", enabled = true) - public void testQuals(BaseCountsQualsTest test) { - BaseCounts counts = new BaseCounts(); - - for ( int qual : test.quals ) - counts.incr(BaseIndex.A, (byte)qual, 20, false); - - final int actualSum = (int)counts.getSumQuals((byte)'A'); - final int expectedSum = qualSum(test.quals); - Assert.assertEquals(actualSum, expectedSum); - - final int actualAverage = (int)counts.averageQuals((byte)'A'); - Assert.assertEquals(actualAverage, expectedSum / test.quals.size()); - - // test both proportion methods - Assert.assertEquals(counts.baseCountProportion(BaseIndex.A), counts.baseCountProportion((byte)'A')); - } - - private static int qualSum(final List quals) { - int sum = 0; - for ( final int qual : quals ) - sum += qual; - return sum; - } -} \ No newline at end of file diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java deleted file mode 100644 index 4f5b7477c..000000000 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java +++ /dev/null @@ -1,214 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.MathUtils; -import org.testng.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.List; - -public class HeaderElementUnitTest extends BaseTest { - - private class HETest { - public byte base, baseQual, insQual, delQual; - public int MQ; - public boolean isClip; - - private HETest(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int MQ, final boolean isClip) { - this.base = base; - this.baseQual = baseQual; - this.insQual = insQual; - this.delQual = delQual; - this.MQ = MQ; - this.isClip = isClip; - } - } - - private static final byte byteA = (byte)'A'; - private static final byte byte10 = (byte)10; - private static final byte byte20 = (byte)20; - private static final int minBaseQual = 20; - private static final int minMappingQual = 20; - - @DataProvider(name = "data") - public Object[][] createData() { - List tests = new ArrayList(); - - tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 20, false)}); - tests.add(new Object[]{new HETest(byteA, byte10, byte20, byte20, 20, false)}); - tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 10, false)}); - tests.add(new Object[]{new HETest(byteA, byte20, byte20, byte20, 20, true)}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "data", enabled = true) - public void testHE(HETest test) { - - HeaderElement headerElement = new HeaderElement(1000, 0); - - // first test that if we add and then remove it, we have no data - headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip, false); - headerElement.addInsertionToTheRight(); - headerElement.removeBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, 
minBaseQual, minMappingQual, test.isClip, false); - headerElement.removeInsertionToTheRight(); - testHeaderIsEmpty(headerElement); - - // now, test that the data was added as expected - for ( int i = 0; i < 10; i++ ) - headerElement.addBase(test.base, test.baseQual, test.insQual, test.delQual, test.MQ, minBaseQual, minMappingQual, test.isClip, false); - testHeaderData(headerElement, test); - - // test the insertion adding functionality - for ( int i = 0; i < 10; i++ ) - headerElement.addInsertionToTheRight(); - Assert.assertEquals(headerElement.numInsertionsToTheRight(), 10); - } - - private void testHeaderIsEmpty(final HeaderElement headerElement) { - Assert.assertFalse(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS)); - Assert.assertFalse(headerElement.hasConsensusData(SlidingWindow.ConsensusType.FILTERED)); - Assert.assertFalse(headerElement.hasInsertionToTheRight()); - Assert.assertTrue(headerElement.isEmpty()); - } - - private void testHeaderData(final HeaderElement headerElement, final HETest test) { - Assert.assertEquals(headerElement.isVariantFromSoftClips(), test.isClip); - Assert.assertFalse(headerElement.isEmpty()); - Assert.assertFalse(headerElement.hasInsertionToTheRight()); - Assert.assertEquals(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS), test.MQ >= minMappingQual); - Assert.assertEquals(headerElement.hasConsensusData(SlidingWindow.ConsensusType.FILTERED), test.MQ < minMappingQual); - Assert.assertEquals(headerElement.getBaseCounts(headerElement.hasConsensusData(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS) ? 
SlidingWindow.ConsensusType.POSITIVE_CONSENSUS : SlidingWindow.ConsensusType.FILTERED).getRMS(), (double)test.MQ); - Assert.assertFalse(headerElement.isVariantFromMismatches(0.05, 0.05)); - Assert.assertEquals(headerElement.isVariant(0.05, 0.05, 0.05), test.isClip); - } - - - private class AllelesTest { - public final int[] counts; - public final double pvalue; - - private AllelesTest(final int[] counts, final double pvalue) { - this.counts = counts; - this.pvalue = pvalue; - } - } - - @DataProvider(name = "alleles") - public Object[][] createAllelesData() { - List tests = new ArrayList<>(); - - final int[] counts = new int[]{ 0, 5, 10, 15, 20 }; - final double [] pvalues = new double[]{ 0.0, 0.01, 0.05, 0.20, 1.0 }; - - for ( final int countA : counts ) { - for ( final int countC : counts ) { - for ( final int countG : counts ) { - for ( final int countT : counts ) { - for ( final int countD : counts ) { - for ( final double pvalue : pvalues ) { - tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, pvalue)}); - } - } - } - } - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "alleles", enabled = true) - public void testAlleles(AllelesTest test) { - - HeaderElement headerElement = new HeaderElement(1000, 0); - for ( int i = 0; i < test.counts.length; i++ ) { - final BaseIndex base = BaseIndex.values()[i]; - for ( int j = 0; j < test.counts[i]; j++ ) - headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false, false); - } - - final int nAllelesSeen = headerElement.getNumberOfBaseAlleles(test.pvalue, test.pvalue); - final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.pvalue); - - Assert.assertEquals(nAllelesSeen, nAllelesExpected); - } - - private static int calculateExpectedAlleles(final int[] counts, final double targetPvalue) { - int total = 0; - for ( final int count : counts ) { - total += count; - } - - int result = 0; - for ( int index 
= 0; index < counts.length; index++ ) { - final int count = counts[index]; - if ( count == 0 ) - continue; - - final boolean isSignificant; - if ( count <= HeaderElement.MIN_COUNT_FOR_USING_PVALUE ) { - isSignificant = MathUtils.binomialCumulativeProbability(total, 0, count) > targetPvalue; - } else { - isSignificant = (count >= targetPvalue * total); - } - - if ( isSignificant ) { - if ( index == BaseIndex.D.index ) - return -1; - result++; - } - } - - return result; - } -} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java deleted file mode 100644 index 067f36d58..000000000 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ /dev/null @@ -1,347 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; -import org.testng.Assert; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -public class ReduceReadsIntegrationTest extends WalkerTest { - final static String REF = b37KGReference; - final static String DBSNP = b37dbSNP132; - final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; - final String DELETION_BAM = validationDataLocation + "filtered_deletion_for_reduce_reads.bam"; - final String STASH_BAM = validationDataLocation + "ReduceReadsStashBug.bam"; - final String STASH_L = " -L 14:73718184-73718284 -L 14:73718294-73718330 -L 14:73718360-73718556"; - final String DIVIDEBYZERO_BAM = validationDataLocation + "ReduceReadsDivideByZeroBug.bam"; - final String DIVIDEBYZERO_L = " -L " + validationDataLocation + "ReduceReadsDivideByZeroBug.intervals"; - final String L = " -L 20:10,100,000-10,120,000 "; - final String COREDUCTION_BAM_A = validationDataLocation + "coreduction.test.A.bam"; - final String COREDUCTION_BAM_B = validationDataLocation + "coreduction.test.B.bam"; - final String COREDUCTION_L = " -L 1:1,853,860-1,854,354 -L 1:1,884,131-1,892,057"; - final String OFFCONTIG_BAM = privateTestDir + "readOffb37contigMT.bam"; - final String HIGH_COVERAGE_BAM = privateTestDir + "NA20313.highCoverageRegion.bam"; - final String HIGH_COVERAGE_L = " -L 1:1650830-1650870"; - final String BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM = privateTestDir + "bothEndsOfPairInVariantRegion.bam"; - final String 
INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM = privateTestDir + "rr-too-many-insertions.bam"; - - final static String emptyFileMd5 = "d41d8cd98f00b204e9800998ecf8427e"; - - protected Pair, List> executeTest(final String name, final WalkerTestSpec spec) { - return executeTest(name, spec, emptyFileMd5); - } - - protected Pair, List> executeTest(final String name, final WalkerTestSpec spec, final String qualsTestMD5) { - final Pair, List> result = super.executeTest(name, spec); - - // perform some Reduce Reads specific testing now - if ( result != null ) { - - // generate a new command-line based on the old one - spec.disableImplicitArgs(); - final String[] originalArgs = spec.getArgsWithImplicitArgs().split(" "); - - final StringBuilder reducedInputs = new StringBuilder(); - for ( final File file : result.getFirst() ) { - reducedInputs.append(" -I:reduced "); - reducedInputs.append(file.getAbsolutePath()); - } - - // the coverage test is a less stricter version of the quals test so we can safely ignore it for now - //final String coverageCommand = createCommandLine("AssessReducedCoverage", originalArgs); - //super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(coverageCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5))); - - // run the quals test - final String qualsCommand = createCommandLine("AssessReducedQuals", originalArgs); - super.executeTest(name + " : QUALS_TEST", new WalkerTestSpec(qualsCommand + reducedInputs.toString(), Arrays.asList(qualsTestMD5))); - } - - return result; - } - - /* - * Generate a new command-line based on the old one - * - * @param walkerName the new walker name to use - * @param originalArgs the original arguments used for the test - * @return the new command line - */ - private String createCommandLine(final String walkerName, final String[] originalArgs) { - - final StringBuilder newArgs = new StringBuilder(); - - for ( int i = 0; i < originalArgs.length; i++ ) { - final String arg = originalArgs[i]; - - if ( 
arg.equals("-T") ) { - newArgs.append("-T "); - newArgs.append(walkerName); - } else if ( arg.startsWith("-I") ) { - newArgs.append("-I:original "); - newArgs.append(originalArgs[++i]); - } else if ( arg.equals("-R") || arg.equals("-L") ) { - newArgs.append(arg); - newArgs.append(" "); - newArgs.append(originalArgs[++i]); - } - - // always add a trailing space - newArgs.append(" "); - } - - newArgs.append("-o %s"); - - return newArgs.toString(); - } - - protected Pair, List> executeTestWithoutAdditionalRRTests(final String name, final WalkerTestSpec spec) { - return super.executeTest(name, spec); - } - - private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns) { - this.RRTest(testName, args, md5, useKnowns, emptyFileMd5); - } - - private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns, final String qualsTestMD5) { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? 
" -known " + DBSNP : "") + " "; - WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5)); - executeTest(testName, spec, qualsTestMD5); - } - - @Test(enabled = true) - public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "0e503f7b79ace4c89d74f0943a0de1c0", false); - } - - @Test(enabled = true) - public void testDefaultCompressionWithKnowns() { - RRTest("testDefaultCompressionWithKnowns ", L, "6db7ce2733d006f8bd61c42a40d23728", true); - } - - private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; - - @Test(enabled = true) - public void testMultipleIntervals() { - RRTest("testMultipleIntervals ", intervals, "207f2c6d3db956e19412a45a231ca367", false, "043b2838c27d8f9580379b54c18ff40a"); - } - - @Test(enabled = true) - public void testMultipleIntervalsWithKnowns() { - RRTest("testMultipleIntervalsWithKnowns ", intervals, "f3b11a8a7673b301e27137936fafc6b6", true, "043b2838c27d8f9580379b54c18ff40a"); - } - - @Test(enabled = true) - public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "dcc3716b3665aa1c2dbe6b22d6534aef", false); - } - - @Test(enabled = true) - public void testHighCompressionWithKnowns() { - RRTest("testHighCompressionWithKnowns ", " -cs 10 -min_pvalue 0.3 -minvar 0.3 -mindel 0.3 " + L, "97ae655bf0e483ea227b1aac67ced024", true); - } - - @Test(enabled = true) - public void testLowCompression() { - RRTest("testLowCompression ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a1377eb922e0b09a03a280b691b0b3ff", false); - } - - @Test(enabled = true) - public void testLowCompressionWithKnowns() { - RRTest("testLowCompressionWithKnowns ", " -cs 30 -min_pvalue 0.001 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, 
"bd7c5b0b210694f364ca6a41f5b89870", true); - } - - @Test(enabled = true) - public void testBadPvalueInput() { - final String cmd = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + "-o %s -min_pvalue -0.01"; - WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, UserException.BadArgumentValue.class); - executeTest("testBadPvalueInput", spec); - } - - @Test(enabled = true) - public void testIndelCompression() { - final String md5 = "9c9305eda5e4e7f22246ec8a4b242c97"; - RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false); - RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true); - } - - @Test(enabled = true) - public void testFilteredDeletionCompression() { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("1bda512143be1016dfaca1f7020b6398")), "4f916da29d91852077f0a2fdbdd2c7f6"); - } - - @Test(enabled = true) - public void testCoReduction() { - String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("58c2bae5a339af2ea3c22a46ce8faa68"))); - } - - @Test(enabled = true) - public void testCoReductionWithKnowns() { - String base = String.format("-T ReduceReads %s --cancer_mode -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s "; - executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("5c251932b49d99a810581e3a6f762878"))); - } - - @Test(enabled = true) - public void testInsertionsAtEdgeOfConsensus() { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s "; - 
executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("c10653a8c21fb32b5cf580d3704b0edd"))); - } - - /** - * Bug reported by Adam where a read that got clipped before actually belongs 2 intervals ahead - * and a subsequent tail leaves only this read in the stash. The next read to come in is in fact - * before (alignment start) than this read, so the TreeSet breaks with a Key out of Range error - * that was freaking hard to catch. - * - * This bam is simplified to replicate the exact bug with the three provided intervals. - */ - @Test(enabled = true) - public void testAddingReadAfterTailingTheStash() { - String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fddbec29d0945afbbb34b42994614c15")), "3eab32c215ba68e75efd5ab7e9f7a2e7"); - } - - /** - * Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get - * filtered out. - */ - @Test(enabled = true) - public void testDivideByZero() { - String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; - // we expect to lose coverage due to the downsampling so don't run the systematic tests - executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("7dfe2647992ce1154db340fc742d523a"))); - } - - /** - * Bug happens when reads are soft-clipped off the contig (usually in the MT). This test guarantees no changes to the upstream code will - * break the current hard-clipping routine that protects reduce reads from such reads. 
- */ - @Test(enabled = true) - public void testReadOffContig() { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s "; - executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("595e5812c37189930cae93e45765def4"))); - } - - /** - * Confirm that if both ends of pair are in same variant region, compressed names of both ends of pair are the same. - */ - @Test(enabled = true) - public void testPairedReadsInVariantRegion() { - String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) + - " -o %s --downsample_coverage 250 -dcov 50 "; - executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b005727119eee27995705959a637085e")), "2af063d1bd3c322b03405dbb3ecf59a9"); - } - - /** - * Confirm that this bam does not fail when multi-sample mode is enabled. The provided example is tricky and used to cause - * us to exception out in the code. 
- */ - @Test(enabled = true) - public void testMultiSampleDoesNotFailWithFlag() { - String cmd = "-T ReduceReads --cancer_mode -npt -R " + b37KGReference + " -I " + privateTestDir + "rr_multisample.bam -o /dev/null"; - executeTestWithoutAdditionalRRTests("testMultiSampleDoesNotFailWithFlag", new WalkerTestSpec(cmd, 0, Collections.emptyList())); - } - - /** - * Confirm that this bam fails when multi-sample mode is not enabled - */ - @Test(enabled = true) - public void testMultiSampleFailsWithoutFlag() { - String cmd = "-T ReduceReads -npt -R " + b37KGReference + " -I " + privateTestDir + "rr_multisample.bam -o /dev/null"; - executeTestWithoutAdditionalRRTests("testMultiSampleDoesNotFailWithFlag", new WalkerTestSpec(cmd, 0, UserException.BadInput.class)); - } - - /** - * Confirm that compression is not capping coverage counts to max byte - */ - @Test(enabled = true) - public void testCompressionWorksForHighDepth() { - final String base = String.format("-T ReduceReads -npt -R %s -I %s %s", b37KGReference, HIGH_COVERAGE_BAM, HIGH_COVERAGE_L) + " -o %s"; - final File outputBam = executeTestWithoutAdditionalRRTests("testCompressionWorksForHighDepth", - new WalkerTestSpec(base, 1, Arrays.asList(""))).first.get(0); // No MD5s; we only want to check the coverage - - boolean sawHighCoveragePosition = false; - final SAMFileReader reader = new SAMFileReader(outputBam); - reader.setSAMRecordFactory(new GATKSamRecordFactory()); - - for ( final SAMRecord rawRead : reader ) { - final GATKSAMRecord read = (GATKSAMRecord)rawRead; - read.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, rawRead.getByteArrayAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG)); - - if ( ! 
read.isReducedRead() ) - continue; - - final int[] decodedCounts = read.getReducedReadCounts(); - for ( final int count : decodedCounts ) { - if ( count > Byte.MAX_VALUE ) { - sawHighCoveragePosition = true; - break; - } - } - - if ( sawHighCoveragePosition ) - break; - } - - reader.close(); - - Assert.assertTrue(sawHighCoveragePosition, "No positions were found with coverage over max byte (127); the coverage is incorrectly being capped somewhere!"); - } -} - diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java deleted file mode 100644 index 6032affa7..000000000 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java +++ /dev/null @@ -1,214 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.*; -import net.sf.samtools.SAMFileHeader; -import org.broad.tribble.Feature; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.variantcontext.Allele; -import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextBuilder; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Random; - - -public class ReduceReadsUnitTest extends BaseTest { - - Random random = new Random(987743); - Object2LongOpenHashMap hash = new Object2LongOpenHashMap(); - long nextNumber = 0L; - - /** - * Combinatorial unit test data provider example. 
- * - * Creates data for testMyData test function, containing two arguments, start and size at each value - * - * @return Object[][] for testng DataProvider - */ - @DataProvider(name = "ReadNameProvider") - public Object[][] readNameProvider() { - final int readNameLength = 4; - final int nReads = 100000; - final int charVariety = 20; - ObjectArrayList tests = new ObjectArrayList(); - ObjectOpenHashSet truthSet = new ObjectOpenHashSet(); - byte[] bytes = new byte[readNameLength]; - for ( int i = 0; i tests = new ObjectArrayList(); - - // test single - tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10))}); - - // test multiple at one position - tests.add(new Object[]{1, 1, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_10_2))}); - - // test multiple - tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); - - // test indel not used - tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_1_40))}); - tests.add(new Object[]{3, 3, read1, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(indel_2_40))}); - - // test read clears - tests.add(new Object[]{3, 0, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); - tests.add(new Object[]{4, 1, read2, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))}); - tests.add(new Object[]{3, 0, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30))}); - tests.add(new Object[]{4, 0, read3, 
Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10))}); - tests.add(new Object[]{4, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_3_10))}); - tests.add(new Object[]{5, 1, read3, Arrays.asList(makeRefMetaDataTracker(snp_1_10), makeRefMetaDataTracker(snp_1_20), makeRefMetaDataTracker(snp_1_30), makeRefMetaDataTracker(snp_2_10), makeRefMetaDataTracker(snp_3_10))}); - - return tests.toArray(new Object[][]{}); - } - - private final RefMetaDataTracker makeRefMetaDataTracker(final Feature feature) { - final List x = new ArrayList(); - x.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, feature, "known")); - final RODRecordList rods = new RODRecordListImpl("known", x, genomeLocParser.createGenomeLoc(feature.getChr(), feature.getStart(), feature.getEnd())); - return new RefMetaDataTracker(Arrays.asList(rods)); - } - - @Test(dataProvider = "PopulateKnownsProvider") - public void testPopulateKnowns(final int expectedSizeBeforeClear, final int expectedSizeAfterClear, final GATKSAMRecord read, final List trackers) { - final ReduceReads rr = new ReduceReads(); - RodBinding.resetNameCounter(); - rr.known = Arrays.>asList(new RodBinding(VariantContext.class, "known")); - rr.knownSnpPositions = new ObjectAVLTreeSet(); - - final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); - engine.setGenomeLocParser(genomeLocParser); - rr.setToolkit(engine); - - for ( final RefMetaDataTracker tracker : trackers ) - rr.populateKnownSNPs(tracker); - Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeBeforeClear); - - rr.clearStaleKnownPositions(read); - Assert.assertEquals(rr.knownSnpPositions.size(), expectedSizeAfterClear); - } - -} \ No newline at end of file diff --git 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java deleted file mode 100644 index c49a671e2..000000000 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java +++ /dev/null @@ -1,964 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. 
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.compression.reducereads; - -import it.unimi.dsi.fastutil.objects.*; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.UnvalidatingGenomeLoc; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - -public class SlidingWindowUnitTest extends BaseTest { - - private static final int variantRegionLength = 1000; - private static final int globalStartPosition = 1000000; - - private static boolean[] createBitset(final List locs) { - final boolean[] variantRegionBitset = new boolean[variantRegionLength]; - for ( FinishedGenomeLoc loc : locs ) { - final int stop = loc.getStop() - globalStartPosition; - for ( int i = loc.getStart() - globalStartPosition; i <= stop; i++ ) - variantRegionBitset[i] = true; - } - return variantRegionBitset; - } - - ////////////////////////////////////////////////////////////////////////////////////// - //// Test for leading softclips immediately followed by an insertion in the CIGAR //// - 
////////////////////////////////////////////////////////////////////////////////////// - - @Test(enabled = true) - public void testLeadingSoftClipThenInsertion() { - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 10); - read.setReadBases(Utils.dupBytes((byte) 'A', 10)); - read.setBaseQualities(Utils.dupBytes((byte)30, 10)); - read.setMappingQuality(30); - read.setCigarString("2S2I6M"); - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 1); - slidingWindow.addRead(read); - slidingWindow.close(null); - } - - @Test(enabled = true) - public void testLeadingHardClipThenInsertion() { - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 8); - read.setReadBases(Utils.dupBytes((byte) 'A', 8)); - read.setBaseQualities(Utils.dupBytes((byte)30, 8)); - read.setMappingQuality(30); - read.setCigarString("2H2I6M"); - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - slidingWindow.addRead(read); - slidingWindow.close(null); - } - - ////////////////////////////////////////////////////////////////////////////////////// - //// This section tests the findVariantRegions() method and related functionality //// - ////////////////////////////////////////////////////////////////////////////////////// - - private static final FinishedGenomeLoc loc90to95 = new FinishedGenomeLoc("1", 0, 1000090, 1000095, false); - private static final FinishedGenomeLoc loc96to99 = new FinishedGenomeLoc("1", 0, 1000096, 1000099, false); - private static final FinishedGenomeLoc loc100to110 = new FinishedGenomeLoc("1", 0, 1000100, 1000110, false); - private static final FinishedGenomeLoc loc999 = new FinishedGenomeLoc("1", 0, 1000999, 1000999, false); - - private class FindVariantRegionsTest { - public List locs, expectedResult; - public boolean[] variantRegionBitset; - - 
private FindVariantRegionsTest(final List locs) { - this.locs = locs; - this.expectedResult = locs; - variantRegionBitset = createBitset(locs); - } - - private FindVariantRegionsTest(final List locs, final List expectedResult) { - this.locs = locs; - this.expectedResult = expectedResult; - variantRegionBitset = createBitset(locs); - } - } - - @DataProvider(name = "findVariantRegions") - public Object[][] createFindVariantRegionsData() { - List tests = new ArrayList(); - - tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95))}); - tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95, loc100to110))}); - tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95, loc96to99, loc100to110), Arrays.asList(new FinishedGenomeLoc("1", 0, 1000090, 1000110, false)))}); - tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc90to95, loc999))}); - tests.add(new Object[]{new FindVariantRegionsTest(Arrays.asList(loc999))}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "findVariantRegions", enabled = true) - public void testFindVariantRegions(FindVariantRegionsTest test) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition); - final CompressionStash locs = slidingWindow.findVariantRegions(0, variantRegionLength, test.variantRegionBitset, true); - int index = 0; - for ( final FinishedGenomeLoc loc : locs ) { - Assert.assertTrue(loc.equals(test.expectedResult.get(index++))); - } - } - - @Test(enabled = true) - public void testNoClosingRegions() { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition); - final CompressionStash locs = slidingWindow.findVariantRegions(0, variantRegionLength, createBitset(Arrays.asList(loc90to95, loc999)), false); - Assert.assertEquals(locs.size(), 1); - Assert.assertEquals(locs.iterator().next(), loc90to95); - } - - - ///////////////////////////////////////////////////////////////////////////// 
- //// This section tests the markSites() method and related functionality //// - ///////////////////////////////////////////////////////////////////////////// - - @Test(enabled = true) - public void testMarkedSitesClass() { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition); - final SlidingWindow.MarkedSites markedSites = slidingWindow.new MarkedSites(); - - markedSites.updateRegion(100, 100); - Assert.assertEquals(markedSites.getStartLocation(), 100); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100); - - markedSites.updateRegion(300, 100); - Assert.assertEquals(markedSites.getStartLocation(), 300); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100); - - markedSites.getVariantSiteBitSet()[10] = true; - markedSites.updateRegion(290, 100); - Assert.assertEquals(markedSites.getStartLocation(), 290); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100); - Assert.assertFalse(markedSites.getVariantSiteBitSet()[10]); - - markedSites.getVariantSiteBitSet()[20] = true; - markedSites.updateRegion(290, 100); - Assert.assertEquals(markedSites.getStartLocation(), 290); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100); - Assert.assertTrue(markedSites.getVariantSiteBitSet()[20]); - - markedSites.updateRegion(300, 100); - Assert.assertEquals(markedSites.getStartLocation(), 300); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 100); - - markedSites.getVariantSiteBitSet()[95] = true; - markedSites.updateRegion(390, 20); - Assert.assertEquals(markedSites.getStartLocation(), 390); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 20); - Assert.assertTrue(markedSites.getVariantSiteBitSet()[5]); - - markedSites.updateRegion(340, 60); - Assert.assertEquals(markedSites.getStartLocation(), 340); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 60); - - markedSites.getVariantSiteBitSet()[20] = true; - markedSites.updateRegion(350, 
60); - Assert.assertEquals(markedSites.getStartLocation(), 350); - Assert.assertEquals(markedSites.getVariantSiteBitSet().length, 60); - Assert.assertTrue(markedSites.getVariantSiteBitSet()[10]); - } - - @Test(enabled = true) - public void testMarkVariantRegion() { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition); - slidingWindow.getMarkedSitesForTesting().updateRegion(100, 100); - - slidingWindow.markVariantRegion(40); - Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 21); - - slidingWindow.markVariantRegion(5); - Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 37); - - slidingWindow.markVariantRegion(95); - Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 52); - } - - private static int countTrueBits(final boolean[] bitset) { - int count = 0; - for ( final boolean bit : bitset ) { - if ( bit ) - count++; - } - return count; - } - - @Test(enabled = true) - public void testMarkingRegionInCancerMode() { - - final int contextSize = 10; - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, contextSize, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - slidingWindow.addRead(createSimpleRead("1", 0, 34, 75)); - slidingWindow.addRead(createSimpleRead("2", 0, 97, 73)); - slidingWindow.addRead(createSimpleRead("3", 0, 98, 75)); - slidingWindow.addRead(createSimpleRead("4", 0, 98, 75)); - slidingWindow.addRead(createSimpleRead("5", 0, 98, 75)); - - final CompressionStash regions = new CompressionStash(); - regions.add(new FinishedGenomeLoc("1", 0, 89, 109, true)); - - slidingWindow.closeVariantRegions(regions, null, false); - Assert.assertEquals(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet().length, 76 + contextSize); - } - - private GATKSAMRecord createSimpleRead(final String name, 
final int refIndex, final int alignmentStart, final int length) { - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length); - read.setReadBases(Utils.dupBytes((byte) 'A', length)); - read.setBaseQualities(Utils.dupBytes((byte) 30, length)); - read.setMappingQuality(60); - return read; - } - - - ///////////////////////////////////////////////////////////////// - //// This section tests the consensus creation functionality //// - ///////////////////////////////////////////////////////////////// - - private static final int readLength = 100; - private static final int testRegionSize = 1000; - private final ObjectList basicReads = new ObjectArrayList(20); - private IndexedFastaSequenceFile seq; - private SAMFileHeader header; - - @BeforeClass - public void setup() throws FileNotFoundException { - seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); - header = ArtificialSAMUtils.createArtificialSamHeader(seq.getSequenceDictionary()); - - final int readFrequency = 20; - - basicReads.clear(); - for ( int i = 0; i < testRegionSize; i += readFrequency ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition + i, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(i % 40 == 20); - basicReads.add(read); - } - } - - private class ConsensusCreationTest { - public final int expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage; - public final List myReads = new ArrayList(20); - public final String description; - - private ConsensusCreationTest(final List locs, final boolean readsShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression, final int 
expectedNumberOfReadsAtDeepCoverage) { - this.expectedNumberOfReads = expectedNumberOfReads; - this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression; - this.expectedNumberOfReadsAtDeepCoverage = expectedNumberOfReadsAtDeepCoverage; - this.description = String.format("%d %d %d %b %b", expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage, readsShouldBeLowQuality, variantBaseShouldBeLowQuality); - - // first, add the basic reads to the collection - myReads.addAll(basicReads); - - // then add the permuted reads - for ( final GenomeLoc loc : locs ) - myReads.add(createVariantRead(loc, readsShouldBeLowQuality, variantBaseShouldBeLowQuality, CigarOperator.M)); - } - - private ConsensusCreationTest(final List locs, final CigarOperator operator, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression, final int expectedNumberOfReadsAtDeepCoverage) { - this.expectedNumberOfReads = expectedNumberOfReads; - this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression; - this.expectedNumberOfReadsAtDeepCoverage = expectedNumberOfReadsAtDeepCoverage; - this.description = String.format("%s %d %d %d", operator.toString(), expectedNumberOfReads, expectedNumberOfReadsWithHetCompression, expectedNumberOfReadsAtDeepCoverage); - - // first, add the basic reads to the collection - myReads.addAll(basicReads); - - // then add the permuted reads - for ( final GenomeLoc loc : locs ) - myReads.add(createVariantRead(loc, false, false, operator)); - } - - public String toString() { return description; } - - private GATKSAMRecord createVariantRead(final GenomeLoc loc, final boolean readShouldBeLowQuality, - final boolean variantBaseShouldBeLowQuality, final CigarOperator operator) { - - final int startPos = loc.getStart() - 50; - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "myRead" + startPos, 0, startPos, readLength); - - final 
byte[] bases = Utils.dupBytes((byte) 'A', readLength); - // create a mismatch if requested - if ( operator == CigarOperator.M ) - bases[50] = 'C'; - read.setReadBases(bases); - - final byte[] baseQuals = Utils.dupBytes((byte) 30, readLength); - if ( variantBaseShouldBeLowQuality ) - baseQuals[50] = (byte)10; - read.setBaseQualities(baseQuals); - final byte mappingQual = readShouldBeLowQuality ? (byte)10 : (byte)30; - read.setMappingQuality(mappingQual); - - if ( operator != CigarOperator.M ) { - final List elements = new ArrayList(3); - elements.add(new CigarElement(operator == CigarOperator.D ? 50 : 51, CigarOperator.M)); - elements.add(new CigarElement(1, operator)); - elements.add(new CigarElement(operator == CigarOperator.D ? 50 : 48, CigarOperator.M)); - read.setCigar(new Cigar(elements)); - } - - return read; - } - } - - private static final GenomeLoc loc290 = new UnvalidatingGenomeLoc("1", 0, 1000290, 1000290); - private static final GenomeLoc loc295 = new UnvalidatingGenomeLoc("1", 0, 1000295, 1000295); - private static final GenomeLoc loc309 = new UnvalidatingGenomeLoc("1", 0, 1000309, 1000309); - private static final GenomeLoc loc310 = new UnvalidatingGenomeLoc("1", 0, 1000310, 1000310); - private static final GenomeLoc loc320 = new UnvalidatingGenomeLoc("1", 0, 1000320, 1000320); - private static final GenomeLoc loc1100 = new UnvalidatingGenomeLoc("1", 0, 1001100, 1001100); - - private static final int DEEP_COVERAGE_ITERATIONS = 100; - - @DataProvider(name = "ConsensusCreation") - public Object[][] createConsensusCreationTestData() { - List tests = new ArrayList(); - - // test high quality reads and bases - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, false, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, false, 11, 8, 7 + DEEP_COVERAGE_ITERATIONS)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 12, 12, 4 + (8 * 
DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc320), false, false, 13, 12, 6 + (6 * DEEP_COVERAGE_ITERATIONS))}); - - // test low quality reads - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), true, false, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), true, false, 3, 3, 3)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), true, false, 3, 3, 3)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), true, false, 3, 3, 3)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), true, false, 3, 3, 3)}); - - // test low quality bases - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), false, true, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), false, true, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, true, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, true, 2, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, true, 2, 2, 2)}); - - // test mixture - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), true, false, 3, 3, 3)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 2, 2, 2)}); - - // test I/D operators - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 11, 11, 4 + (7 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new 
ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.D, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.D, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.D, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.I, 11, 11, 4 + (7 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), CigarOperator.I, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), CigarOperator.I, 12, 12, 4 + (8 * DEEP_COVERAGE_ITERATIONS))}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), CigarOperator.I, 13, 13, 4 + (9 * DEEP_COVERAGE_ITERATIONS))}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ConsensusCreation", enabled = true) - public void testConsensusCreationTest(ConsensusCreationTest test) { - final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet(); - - // test WITHOUT het compression - SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : test.myReads ) - slidingWindow.addRead(read); - Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty - Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads); - - // test WITH het compression at KNOWN sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : test.myReads ) - slidingWindow.addRead(read); - 
for ( int i = 0; i < 1200; i++ ) - knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); - result = slidingWindow.close(knownSNPs); - Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); - - // test WITH het compression at ALL sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : test.myReads ) - slidingWindow.addRead(read); - result = slidingWindow.close(null); - Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); - - // test with deep coverage - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 0, ReduceReads.DownsampleStrategy.Normal, false); - for ( int i = 0; i < DEEP_COVERAGE_ITERATIONS; i++ ) { - for ( final GATKSAMRecord read : test.myReads ) { - final GATKSAMRecord copy = ArtificialSAMUtils.createArtificialRead(header, read.getReadName() + "_" + (i+1), 0, read.getAlignmentStart(), readLength); - copy.setReadBases(read.getReadBases()); - copy.setBaseQualities(read.getBaseQualities()); - copy.setMappingQuality(read.getMappingQuality()); - copy.setReadNegativeStrandFlag(read.getReadNegativeStrandFlag()); - if ( read.getCigar() != null ) - copy.setCigar(read.getCigar()); - slidingWindow.addRead(copy); - } - } - result = slidingWindow.close(null); - Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsAtDeepCoverage); - } - - @Test - public void testConsensusCreationForMultiallelic() { - - final int totalNumReads = 7; - final ObjectList myReads = new ObjectArrayList(totalNumReads); - - for ( int i = 0; i < totalNumReads; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, readLength); - 
read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(false); - - final char base = i < totalNumReads - 2 ? 'A' : ( i == totalNumReads - 2 ? 'C' : 'G'); - read.setReadBases(Utils.dupBytes((byte) base, readLength)); - - myReads.add(read); - } - - final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet(); - - // test WITHOUT het compression - SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty - Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all - - // test WITH het compression at KNOWN sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - for ( int i = 0; i < readLength; i++ ) - knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); - result = slidingWindow.close(knownSNPs); - Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all - - // test WITH het compression at ALL sites - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - result = slidingWindow.close(knownSNPs); - Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all - } - - @Test - public void testConsensusCreationForInsertions() { - - final int totalNumReads = 7; - final ObjectList myReads = new 
ObjectArrayList<>(totalNumReads); - - // add reads, one with a SNP and one with a SNP and insertion - for ( int i = 0; i < totalNumReads; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, readLength); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(false); - - final byte[] bases = Utils.dupBytes((byte) 'A', readLength); - if ( i < 2 ) - bases[20] = 'C'; - if ( i == 0 ) - bases[80] = 'C'; - read.setReadBases(bases); - - if ( i == 0 ) - read.setCigarString("80M1I19M"); - - myReads.add(read); - } - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - final Pair, CompressionStash> result = slidingWindow.close(null); - Assert.assertEquals(result.getFirst().size(), 3); // no compression at all for SNPs - } - - @Test - public void testAddingReadPairWithSameCoordinates() { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10); - - final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, globalStartPosition, 1); - read1.setReadBases(new byte[]{(byte)'A'}); - read1.setBaseQualities(new byte[]{(byte)'A'}); - read1.setMappingQuality(30); - read1.setReadNegativeStrandFlag(false); - slidingWindow.addRead(read1); - - final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, globalStartPosition, 1); - read2.setReadBases(new byte[]{(byte)'A'}); - read2.setBaseQualities(new byte[]{(byte)'A'}); - read2.setMappingQuality(30); - read2.setReadNegativeStrandFlag(true); - slidingWindow.addRead(read2); - - Assert.assertEquals(slidingWindow.readsInWindow.size(), 2); - } - - @Test - public void testOnlySpanningReadHasLowQual() { - final 
SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - - final GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header, "basicRead1", 0, globalStartPosition, 100); - final GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header, "basicRead2", 0, globalStartPosition + 50, 100); - - final byte[] bases = Utils.dupBytes((byte) 'A', readLength); - read1.setReadBases(bases); - read2.setReadBases(bases); - - final byte[] baseQuals = Utils.dupBytes((byte) 30, readLength); - baseQuals[80] = (byte)10; - read1.setBaseQualities(baseQuals); - read2.setBaseQualities(baseQuals); - - read1.setMappingQuality(30); - read2.setMappingQuality(30); - - slidingWindow.addRead(read1); - slidingWindow.addRead(read2); - - Assert.assertEquals(slidingWindow.close(null).getFirst().size(), 1); - } - - - /////////////////////////////////////////////////////////// - //// This section tests the downsampling functionality //// - /////////////////////////////////////////////////////////// - - @DataProvider(name = "Downsampling") - public Object[][] createDownsamplingTestData() { - List tests = new ArrayList(); - - for ( int i = 1; i < basicReads.size() + 10; i++ ) - tests.add(new Object[]{i}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "Downsampling", enabled = true) - public void testDownsamplingTest(final int dcov) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); - final ObjectList result = slidingWindow.downsampleVariantRegion(basicReads); - - Assert.assertEquals(result.size(), Math.min(dcov, basicReads.size())); - } - - @DataProvider(name = "DownsamplingFromClose") - public Object[][] createDownsamplingFromCloseTestData() { - - final ObjectList myReads = new 
ObjectArrayList<>(20); - for ( int i = 0; i < 21; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength); - final byte[] bases = Utils.dupBytes((byte) 'A', readLength); - if ( i < 5 ) - bases[50] = 'C'; - read.setReadBases(bases); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(false); - myReads.add(read); - } - - List tests = new ArrayList<>(); - - for ( int i = 1; i < 25; i++ ) - tests.add(new Object[]{myReads, i}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "DownsamplingFromClose", enabled = true) - public void testDownsamplingTestFromClose(final ObjectList myReads, final int dcov) { - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - Pair, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet()); // no het compression - - Assert.assertEquals(result.getFirst().size(), Math.min(dcov, myReads.size()), "Down-sampling was not performed correctly"); - } - - @DataProvider(name = "NoDownsamplingForConsensusReads") - public Object[][] createNoDownsamplingForConsensusReadsData() { - - final ObjectList myReads = new ObjectArrayList<>(20); - for ( int i = 0; i < 30; i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read" + i, 0, globalStartPosition, readLength); - final byte[] bases = Utils.dupBytes((byte) 'A', readLength); - if ( i < 10 ) - bases[50] = 'C'; - read.setReadBases(bases); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(false); - read.setReadNegativeStrandFlag(i % 2 == 0); - myReads.add(read); - } - - List tests = new ArrayList<>(); - 
- for ( int i = 0; i < 5; i++ ) - tests.add(new Object[]{myReads, i}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "NoDownsamplingForConsensusReads", enabled = true) - public void testNoDownsamplingForConsensusReads(final ObjectList myReads, final int dcov) { - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, dcov, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : myReads ) - slidingWindow.addRead(read); - Pair, CompressionStash> result = slidingWindow.close(null); // allow het compression (so we expect 4 reads) - - Assert.assertEquals(result.getFirst().size(), 4, "Down-sampling was performed on consensus reads!"); - } - - ////////////////////////////////////////////////////////////// - //// This section tests the consensus base quals accuracy //// - ////////////////////////////////////////////////////////////// - - private class QualsTest { - public final List quals; - public final List myReads = new ArrayList(5); - - private QualsTest(final List quals) { - this.quals = quals; - for ( int i = 0; i < quals.size(); i++ ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead" + i, 0, globalStartPosition, 1); - read.setReadBases(new byte[]{(byte)'A'}); - read.setBaseQualities(new byte[]{quals.get(i).byteValue()}); - read.setMappingQuality(30); - myReads.add(read); - } - } - } - - @DataProvider(name = "ConsensusQuals") - public Object[][] createConsensusQualsData() { - List tests = new ArrayList(); - - final int[] quals = new int[]{ 0, 5, 10, 15, 20, 30, 40, 50 }; - - for ( final int qual1 : quals ) { - for ( final int qual2 : quals ) { - for ( final int qual3 : quals ) { - tests.add(new Object[]{new QualsTest(Arrays.asList(qual1, qual2, qual3))}); - } - } - } - - return tests.toArray(new Object[][]{}); - } - - private static final byte minUsableConsensusQual = 10; - - 
@Test(dataProvider = "ConsensusQuals", enabled = true) - public void testConsensusQualsTest(QualsTest test) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, minUsableConsensusQual, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); - for ( final GATKSAMRecord read : test.myReads ) - slidingWindow.addRead(read); - final Pair, CompressionStash> result = slidingWindow.close(new ObjectAVLTreeSet()); - - Assert.assertEquals(result.getFirst().size(), 1); - final GATKSAMRecord read = result.getFirst().iterator().next(); - final int actualBaseQual = read.getReducedCount(0) * read.getBaseQualities()[0]; - final int expectedBaseQual = qualSum(test.quals); - Assert.assertEquals(actualBaseQual, expectedBaseQual); - } - - private static int qualSum(final List quals) { - int goodBases = 0; - int sum = 0; - for ( final int qual : quals ) { - if ( qual >= minUsableConsensusQual ) { - goodBases++; - sum += qual; - } - } - - // handle a low quality consensus - if ( sum == 0 ) { - for ( final int qual : quals ) { - goodBases++; - sum += qual; - } - } - - return sum - (sum % goodBases); - } - - - //////////////////////////////////////////////////// - //// This section tests the new header creation //// - //////////////////////////////////////////////////// - - @DataProvider(name = "CreateNewHeader") - public Object[][] CreateNewHeaderTestData() { - List tests = new ArrayList(); - - for ( final int start : Arrays.asList(-10, -1, 0, 1, 10) ) { - for ( final int stop : Arrays.asList(-10, -1, 0, 1, 10) ) { - tests.add(new Object[]{start, stop}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "CreateNewHeader", enabled = true) - public void createNewHeaderTest(final int start, final int stop) { - - // set up the window header - final int currentHeaderStart = 100; - final int currentHeaderLength = 50; - final LinkedList windowHeader = new LinkedList(); - for ( int i = 
0; i < currentHeaderLength; i++ ) - windowHeader.add(new HeaderElement(currentHeaderStart + i)); - - // set up the read - final int readStart = currentHeaderStart + start; - final int readLength = currentHeaderLength + stop - start; - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, readStart, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - read.setBaseQualities(Utils.dupBytes((byte) 30, readLength)); - read.setMappingQuality(30); - - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); - int newIndex = slidingWindow.createNewHeaderElements(windowHeader, read, start); - - Assert.assertEquals(newIndex, start > 0 ? start : 0); - - final int expectedNewLength = currentHeaderLength + (start < 0 ? -start : 0) + (stop > 0 ? stop : 0); - Assert.assertEquals(windowHeader.size(), expectedNewLength); - } - - - //////////////////////////////////////////////////////////// - //// This section tests updating the header from a read //// - //////////////////////////////////////////////////////////// - - @DataProvider(name = "UpdateHeaderForRead") - public Object[][] UpdateHeaderForReadTestData() { - List tests = new ArrayList(); - - for ( final int start : Arrays.asList(0, 1, 10) ) { - for ( final int readLength : Arrays.asList(1, 5, 10) ) { - tests.add(new Object[]{start, readLength}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "UpdateHeaderForRead", enabled = true) - public void updateHeaderForReadTest(final int start, final int readLength) { - - // set up the window header - final int currentHeaderStart = 100; - final int currentHeaderLength = 50; - final LinkedList windowHeader = new LinkedList(); - for ( int i = 0; i < currentHeaderLength; i++ ) - windowHeader.add(new HeaderElement(currentHeaderStart + i)); - - // set up the read - 
final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart + start, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', readLength)); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(30); - read.setReadNegativeStrandFlag(false); - - // add the read - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); - slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, start); - for ( int i = 0; i < start; i++ ) - Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0); - for ( int i = 0; i < readLength; i++ ) - Assert.assertEquals(windowHeader.get(start + i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 1); - for ( int i = start + readLength; i < currentHeaderLength; i++ ) - Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0); - - // now remove the read - slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, true, start); - for ( int i = 0; i < currentHeaderLength; i++ ) - Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 0); - } - - @Test - public void testUpdateHeaderForReadWithHighMQ() { - - // set up the window header - final int currentHeaderStart = 100; - final LinkedList windowHeader = new LinkedList<>(); - for ( int i = 0; i < readLength; i++ ) - windowHeader.add(new HeaderElement(currentHeaderStart + i)); - - // set up the read - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, readLength); - read.setReadBases(Utils.dupBytes((byte) 'A', 
readLength)); - read.setBaseQualities(Utils.dupBytes((byte)30, readLength)); - read.setMappingQuality(180); - read.setReadNegativeStrandFlag(false); - - // add the read and make sure it's not filtered because of low MQ (byte vs. int) - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); - slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0); - for ( int i = 0; i < readLength; i++ ) - Assert.assertEquals(windowHeader.get(i).getBaseCounts(SlidingWindow.ConsensusType.POSITIVE_CONSENSUS).countOfBase(BaseUtils.Base.A.base), 1); - } - - ////////////////////////////////////////////////////////////////////////////////// - //// This section tests functionality related to polyploid consensus creation //// - ////////////////////////////////////////////////////////////////////////////////// - - @DataProvider(name = "MatchesKnownProvider") - public Object[][] matchesKnownProvider() { - - final ObjectArrayList tests = new ObjectArrayList(); - - // test no knowns - tests.add(new Object[]{new ObjectAVLTreeSet(), loc290.getStart(), false}); - - final ObjectSortedSet knownSnpPositions = new ObjectAVLTreeSet(); - knownSnpPositions.add(loc290); - knownSnpPositions.add(loc295); - knownSnpPositions.add(loc310); - - // test overlap - tests.add(new Object[]{knownSnpPositions, loc290.getStart(), true}); - tests.add(new Object[]{knownSnpPositions, loc295.getStart(), true}); - tests.add(new Object[]{knownSnpPositions, loc310.getStart(), true}); - tests.add(new Object[]{knownSnpPositions, loc309.getStart(), false}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "MatchesKnownProvider") - public void testMatchesKnown(final ObjectSortedSet knownSnpPositions, final int targetLoc, final boolean expectedResult) { - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10); - 
Assert.assertEquals(slidingWindow.matchesKnownPosition(targetLoc, knownSnpPositions), expectedResult); - } - - @DataProvider(name = "SignificantSoftclipsProvider") - public Object[][] SignificantSoftclipsTestData() { - List tests = new ArrayList(); - - for ( final int indexWithSoftclips : Arrays.asList(-1, 0, 5, 9) ) { - for ( final int indexToSkip : Arrays.asList(-1, 0, 5, 9) ) { - tests.add(new Object[]{indexWithSoftclips, indexToSkip}); - } - } - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "SignificantSoftclipsProvider", enabled = true) - public void significantSoftclipsTest(final int indexWithSoftclips, final int indexToSkip) { - - // set up the window header - final int currentHeaderStart = 100; - final int currentHeaderLength = 10; - final LinkedList windowHeader = new LinkedList(); - for ( int i = 0; i < currentHeaderLength; i++ ) - windowHeader.add(new HeaderElement(currentHeaderStart + i)); - - // set up the normal read - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart, currentHeaderLength); - read.setReadBases(Utils.dupBytes((byte) 'A', currentHeaderLength)); - read.setBaseQualities(Utils.dupBytes((byte)30, currentHeaderLength)); - read.setMappingQuality(30); - - // add the read - final SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 0.05, 20, 20, 10, ReduceReads.DownsampleStrategy.Normal, false); - slidingWindow.actuallyUpdateHeaderForRead(windowHeader, read, false, 0); - - // set up and add a soft-clipped read if requested - if ( indexWithSoftclips != -1 ) { - final GATKSAMRecord softclippedRead = ArtificialSAMUtils.createArtificialRead(header, "basicRead", 0, currentHeaderStart + indexWithSoftclips, 1); - softclippedRead.setReadBases(new byte[]{(byte) 'A'}); - softclippedRead.setBaseQualities(new byte[]{(byte) 30}); - softclippedRead.setMappingQuality(30); - softclippedRead.setCigarString("1S"); 
- slidingWindow.actuallyUpdateHeaderForRead(windowHeader, softclippedRead, false, indexWithSoftclips); - } - - final boolean result = slidingWindow.hasPositionWithSignificantSoftclipsOrVariant(windowHeader, currentHeaderStart + indexToSkip); - Assert.assertEquals(result, indexWithSoftclips != -1 && indexWithSoftclips != indexToSkip); - } -} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java deleted file mode 100644 index fd1f0de8a..000000000 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/BiasedDownsamplingIntegrationTest.java +++ /dev/null @@ -1,162 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. 
DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
-*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.collections.Pair; -import org.junit.Assert; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.io.File; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; -import java.util.Random; - -public class BiasedDownsamplingIntegrationTest extends WalkerTest { - - private final static String baseCommandUG = "-T UnifiedGenotyper -R " + hg19Reference + " --no_cmdline_in_header -glm BOTH -L 20:4,000,000-5,000,000"; - private final static String baseCommandHC = "-T HaplotypeCaller -R " + hg19Reference + " --no_cmdline_in_header -L 20:4,000,000-5,000,000" + " --useFilteredReadsForAnnotations"; - - private final String ArtificalBAMLocation = privateTestDir + "ArtificallyContaminatedBams/"; - - - // -------------------------------------------------------------------------------------------------------------- - // - // testing UnifiedGenotyper contamination down-sampling on BAMs with artificially created contaminated. 
- // - // -------------------------------------------------------------------------------------------------------------- - - @Test - private void testDefaultContamination() { - final String bam1 = "NA11918.with.1.NA12842.reduced.bam"; - final String bam2 = "NA12842.with.1.NA11918.reduced.bam"; - - WalkerTestSpec spec = new WalkerTestSpec( - baseCommandUG + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s -contamination .05 ", 1, - Arrays.asList("b13612312ff991cf40ddc44255e76ecd")); - executeTest("test contamination on Artificial Contamination (flat) on " + bam1 + " and " + bam2 + " with .05 downsampling.", spec); - } - - - // verify that inputing a file with an effectively flat contamination level is equivalent to handing in a flat contamination level - - - @DataProvider(name="PerSampleEqualFlatContamBams") - public Object[][] makePerSampleEqualFlatContamBams() { - final List tests = new LinkedList(); - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0}) ; - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.7.txt", 0.15}) ; - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3}) ; - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "PerSampleEqualFlatContamBams") - private void testPerSampleEqualsFlat(final String bam1, final String bam2, final String persampleFile, final Double downsampling) { - final String command = baseCommandUG + " -I " + ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s "; - - WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList("")); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - 
rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result - Pair, List> test1 = executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec); - - spec = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList("")); - - rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result - Pair, List> test2 = executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec); - - //verify that the md5s match up. - Assert.assertEquals(test1.getSecond().get(0),test2.getSecond().get(0)); - } - - // -------------------------------------------------------------------------------------------------------------- - // - // testing HaplotypeCaller Contamination Removal - // - // -------------------------------------------------------------------------------------------------------------- - - - - @DataProvider(name="PerSampleEqualFlatContamBamsHC") - public Object[][] makePerSampleEqualFlatContamBamsHC() { - final List tests = new LinkedList(); - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.6.txt", 0.0 }) ; - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.7.txt", 0.15}) ; - tests.add(new Object[]{"NA11918.with.2.NA12842.reduced.bam", "NA12842.with.1.NA11918.reduced.bam", ArtificalBAMLocation + "contamination.case.8.txt", 0.3}) ; - - return tests.toArray(new Object[][]{}); - } - - - @Test(dataProvider = "PerSampleEqualFlatContamBamsHC") - private void testPerSampleEqualsFlatHC(final String bam1, final String bam2, final String persampleFile, final Double downsampling) { - final String command = baseCommandHC + " -I " + 
ArtificalBAMLocation + bam1 + " -I " + ArtificalBAMLocation + bam2 + " -o %s "; - - WalkerTestSpec spec = new WalkerTestSpec( command +" -contaminationFile " + persampleFile, 1, Arrays.asList("")); - final Random rnd = GenomeAnalysisEngine.getRandomGenerator(); - - rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result - - Pair, List> test1= executeTest("test contamination on Artificial Contamination, with per-sample file on " + bam1 + " and " + bam2 + " with " + persampleFile, spec); - - WalkerTestSpec spec2 = new WalkerTestSpec(command + "-contamination " + downsampling.toString(), 1, Arrays.asList("")); - - rnd.setSeed(123451); // so that the two test cases have a hope of giving the same result - Pair, List> test2=executeTest("test contamination on Artificial Contamination, with flat contamination on " + bam1 + " and " + bam2 + " with " + downsampling.toString(), spec); - - //verify that the md5s match up. - Assert.assertEquals(test1.getSecond().get(0),test2.getSecond().get(0)); - - } - - - -} \ No newline at end of file diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java deleted file mode 100644 index df749231e..000000000 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperReducedReadsIntegrationTest.java +++ /dev/null @@ -1,87 +0,0 @@ -/* -* By downloading the PROGRAM you agree to the following terms of use: -* -* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* -* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
-* -* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and -* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. -* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* -* 1. DEFINITIONS -* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* -* 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. -* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. -* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY -* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. -* Copyright 2012 Broad Institute, Inc. -* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. -* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* -* 4. 
INDEMNIFICATION -* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* -* 5. NO REPRESENTATIONS OR WARRANTIES -* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. -* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* -* 6. ASSIGNMENT -* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* -* 7. MISCELLANEOUS -* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. -* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. -* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. -* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. -* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. -*/ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; - -import java.util.Arrays; - -public class UnifiedGenotyperReducedReadsIntegrationTest extends WalkerTest { - - // -------------------------------------------------------------------------------------------------------------- - // - // testing reduced reads - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void testReducedBam() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("ffde0d5e23523e4bd9e7e18f62d37d0f")); - executeTest("test calling on a ReducedRead BAM", spec); - } - - @Test - public void testReducedBamSNPs() { - testReducedCalling("SNP", "cc0508b18028f2e84e6a42c1ff23721c"); - } - - @Test - public void testReducedBamINDELs() { - testReducedCalling("INDEL", "6fc00d5299b1bf334d39634c3409a69d"); - } - - - private void testReducedCalling(final String model, final String md5) { - 
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper --contamination_fraction_to_filter 0.05 --disableDithering -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-10,500,000 -glm " + model, 1, - Arrays.asList(md5)); - executeTest("test calling on a ReducedRead BAM with " + model, spec); - } -} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java index af66d7f88..664afda51 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/AssemblyResultSetUnitTest.java @@ -171,7 +171,7 @@ public class AssemblyResultSetUnitTest extends BaseTest final ReadThreadingGraph rtg = new ReadThreadingGraph(10); for (final Haplotype h : haplotypes) - rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), null, h.isReference()); + rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), h.isReference()); final SeqGraph seqGraph = rtg.convertToSequenceGraph(); final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION,seqGraph); ar.setThreadingGraph(rtg); diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 615c62c43..c95a3a839 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ 
b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -217,28 +217,6 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { } - // -------------------------------------------------------------------------------------------------------------- - // - // testing reduced reads - // - // -------------------------------------------------------------------------------------------------------------- - - @Test - public void HCTestReducedBam() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("12c56262ed30db1249b8d722e324357c")); - executeTest("HC calling on a ReducedRead BAM", spec); - } - - @Test - public void testReducedBamWithReadsNotFullySpanningDeletion() { - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T HaplotypeCaller --contamination_fraction_to_filter 0.05 --disableDithering --pcr_indel_model NONE -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("1627cf5f3a97e8b73b3c095db46aef1b")); - executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); - } - // -------------------------------------------------------------------------------------------------------------- // // test dbSNP annotation diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java index 7d218c19c..309fd2549 100644 --- 
a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/ReferenceConfidenceModelUnitTest.java @@ -157,26 +157,6 @@ public class ReferenceConfidenceModelUnitTest extends BaseTest { } } - @Test - public void testCalcNIndelInformativeReducedReads() { - final String bases = "ACGGGTTTGGAC"; - final byte[] quals = Utils.dupBytes((byte)30, bases.length()); - final int count = 10; - final int[] counts = new int[bases.length()]; - for ( int i = 0; i < counts.length; i++ ) - counts[i] = count; - final int position = 100; - - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, position, counts.length, counts); - read.setReadString(bases); - read.setBaseQualities(quals); - read.setCigarString(bases.length() + "M"); - final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, position, position); - final ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, Collections.singletonList(read), 0); - final int actual = model.calcNIndelInformativeReads(pileup, 0, bases.getBytes(), 3); - Assert.assertEquals(actual, count); - } - @Test public void testClose() { model.close(); diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java index 0d9c07251..f6e2a106f 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java @@ -79,8 +79,8 @@ public class ReadThreadingGraphUnitTest extends BaseTest { final 
ReadThreadingGraph assembler = new ReadThreadingGraph(11); final String ref = "CATGCACTTTAAAACTTGCCTTTTTAACAAGACTTCCAGATG"; final String alt = "CATGCACTTTAAAACTTGCCGTTTTAACAAGACTTCCAGATG"; - assembler.addSequence("anonymous", getBytes(ref), null, true); - assembler.addSequence("anonymous", getBytes(alt), null, false); + assembler.addSequence("anonymous", getBytes(ref), true); + assembler.addSequence("anonymous", getBytes(alt), false); assembler.buildGraphIfNecessary(); Assert.assertNotEquals(ref.length() - 11 + 1,assembler.vertexSet().size(),"the number of vertex in the graph is the same as if there was no alternative sequence"); Assert.assertEquals(ref.length() - 11 + 1 + 11,assembler.vertexSet().size(),"the number of vertex in the graph is not the same as if there is an alternative sequence"); @@ -178,7 +178,7 @@ public class ReadThreadingGraphUnitTest extends BaseTest { // test that there are cycles detected for small kmer final ReadThreadingGraph rtgraph25 = new ReadThreadingGraph(25); - rtgraph25.addSequence("ref", ref.getBytes(), null, true); + rtgraph25.addSequence("ref", ref.getBytes(), true); for ( final GATKSAMRecord read : reads ) rtgraph25.addRead(read); rtgraph25.buildGraphIfNecessary(); @@ -186,7 +186,7 @@ public class ReadThreadingGraphUnitTest extends BaseTest { // test that there are no cycles detected for large kmer final ReadThreadingGraph rtgraph75 = new ReadThreadingGraph(75); - rtgraph75.addSequence("ref", ref.getBytes(), null, true); + rtgraph75.addSequence("ref", ref.getBytes(), true); for ( final GATKSAMRecord read : reads ) rtgraph75.addRead(read); rtgraph75.buildGraphIfNecessary(); @@ -200,7 +200,7 @@ public class ReadThreadingGraphUnitTest extends BaseTest { final byte[] ref = Utils.dupBytes((byte)'A', length); final ReadThreadingGraph rtgraph = new ReadThreadingGraph(25); - rtgraph.addSequence("ref", ref, null, true); + rtgraph.addSequence("ref", ref, true); // add reads with Ns at any position for ( int i = 0; i < length; i++ ) { @@ 
-250,7 +250,7 @@ public class ReadThreadingGraphUnitTest extends BaseTest { // create the graph and populate it final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize); - rtgraph.addSequence("ref", ref.getBytes(), null, true); + rtgraph.addSequence("ref", ref.getBytes(), true); final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); rtgraph.addRead(read); rtgraph.buildGraphIfNecessary(); diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java index 7c3160c30..fe381513e 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/SequenceForKmersUnitTest.java @@ -55,26 +55,12 @@ public class SequenceForKmersUnitTest extends BaseTest { @Test public void testNoCount() { final byte[] seq = "ACGT".getBytes(); - final SequenceForKmers sk = new SequenceForKmers("foo", seq, 0, seq.length, null, true); + final SequenceForKmers sk = new SequenceForKmers("foo", seq, 0, seq.length, 1, true); Assert.assertEquals(sk.name, "foo"); Assert.assertEquals(sk.sequence, seq); Assert.assertEquals(sk.start, 0); Assert.assertEquals(sk.stop, seq.length); + Assert.assertEquals(sk.count, 1); Assert.assertEquals(sk.isRef, true); - for ( int i = 0; i < seq.length; i++ ) - Assert.assertEquals(sk.getCount(i), 1); - } - - @Test - public void testWithCounts() { - final int len = 256; - final int[] counts = new int[len]; - for ( int i = 0; i < len; i++ ) counts[i] = i; - final byte[] seq = Utils.dupBytes((byte)'A', len); - - final SequenceForKmers sk = new 
SequenceForKmers("foo", seq, 0, seq.length, counts, true); - - for ( int i = 0; i < seq.length; i++ ) - Assert.assertEquals(sk.getCount(i), i); } } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java index 84b995749..1e5417227 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/utils/pairhmm/ActiveRegionTestDataSet.java @@ -116,9 +116,9 @@ public class ActiveRegionTestDataSet { public AssemblyResultSet assemblyResultSet() { if (assemblyResultSet == null) { final ReadThreadingGraph rtg = new ReadThreadingGraph(kmerSize); - rtg.addSequence("anonymous", this.getReference().getBytes(), null, true); + rtg.addSequence("anonymous", this.getReference().getBytes(), true); for (final String haplotype : this.haplotypesStrings()) { - rtg.addSequence("anonymous", haplotype.getBytes(), null, false); + rtg.addSequence("anonymous", haplotype.getBytes(), false); } rtg.buildGraphIfNecessary(); if (rtg.hasCycles()) diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 9dc9734a5..4f680ffc3 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -31,7 +31,6 @@ import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.ReadMetrics; import 
org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -48,10 +47,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; -import org.broadinstitute.sting.utils.text.XReadLines; import java.io.File; -import java.io.FileNotFoundException; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Method; import java.util.*; @@ -327,6 +324,8 @@ public class SAMDataSource { // and read group id (merged) -> read group id (original) mappings. for(SAMReaderID id: readerIDs) { SAMFileReader reader = readers.getReader(id); + checkForReducedBamFile(reader.getFileHeader()); + ReadGroupMapping mappingToMerged = new ReadGroupMapping(); List readGroups = reader.getFileHeader().getReadGroups(); @@ -352,6 +351,16 @@ public class SAMDataSource { resourcePool.releaseReaders(readers); } + /** + * Checks whether the provided SAM header if from a reduced bam file. 
+ * @param header the SAM header for a given file + * @throws UserException if the header is from a reduced bam + */ + private void checkForReducedBamFile(final SAMFileHeader header) { + if ( header.getProgramRecord("GATK ReduceReads") != null ) + throw new UserException("The GATK no longer supports running off of BAMs produced by ReduceReads"); + } + public void close() { SAMReaders readers = resourcePool.getAvailableReaders(); for(SAMReaderID readerID: readerIDs) { diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index fb7a16bfd..56c370276 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -65,18 +65,14 @@ public class AlleleBiasedDownsamplingUtils { alleleStratifiedElements[i] = new PileupElementList(); // start by stratifying the reads by the alleles they represent at this position - boolean sawReducedRead = false; for ( final PileupElement pe : pileup ) { - if ( pe.getRead().isReducedRead() ) - sawReducedRead = true; - final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); if ( baseIndex != -1 ) alleleStratifiedElements[baseIndex].add(pe); } // make a listing of allele counts and calculate the total count - final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements, sawReducedRead); + final int[] alleleCounts = calculateAlleleCounts(alleleStratifiedElements); final int totalAlleleCount = (int)MathUtils.sum(alleleCounts); // do smart down-sampling @@ -106,18 +102,12 @@ public class AlleleBiasedDownsamplingUtils { * Calculates actual allele counts for each allele (which can be different than the list size when reduced reads are present) * * @param 
alleleStratifiedElements pileup elements stratified by allele - * @param sawReducedRead is at least one read a reduced read? * @return non-null int array representing allele counts */ - private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements, final boolean sawReducedRead) { + private static int[] calculateAlleleCounts(final PileupElementList[] alleleStratifiedElements) { final int[] alleleCounts = new int[alleleStratifiedElements.length]; for ( int i = 0; i < alleleStratifiedElements.length; i++ ) { - if ( !sawReducedRead ) { - alleleCounts[i] = alleleStratifiedElements[i].size(); - } else { - for ( final PileupElement pe : alleleStratifiedElements[i] ) - alleleCounts[i] += pe.getRepresentativeCount(); - } + alleleCounts[i] = alleleStratifiedElements[i].size(); } return alleleCounts; } @@ -211,24 +201,7 @@ public class AlleleBiasedDownsamplingUtils { int currentBitSetIndex = 0; for ( final PileupElement element : elements ) { - - final int representativeCount = element.getRepresentativeCount(); - - // if it's a reduced read, we need to be smart about how we down-sample - if ( representativeCount > 1 ) { - // count how many bits are set over the span represented by this read - int setBits = 0; - for ( int i = 0; i < representativeCount; i++ ) - setBits += itemsToRemove.get(currentBitSetIndex++) ? 
1 : 0; - - // remove that count from the count of the reduced read - if ( setBits == representativeCount ) - elementsToRemove.add(element); - else - element.adjustRepresentativeCount(-1 * setBits); - } - // otherwise it's trivial: remove if the corresponding bit is set - else if ( itemsToRemove.get(currentBitSetIndex++) ) { + if ( itemsToRemove.get(currentBitSetIndex++) ) { elementsToRemove.add(element); } } @@ -255,7 +228,6 @@ public class AlleleBiasedDownsamplingUtils { alleles.remove(Allele.NO_CALL); // ignore the no-call bin final int numAlleles = alleles.size(); - // TODO -- if we ever decide to make this work for reduced reads, this will need to use the representative counts instead final int[] alleleCounts = new int[numAlleles]; for ( int i = 0; i < numAlleles; i++ ) alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size(); @@ -302,9 +274,6 @@ public class AlleleBiasedDownsamplingUtils { int currentBitSetIndex = 0; for ( final GATKSAMRecord read : reads ) { - if ( read.isReducedRead() ) - throw new IllegalStateException("Allele-biased downsampling of reduced reads has not been implemented for a list of GATKSAMRecords"); - if ( itemsToRemove.get(currentBitSetIndex++) ) elementsToRemove.add(read); } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/Downsampler.java index 466ade1ed..7b42f75f9 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/Downsampler.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/downsampling/Downsampler.java @@ -25,9 +25,6 @@ package org.broadinstitute.sting.gatk.downsampling; -import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - import java.util.Collection; import java.util.List; @@ -159,14 +156,6 @@ public abstract class Downsampler { * @return 
true if the item should not be subject to elimination during downsampling, otherwise false */ protected boolean doNotDiscardItem( final Object item ) { - // Use getClass() rather than instanceof for performance reasons. Ugly but fast. - if ( item.getClass() == GATKSAMRecord.class ) { - return ((GATKSAMRecord)item).isReducedRead(); - } - else if ( item.getClass() == AlignmentStateMachine.class ) { - return ((AlignmentStateMachine)item).isReducedRead(); - } - return false; } } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java index e30965925..2e38f5daa 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java @@ -314,14 +314,13 @@ public class CallableLoci extends LocusWalker= minMappingQuality && (e.getQual() >= minBaseQuality || e.isDeletion())) { - QCDepth += depth; + QCDepth++; } } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java index f0d6f7301..0d61af305 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -217,12 +217,12 @@ public class CoverageUtils { private static void updateCounts(int[] counts, PileupElement e) { if ( e.isDeletion() ) { - counts[BaseUtils.Base.D.ordinal()] += e.getRepresentativeCount(); + counts[BaseUtils.Base.D.ordinal()]++; } else if ( BaseUtils.basesAreEqual(BaseUtils.Base.N.base, e.getBase()) ) { - counts[BaseUtils.Base.N.ordinal()] += e.getRepresentativeCount(); + 
counts[BaseUtils.Base.N.ordinal()]++; } else { try { - counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount(); + counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++; } catch (ArrayIndexOutOfBoundsException exc) { throw new ReviewedStingException("Expected a simple base, but actually received"+(char)e.getBase()); } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/DeprecatedToolChecks.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/DeprecatedToolChecks.java index 9823e524a..f867f76c2 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/DeprecatedToolChecks.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/DeprecatedToolChecks.java @@ -42,6 +42,7 @@ public class DeprecatedToolChecks { private static Object2ObjectMap deprecatedGATKWalkers = new Object2ObjectOpenHashMap(); static { // Indicate recommended replacement in parentheses if applicable + deprecatedGATKWalkers.put("ReduceReads", "3.0 (use recommended best practices pipeline with the HaplotypeCaller)"); deprecatedGATKWalkers.put("CountCovariates", "2.0 (use BaseRecalibrator instead; see documentation for usage)"); deprecatedGATKWalkers.put("TableRecalibration", "2.0 (use PrintReads with -BQSR instead; see documentation for usage)"); deprecatedGATKWalkers.put("AlignmentWalker", "2.2 (no replacement)"); diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 7d5823ae0..fd04dbc21 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -399,12 +399,6 @@ public class ClippingOp { hardClippedRead.setBaseQualities(newBaseDeletionQuals, EventType.BASE_DELETION); } - if 
(read.isReducedRead()) { - final int[] reducedCounts = new int[newLength]; - System.arraycopy(read.getReducedReadCounts(), copyStart, reducedCounts, 0, newLength); - hardClippedRead.setReducedReadCounts(reducedCounts); - } - return hardClippedRead; } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java index b7cd03919..49ec6f20a 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -32,7 +32,6 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; import java.util.*; @@ -115,13 +114,9 @@ public class PerReadAlleleLikelihoodMap { alleleReadMap.put(allele, new ArrayList()); for ( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { - // TODO -- come up with a strategy for down-sampling reduced reads - // Currently we are unable to remove reduced reads because their representative base count differs throughout the read - if ( !entry.getKey().isReducedRead() ) { - final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue()); - if ( bestAllele.isInformative() ) - alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey()); - } + final MostLikelyAllele bestAllele = getMostLikelyAllele(entry.getValue()); + if ( bestAllele.isInformative() ) + alleleReadMap.get(bestAllele.getMostLikelyAllele()).add(entry.getKey()); } return alleleReadMap; @@ -233,10 +228,9 @@ public class PerReadAlleleLikelihoodMap { 
for( final Map.Entry> entry : likelihoodReadMap.entrySet() ) { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) final GATKSAMRecord read = entry.getKey(); - final int count = ReadUtils.getMeanRepresentativeReadCount(read); final double likelihood_iii = entry.getValue().get(iii_allele); final double likelihood_jjj = entry.getValue().get(jjj_allele); - haplotypeLikelihood += count * (MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + MathUtils.LOG_ONE_HALF); + haplotypeLikelihood += MathUtils.approximateLog10SumLog10(likelihood_iii, likelihood_jjj) + MathUtils.LOG_ONE_HALF; // fast exit. If this diploid pair is already worse than the max, just stop and look at the next pair if ( haplotypeLikelihood < maxElement ) break; diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java index 86f3500be..c4b566582 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/locusiterator/AlignmentStateMachine.java @@ -123,15 +123,6 @@ public class AlignmentStateMachine { return getRead().getReferenceIndex(); } - /** - * Is our read a reduced read? - * - * @return true if the read we encapsulate is a reduced read, otherwise false - */ - public boolean isReducedRead() { - return read.isReducedRead(); - } - /** * Is this the left edge state? I.e., one that is before or after the current read? 
* @return true if this state is an edge state, false otherwise diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElement.java index 8a034dde0..42cfc9492 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -30,8 +30,6 @@ import com.google.java.contract.Requires; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Arrays; @@ -296,43 +294,6 @@ public class PileupElement implements Comparable { // // -------------------------------------------------------------------------- - /** - * Returns the number of elements in the pileup element. - * - * Unless this is a reduced read, the number of elements in a pileup element is one. In the event of - * this being a reduced read and a deletion, we return the average number of elements between the left - * and right elements to the deletion. We assume the deletion to be left aligned. - * - * @return the representative count - */ - public int getRepresentativeCount() { - if (read.isReducedRead()) { - if (isDeletion() && (offset + 1 >= read.getReadLength()) ) // deletion in the end of the read - throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s", read.getReadName(), read.getCigarString())); - - return isDeletion() - ? 
MathUtils.fastRound((read.getReducedCount(offset) + read.getReducedCount(offset + 1)) / 2.0) - : read.getReducedCount(offset); - } else { - return 1; - } - } - - /** - * Adjusts the representative count of this pileup element. - * Throws an exception if this element does not represent a reduced read. - * - * See GATKSAMRecord.adjustReducedCount() for warnings on the permanency of this operation. - * - * @param adjustmentFactor how much to adjust the representative count (can be positive or negative) - */ - public void adjustRepresentativeCount(final int adjustmentFactor) { - if ( read.isReducedRead() ) - read.adjustReducedCount(offset, adjustmentFactor); - else - throw new IllegalArgumentException("Trying to adjust the representative count of a read that is not reduced"); - } - /** * Get the cigar element aligning this element to the genome * @return a non-null CigarElement diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index 455a6aa12..6ccf74e4e 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -811,10 +811,7 @@ public class ReadBackedPileupImpl implements ReadBackedPileup { @Override public int depthOfCoverage() { if (depthOfCoverage == UNINITIALIZED_CACHED_INT_VALUE) { - depthOfCoverage = 0; - for (PileupElement p : pileupElementTracker.unorderedIterable()) { - depthOfCoverage += p.getRepresentativeCount(); - } + depthOfCoverage = pileupElementTracker.size(); } return depthOfCoverage; } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 055f8630b..b8367a7df 100644 --- 
a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -324,31 +324,6 @@ public class ArtificialSAMUtils { return Arrays.asList(left, right); } - /** - * Create an artificial reduced read based on the parameters. The cigar string will be *M, where * is the - * length of the read. The base counts specified in the baseCounts array will be stored fully encoded in - * the RR attribute. - * - * @param header the SAM header to associate the read with - * @param name the name of the read - * @param refIndex the reference index, i.e. what chromosome to associate it with - * @param alignmentStart where to start the alignment - * @param length the length of the read - * @param baseCounts reduced base counts to encode in the RR attribute; length must match the read length - * @return the artificial reduced read - */ - public static GATKSAMRecord createArtificialReducedRead( final SAMFileHeader header, - final String name, - final int refIndex, - final int alignmentStart, - final int length, - final int[] baseCounts ) { - final GATKSAMRecord read = createArtificialRead(header, name, refIndex, alignmentStart, length); - read.setReducedReadCounts(baseCounts); - read.setReducedReadCountsTag(); - return read; - } - /** * Create a collection of identical artificial reads based on the parameters. The cigar string for each * read will be *M, where * is the length of the read. 
diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 93718b04d..52e6e1c25 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.sam; import com.google.java.contract.Ensures; import net.sf.samtools.*; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.recalibration.EventType; @@ -51,12 +50,6 @@ import java.util.*; * functions, so modifying a GATKSAMRecord in any way may result in stale cached values. */ public class GATKSAMRecord extends BAMRecord { - // ReduceReads specific attribute tags - public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; // marks a synthetic read produced by the ReduceReads tool - public static final String REDUCED_READ_STRANDED_TAG = "RS"; // marks a stranded synthetic read produced by the ReduceReads tool - public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start - public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end - // Base Quality Score Recalibrator specific attribute tags public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; // base qualities for insertions public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // base qualities for deletions @@ -70,17 +63,15 @@ public class GATKSAMRecord extends BAMRecord { // the SAMRecord data we're caching private String mReadString = 
null; private GATKSAMReadGroupRecord mReadGroup = null; - private int[] reducedReadCounts = null; private final static int UNINITIALIZED = -1; private int softStart = UNINITIALIZED; private int softEnd = UNINITIALIZED; private Integer adapterBoundary = null; - private Boolean isStrandlessRead = null; + private boolean isStrandlessRead = false; // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; - private boolean retrievedReduceReadCounts = false; // These temporary attributes were added here to make life easier for // certain algorithms by providing a way to label or attach arbitrary data to @@ -160,9 +151,6 @@ public class GATKSAMRecord extends BAMRecord { * @return true if this read doesn't have meaningful strand information */ public boolean isStrandless() { - if ( isStrandlessRead == null ) { - isStrandlessRead = isReducedRead() && getCharacterAttribute(REDUCED_READ_STRANDED_TAG) == null; - } return isStrandlessRead; } @@ -342,185 +330,6 @@ public class GATKSAMRecord extends BAMRecord { return getReadGroup().getNGSPlatform(); } - /////////////////////////////////////////////////////////////////////////////// - // *** ReduceReads functions ***// - /////////////////////////////////////////////////////////////////////////////// - - /** - * Get the counts of the bases in this reduced read - * - * NOTE that this is not the value of the REDUCED_READ_CONSENSUS_TAG, which - * is encoded in a special way. This is the actual positive counts of the - * depth at each bases. So for a RR with a tag of: - * - * [10, 5, -1, -5] - * - * this function returns - * - * [10, 15, 9, 5] - * - * as one might expect. - * - * @return a int[] holding the depth of the bases in this reduced read, or null if this isn't a reduced read - */ - public int[] getReducedReadCounts() { - if ( ! 
retrievedReduceReadCounts ) { - final byte[] tag = getByteArrayAttribute(REDUCED_READ_CONSENSUS_TAG); - if ( tag != null ) reducedReadCounts = decodeReduceReadCounts(tag); - retrievedReduceReadCounts = true; - } - - return reducedReadCounts; - } - - /** - * The number of bases corresponding the i'th base of the reduced read. - * - * @param i the read based coordinate inside the read - * @return the number of bases corresponding to the i'th base of the reduced read - */ - public final int getReducedCount(final int i) { - if ( !isReducedRead() ) - throw new IllegalArgumentException("error trying to retrieve the reduced count from a read that is not reduced"); - if ( i < 0 || i >= getReadBases().length ) - throw new IllegalArgumentException("illegal offset used when retrieving reduced counts: " + i); - - final int[] reducedCounts = getReducedReadCounts(); - return reducedCounts[i]; - } - - /** - * Is this read a reduced read? - * @return true if yes - */ - public boolean isReducedRead() { - return getReducedReadCounts() != null; - } - - /** - * Set the reduced read counts tag for this record. - * Note that this method is slightly expensive as it converts to the correct reduced counts representation and sets the - * appropriate binary tag. If you want to modify the reduced count in place without triggering the permanent conversion - * internally, use the #setReducedCount() method. - * - * @param counts the count array - */ - public void setReducedReadCountsTag(final int[] counts) { - setAttribute(REDUCED_READ_CONSENSUS_TAG, encodeReduceReadCounts(counts)); - retrievedReduceReadCounts = false; // need to force new decode in case we had to handle precision problems with the counts - } - - /** - * @see #setReducedReadCountsTag() and uses the currently stored values of the internal array. - * Useful if you've been using #setReducedCount() to modify the reduced count and now want to trigger the expensive conversion. 
- */ - public void setReducedReadCountsTag() { - if ( !retrievedReduceReadCounts ) - throw new IllegalStateException("Trying to write the reduced reads counts using an uninitialized internal array of counts"); - setReducedReadCountsTag(reducedReadCounts); - } - - /** - * Sets the reduced read count corresponding the i'th base of the reduced read. - * - * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion - * and push that value into the read's binary tags, use #setReducedReadCountsTag(). - * - * @param i the read based coordinate inside the read - * @param count the new count - */ - public final void setReducedCount(final int i, final int count) { - if ( count < 0 ) - throw new IllegalArgumentException("the reduced count cannot be set to a negative value"); - if ( !isReducedRead() ) - throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); - if ( i < 0 || i >= getReadBases().length ) - throw new IllegalArgumentException("illegal offset used when setting the reduced count: " + i); - - // force the initialization of the counts array if it hasn't happened yet - getReducedReadCounts()[i] = count; - } - - /** - * Set the reduced read counts tag for this record to counts - * - * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion - * and push that value into the read's binary tags, use #setReducedReadCountsTag(). - * - * @param counts the count array - */ - public void setReducedReadCounts(final int[] counts) { - if ( counts.length != getReadBases().length ) - throw new IllegalArgumentException("Reduced counts length " + counts.length + " != bases length " + getReadBases().length); - retrievedReduceReadCounts = true; - reducedReadCounts = counts; - } - - /** - * Sets the number of bases corresponding the i'th base of the reduced read. 
- * - * WARNING: does not actually write this value permanently to the binary tags for this read. To trigger the conversion - * and push that value into the read's binary tags, use #setReducedReadCountsTag(). - * - * @param i the read based coordinate inside the read - * @param adjustmentFactor how much to add/subtract to the current count - */ - public final void adjustReducedCount(final int i, final int adjustmentFactor) { - if ( !isReducedRead() ) - throw new IllegalArgumentException("error trying to set the reduced count for a read that is not reduced"); - if ( i < 0 || i >= getReadBases().length ) - throw new IllegalArgumentException("illegal offset used when setting the reduced count: " + i); - - setReducedCount(i, getReducedReadCounts()[i] + adjustmentFactor); - } - - /** - * Actually decode the consensus tag of a reduce read, returning a newly allocated - * set of values countsFromTag to be the real depth of cover at each base of the reduced read. - * - * for example, if the tag contains [10, 5, -1, -5], after running this function the - * byte[] will contain the true counts [10, 15, 9, 5]. - * - * as one might expect. 
- * - * @param countsFromTag a non-null byte[] containing the tag encoded reduce reads counts - * @return a non-null int[] containing the true depth values for the vector - */ - protected static int[] decodeReduceReadCounts(final byte[] countsFromTag) { - final int n = countsFromTag.length; - final int[] result = new int[n]; - final int firstCount = countsFromTag[0] & 0xff; // unsigned byte - result[0] = firstCount; - for ( int i = 1; i < n; i++ ) { - final int offsetCount = countsFromTag[i] & 0xff; // unsigned byte - result[i] = (firstCount + offsetCount) % 256; - } - - return result; - } - - /** - * Converts int array from straight counts to the appropriate reduce reads representation in BAM (offset from first value) - * - * @param counts the counts array - * @return non-null converted byte array - */ - protected static byte[] encodeReduceReadCounts(final int[] counts) { - if ( counts.length == 0 ) - throw new IllegalArgumentException("Trying to write a reduced read with a counts array of length 0"); - - final byte[] compressedCountsArray = new byte[counts.length]; - final int firstCount = (int) MathUtils.bound(counts[0], 0, 255); // we want an unsigned byte capped at max byte representation - compressedCountsArray[0] = (byte)firstCount; - for ( int i = 1; i < counts.length; i++ ) { - final int count = (int) MathUtils.bound(counts[i], 0, 255); - final byte offset = (byte)(count - firstCount + (count >= firstCount ? 
0 : 256)); // unsigned byte - compressedCountsArray[i] = offset; - } - - return compressedCountsArray; - } - /////////////////////////////////////////////////////////////////////////////// // *** GATKSAMRecord specific methods ***// /////////////////////////////////////////////////////////////////////////////// @@ -682,11 +491,7 @@ public class GATKSAMRecord extends BAMRecord { * @return the alignment start of a read before it was clipped */ public int getOriginalAlignmentStart() { - int originalAlignmentStart = getUnclippedStart(); - Integer alignmentShift = (Integer) getAttribute(REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT); - if (alignmentShift != null) - originalAlignmentStart += alignmentShift; - return originalAlignmentStart; + return getUnclippedStart(); } /** @@ -697,11 +502,7 @@ public class GATKSAMRecord extends BAMRecord { * @return the alignment end of a read before it was clipped */ public int getOriginalAlignmentEnd() { - int originalAlignmentEnd = getUnclippedEnd(); - Integer alignmentShift = (Integer) getAttribute(REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT); - if (alignmentShift != null) - originalAlignmentEnd -= alignmentShift; - return originalAlignmentEnd; + return getUnclippedEnd(); } /** @@ -735,7 +536,6 @@ public class GATKSAMRecord extends BAMRecord { emptyRead.setCigarString(""); emptyRead.setReadBases(new byte[0]); emptyRead.setBaseQualities(new byte[0]); - if ( read.isReducedRead() ) emptyRead.setReducedReadCounts(new int[0]); SAMReadGroupRecord samRG = read.getReadGroup(); emptyRead.clearAttributes(); diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUtils.java index a512e9ca9..2b6654bcd 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -57,15 +57,6 @@ public class ReadUtils 
{ private static final int DEFAULT_ADAPTOR_SIZE = 100; public static final int CLIPPING_GOAL_NOT_REACHED = -1; - public static int getMeanRepresentativeReadCount(GATKSAMRecord read) { - if (!read.isReducedRead()) - return 1; - - // compute mean representative read counts - final int[] counts = read.getReducedReadCounts(); - return (int)Math.round((double)MathUtils.sum(counts)/counts.length); - } - /** * A marker to tell which end of the read has been clipped */ @@ -695,8 +686,7 @@ public class ReadUtils { case D: for (int i = 0; i < cigarElement.getLength(); i++) { if (refLocation >= startLocation && refLocation <= stopLocation) { - int baseCount = read.isReducedRead() ? read.getReducedCount(refLocation - read.getSoftStart()) : 1; - coverage[refLocation - startLocation] += baseCount; // this may be a reduced read, so add the proper number of bases + coverage[refLocation - startLocation]++; } refLocation++; } diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index 52285fb2e..0c53de307 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -210,4 +210,21 @@ public class SAMDataSourceUnitTest extends BaseTest { List doRemoveProgramRecords = data.getHeader().getProgramRecords(); assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true"); } + + @Test(expectedExceptions = UserException.class) + public void testFailOnReducedReads() { + readers.add(new SAMReaderID(new File(privateTestDir + "old.reduced.bam"), new Tags())); + + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + 
false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + } } diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java index 6e908a3bf..c587d5e08 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -124,16 +124,10 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - for ( final int originalNormalCount : Arrays.asList(0, 1, 2, 10, 1000) ) { - for ( final int originalReducedCount : Arrays.asList(0, 1, 2, 10, 100) ) { - for ( final int indexToPutReducedRead : Arrays.asList(0, 2, originalNormalCount) ) { - if ( originalReducedCount == 0 || indexToPutReducedRead > originalNormalCount ) - continue; - for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) { - if ( toRemove <= originalNormalCount + originalReducedCount ) - tests.add(new Object[]{header, originalNormalCount, originalReducedCount, indexToPutReducedRead, toRemove}); - } - } + for ( final int originalCount : Arrays.asList(1, 2, 10, 1000) ) { + for ( final int toRemove : Arrays.asList(0, 1, 2, 10, 1000) ) { + if ( toRemove <= originalCount ) + tests.add(new Object[]{header, originalCount, toRemove}); } } @@ -141,27 +135,17 @@ public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { } @Test(dataProvider = "BiasedDownsamplingTest") - public void testBiasedDownsampling(final SAMFileHeader header, final int originalNormalCount, final int originalReducedCount, final int indexToPutReducedRead, final int toRemove) 
{ + public void testBiasedDownsampling(final SAMFileHeader header, final int originalCount, final int toRemove) { - final LinkedList elements = new LinkedList(); - for ( int i = 0; i < originalNormalCount; i++ ) { + final LinkedList elements = new LinkedList<>(); + for ( int i = 0; i < originalCount; i++ ) { final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); elements.add(new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); } - if ( originalReducedCount > 0 ) { - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, 1, 1); - read.setReducedReadCountsTag(new int[]{originalReducedCount}); - elements.add(indexToPutReducedRead, new PileupElement(read, 0, new CigarElement(1, CigarOperator.M), 0, 0)); - } - final List result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalNormalCount + originalReducedCount, toRemove); - int pileupCount = 0; - for ( final PileupElement pe : elements ) // reduced reads may have gotten modified - pileupCount += pe.getRepresentativeCount(); - for ( final PileupElement pe : result ) - pileupCount -= pe.getRepresentativeCount(); + final List result = AlleleBiasedDownsamplingUtils.downsampleElements(elements, originalCount, toRemove); - Assert.assertEquals(pileupCount, originalNormalCount + originalReducedCount - toRemove); + Assert.assertEquals(result.size(), toRemove); } @Test diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java index 8f0eee069..4fd9e491c 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java @@ -30,7 +30,6 @@ import 
net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; @@ -156,36 +155,4 @@ public class FractionalDownsamplerUnitTest extends BaseTest { downsampler.resetStats(); Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); } - - @Test - public void testDoNotDiscardReducedReads() { - GenomeAnalysisEngine.resetRandomGenerator(); - final ReadsDownsampler downsampler = new FractionalDownsampler(0.0); - - final Collection reads = new ArrayList(); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - final int[] baseCounts = { 10, 10, 10, 10, 10 }; - - for ( int i = 1; i <= 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, 1, 5, baseCounts)); - } - for ( int i = 1; i <= 5; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5)); - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 5, "wrong number of items discarded by the downsampler"); - Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); - Assert.assertEquals(downsampler.size(), 10, "downsampler size() reports wrong number of items"); - - final Collection readsReturned = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(readsReturned.size(), 10, "wrong number of items returned by the downsampler"); - - for ( GATKSAMRecord readReturned : readsReturned ) { - Assert.assertTrue(readReturned.isReducedRead(), "non-reduced read survived the downsampling process, but shouldn't have"); - } - } } diff --git 
a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java index 8cf0fd2a1..07a8a7975 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java @@ -25,12 +25,8 @@ package org.broadinstitute.sting.gatk.downsampling; -import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.annotations.Test; import org.testng.annotations.DataProvider; import org.testng.Assert; @@ -164,41 +160,4 @@ public class LevelingDownsamplerUnitTest extends BaseTest { Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); } - - @Test - public void testDoNotDiscardReducedReads() { - GenomeAnalysisEngine.resetRandomGenerator(); - final Downsampler> downsampler = new LevelingDownsampler, AlignmentStateMachine>(1); - - final Collection> groups = new LinkedList>(); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - final int[] baseCounts = { 10, 10, 10, 10, 10 }; - - for ( int alignmentStart : Arrays.asList(1, 2, 3) ) { - final LinkedList group = new LinkedList(); - for ( int i = 1; i <= 10; i++ ) { - group.add(new AlignmentStateMachine(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, alignmentStart, 5, baseCounts))); - } - groups.add(group); - } - - downsampler.submit(groups); - downsampler.signalEndOfInput(); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 
0, "wrong number of items discarded by the downsampler"); - Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); - Assert.assertEquals(downsampler.size(), 30, "downsampler size() reports wrong number of items"); - - final Collection> groupsReturned = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(groupsReturned.size(), 3, "wrong number of groups returned by the downsampler"); - - for ( LinkedList group : groupsReturned ) { - Assert.assertEquals(group.size(), 10, "group has wrong size after downsampling"); - - for ( AlignmentStateMachine state : group ) { - Assert.assertTrue(state.isReducedRead()); - } - } - } } diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java index a50201efd..66abfd29b 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java @@ -30,7 +30,6 @@ import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; @@ -129,46 +128,4 @@ public class ReservoirDownsamplerUnitTest extends BaseTest { downsampler.resetStats(); Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); } - - @Test - public void testDoNotDiscardReducedReads() { - GenomeAnalysisEngine.resetRandomGenerator(); - final ReadsDownsampler downsampler = new ReservoirDownsampler(1); - - final Collection reads = new ArrayList(); - final SAMFileHeader 
header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - final int[] baseCounts = { 10, 10, 10, 10, 10 }; - - for ( int i = 1; i <= 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, 1, 5, baseCounts)); - } - for ( int i = 1; i <= 5; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 1, 5)); - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 4, "wrong number of items discarded by the downsampler"); - Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); - Assert.assertEquals(downsampler.size(), 11, "downsampler size() reports wrong number of items"); - - final Collection readsReturned = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(readsReturned.size(), 11, "wrong number of items returned by the downsampler"); - - int numReducedReadsReturned = 0; - int numNormalReadsReturned = 0; - for ( GATKSAMRecord readReturned : readsReturned ) { - if ( readReturned.isReducedRead() ) { - numReducedReadsReturned++; - } - else { - numNormalReadsReturned++; - } - } - - Assert.assertEquals(numReducedReadsReturned, 10, "wrong number of reduced reads returned by the downsampler"); - Assert.assertEquals(numNormalReadsReturned, 1, "wrong number of non-reduced reads returned by the downsampler"); - } } diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java index bec0030d0..afe8729c2 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java @@ -328,48 +328,4 @@ public 
class SimplePositionalDownsamplerUnitTest extends BaseTest { Assert.assertEquals(downsampledReads.size(), 10); } - - @Test - public void testDoNotDiscardReducedReads() { - GenomeAnalysisEngine.resetRandomGenerator(); - final ReadsDownsampler downsampler = new SimplePositionalDownsampler(1); - - final Collection reads = new ArrayList(); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - final int[] baseCounts = { 10, 10, 10, 10, 10 }; - - for ( int alignmentStart : Arrays.asList(1, 2, 3) ) { - for ( int i = 1; i <= 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, alignmentStart, 5, baseCounts)); - } - for ( int i = 1; i <= 5; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, alignmentStart, 5)); - } - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - - Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 12, "wrong number of items discarded by the downsampler"); - Assert.assertTrue(downsampler.hasFinalizedItems(), "downsampler should have finalized items but doesn't"); - Assert.assertEquals(downsampler.size(), 33, "downsampler size() reports wrong number of items"); - - final Collection readsReturned = downsampler.consumeFinalizedItems(); - - Assert.assertEquals(readsReturned.size(), 33, "wrong number of items returned by the downsampler"); - - int numReducedReadsReturned = 0; - int numNormalReadsReturned = 0; - for ( GATKSAMRecord readReturned : readsReturned ) { - if ( readReturned.isReducedRead() ) { - numReducedReadsReturned++; - } - else { - numNormalReadsReturned++; - } - } - - Assert.assertEquals(numReducedReadsReturned, 30, "wrong number of reduced reads returned by the downsampler"); - Assert.assertEquals(numNormalReadsReturned, 3, "wrong number of non-reduced reads returned by the downsampler"); - } } diff --git 
a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java index 4d85997b3..62f4bdc88 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/traversals/TAROrderedReadCacheUnitTest.java @@ -26,11 +26,9 @@ package org.broadinstitute.sting.gatk.traversals; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialBAMBuilder; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.BeforeClass; @@ -41,7 +39,6 @@ import java.io.File; import java.io.FileNotFoundException; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collection; import java.util.List; public class TAROrderedReadCacheUnitTest extends BaseTest { @@ -104,47 +101,6 @@ public class TAROrderedReadCacheUnitTest extends BaseTest { verifySortednessOfReads(cacheReads); } - @Test - public void testReadCacheWithReducedReads() { - final List reads = new ArrayList(); - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - final int[] baseCounts = { 10, 10, 10, 10, 10 }; - - for ( int i = 1; i <= 100; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialReducedRead(header, "foo", 0, i, 5, baseCounts)); - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, i, 5)); - } - - final TAROrderedReadCache cache = new TAROrderedReadCache(50); - - cache.addAll(reads); - - // Our cache should have kept all of the reduced reads 
(which are retained unconditionally and do not count - // towards the capacity limit), and discarded half of the 100 non-reduced reads due to the cache capacity - // limit of 50. - Assert.assertEquals(cache.size(), 150, "wrong number of reads in the cache at the end"); - Assert.assertEquals(cache.getNumDiscarded(), 50, "wrong number of reads discarded from the cache"); - - final List cacheReads = cache.popCurrentReads(); - - int numReducedReadsRetained = 0; - int numNormalReadsRetained = 0; - - for ( GATKSAMRecord read : cacheReads ) { - if ( read.isReducedRead() ) { - numReducedReadsRetained++; - } - else { - numNormalReadsRetained++; - } - } - - Assert.assertEquals(numReducedReadsRetained, 100, "wrong number of reduced reads retained in the cache"); - Assert.assertEquals(numNormalReadsRetained, 50, "wrong number of non-reduced reads retained in the cache"); - - verifySortednessOfReads(cacheReads); - } - private void verifySortednessOfReads( final List reads) { int lastStart = -1; for ( GATKSAMRecord read : reads ) { diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java index 336c15ccc..5392e8037 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociIntegrationTest.java @@ -67,13 +67,4 @@ public class CallableLociIntegrationTest extends WalkerTest { Arrays.asList("7f79ad8195c4161060463eeb21d2bb11", "7ee269e5f4581a924529a356cc806e55")); executeTest("formatBed lots of arguments", spec); } - - @Test(enabled=true) - public void testWithReducedRead() { - String gatk_args = reduceReadArgs + " -L 20:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s"; - 
WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, - Arrays.asList("69fc303c888fd1fa2937b9518dc82f9e", "f512a85c373087ce03a24ab0f98522c0")); - executeTest("CallableLoci with ReducedRead", spec); - } - } diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java index cc008404c..cd12c3b9b 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/clipping/ReadClipperUnitTest.java @@ -411,22 +411,6 @@ public class ReadClipperUnitTest extends BaseTest { } - @Test(enabled = !DEBUG) - public void testHardClipReducedRead() { - GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("10M"); - final int[] counts = new int[read.getReadLength()]; - for ( int i = 0; i < counts.length; i++ ) counts[i] = i; - read.setReducedReadCounts(counts); - int alnStart = read.getAlignmentStart(); - int alnEnd = read.getAlignmentEnd(); - int readLength = read.getReadLength(); - for (int i = 0; i < readLength / 2; i++) { - GATKSAMRecord clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, alnStart + i, alnEnd - i); - final int[] expectedReducedCounts = Arrays.copyOfRange(counts, i + 1, readLength - i - 1); - Assert.assertEquals(clippedRead.getReducedReadCounts(), expectedReducedCounts); - } - } - @Test(enabled = !DEBUG) public void testRevertEntirelySoftclippedReads() { GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar("2H1S3H"); diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java index 888ab7f7f..39058233e 100644 --- 
a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/pileup/PileupElementUnitTest.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.locusiterator.AlignmentStateMachine; import org.broadinstitute.sting.utils.locusiterator.LIBS_position; @@ -126,7 +125,6 @@ public class PileupElementUnitTest extends LocusIteratorByStateBaseTest { // TODO -- add meaningful tests pe.getBaseInsertionQual(); pe.getBaseDeletionQual(); - pe.getRepresentativeCount(); } } diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index e9af685a6..837f3fa45 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -27,22 +27,15 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.testng.Assert; import org.testng.annotations.BeforeClass; -import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; -import java.util.List; - public class GATKSAMRecordUnitTest extends BaseTest { - GATKSAMRecord read, reducedRead; + GATKSAMRecord read; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; - final private static int[] REDUCED_READ_COUNTS = new int[]{10, 20, 30, 
40}; @BeforeClass public void init() { @@ -51,121 +44,6 @@ public class GATKSAMRecordUnitTest extends BaseTest { read.setReadUnmappedFlag(true); read.setReadBases(new String(BASES).getBytes()); read.setBaseQualityString(new String(QUALS)); - - reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); - reducedRead.setReadBases(BASES.getBytes()); - reducedRead.setBaseQualityString(QUALS); - reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); - } - - @Test - public void testReducedReads() { - reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); - - Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); - Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); - - Assert.assertTrue(reducedRead.isReducedRead(), "isReducedRead is true for reduced read"); - for (int i = 0; i < reducedRead.getReadLength(); i++) { - Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); - } - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testGetReducedCountOnNormalRead() { - read.getReducedCount(0); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testSetReducedTagOnNormalRead() { - read.setReducedCount(0, 2); - } - - @Test(expectedExceptions = IllegalArgumentException.class) - public void testAdjustReducedCountToNegativeNumber() { - reducedRead.setReducedCount(0, 1); - reducedRead.adjustReducedCount(0, -2); - } - - @Test - public void testSetReducedCountOnReducedRead() { - for (int i = 0; i < reducedRead.getReadLength(); i++) { - final byte newCount = (byte)i; - reducedRead.setReducedCount(i, newCount); - Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i); - } - - for (int i = 0; i < reducedRead.getReadLength(); i++) { - final int newCount = 
reducedRead.getReducedCount(i) + i; - reducedRead.adjustReducedCount(i, i); - Assert.assertEquals(reducedRead.getReducedCount(i), newCount, "Reduced read count not set to the expected value at " + i); - } - } - - @Test - public void testReducedReadEncodeAndDecode() { - - // encode - byte[] encoded = GATKSAMRecord.encodeReduceReadCounts(REDUCED_READ_COUNTS); - - // decode - int[] decoded = GATKSAMRecord.decodeReduceReadCounts(encoded); - - // for the heck of it, let's encode and decode again! - encoded = GATKSAMRecord.encodeReduceReadCounts(decoded); - decoded = GATKSAMRecord.decodeReduceReadCounts(encoded); - - for (int i = 0; i < decoded.length; i++) - Assert.assertEquals(decoded[i], REDUCED_READ_COUNTS[i]); - } - - @Test - public void testByteBoundsOnReducedTag() { - reducedRead.setReducedCount(0, 1000); - reducedRead.setReducedReadCountsTag(); - reducedRead.adjustReducedCount(0, -255); - Assert.assertEquals(reducedRead.getReducedCount(0), 0); - } - - @Test - public void testReducedReadPileupElement() { - reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); - - PileupElement readp = LocusIteratorByState.createPileupForReadAndOffset(read, 0); - PileupElement reducedreadp = LocusIteratorByState.createPileupForReadAndOffset(reducedRead, 0); - - Assert.assertFalse(readp.getRead().isReducedRead()); - - Assert.assertTrue(reducedreadp.getRead().isReducedRead()); - Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); - Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); - } - - @Test - public void testGetOriginalAlignments() { - final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; - final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; - GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); - - // A regular read with all matches - Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart()); - Assert.assertEquals(read.getAlignmentEnd(), 
read.getOriginalAlignmentEnd()); - - // Alignment start shifted - int alignmentShift = 2; - read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, alignmentShift); - Assert.assertEquals(read.getAlignmentStart() + alignmentShift, read.getOriginalAlignmentStart()); - Assert.assertEquals(read.getAlignmentEnd(), read.getOriginalAlignmentEnd()); - - // Both alignments shifted - read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, alignmentShift); - Assert.assertEquals(read.getAlignmentStart() + alignmentShift, read.getOriginalAlignmentStart()); - Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd()); - - // Alignment end shifted - read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, null); - Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart()); - Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd()); } @Test @@ -197,36 +75,4 @@ public class GATKSAMRecordUnitTest extends BaseTest { read.setIsStrandless(true); read.setReadNegativeStrandFlag(true); } - - @Test - public void testGetReducedCountsIsCorrect() { - reducedRead.setReducedReadCountsTag(REDUCED_READ_COUNTS); - final int[] counts = reducedRead.getReducedReadCounts(); - Assert.assertNotSame(counts, reducedRead.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG)); - for ( int i = 0; i < counts.length; i++ ) - Assert.assertEquals(counts[i], reducedRead.getReducedCount(i), "Reduced counts vector not equal to getReducedCount(i) at " + i); - } - - @DataProvider(name = "ReducedReadCountConversionProvider") - public Object[][] ReducedReadCountConversionTestData() { - List tests = new ArrayList(); - - tests.add(new Object[]{new int[] {100, 100, 100, 101}, new byte[] {100, 0, 0, 1}}); - tests.add(new Object[]{new int[] {1, 100, 100, 0}, new byte[] {1, 99, 99, -1}}); - tests.add(new Object[]{new int[] {127, 100, 0, 1}, new byte[] {127, -27, -127, 
-126}}); - tests.add(new Object[]{new int[] {1, 127, 51, 126}, new byte[] {1, 126, 50, 125}}); - tests.add(new Object[]{new int[] {300, 127, 1, 255}, new byte[] {-1, -128, 2, 0}}); - tests.add(new Object[]{new int[] {1, 300, 51, 126}, new byte[] {1, -2, 50, 125}}); - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = "ReducedReadCountConversionProvider", enabled = true) - public void reducedReadCountConversionTest(final int[] counts, final byte[] expectedConversion) { - - reducedRead.setReducedReadCountsTag(counts); - final byte[] actualConversion = reducedRead.getByteArrayAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG); - for ( int i = 0; i < actualConversion.length; i++ ) - Assert.assertEquals(actualConversion[i], expectedConversion[i], "Conversion differs at position " + i + ": " + actualConversion[i] + " vs. " + expectedConversion[i]); - } } From 6c872308d8640cbcbaa8977d468193f5fe041568 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 13 Feb 2014 10:50:53 -0500 Subject: [PATCH 07/18] Add the capability to the N-cigar splitter to also hard-clip off overhangs based on observed split positions. We use a "manager" to keep track of observed splits and previous reads. This can be extended/modified in the future to try to salvage those overhangs instead of hard-clipping them and/or try other possible strategies. Added unit tests and more integration tests. 
--- .../walkers/rnaseq/OverhangFixingManager.java | 380 ++++++++++++++++++ .../gatk/walkers/rnaseq/SplitNCigarReads.java | 232 +++++------ .../rnaseq/OverhangFixingManagerUnitTest.java | 172 ++++++++ ...a => SplitNCigarReadsIntegrationTest.java} | 41 +- .../rnaseq/SplitNCigarReadsUnitTest.java | 37 +- 5 files changed, 720 insertions(+), 142 deletions(-) create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManager.java create mode 100644 protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManagerUnitTest.java rename protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/{SplitNCigarReadsIntegrationTests.java => SplitNCigarReadsIntegrationTest.java} (79%) diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManager.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManager.java new file mode 100644 index 000000000..581a9e426 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManager.java @@ -0,0 +1,380 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.rnaseq; + +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMRecordCoordinateComparator; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; + +/** + * The class manages reads and splices and tries to apply overhang clipping when appropriate. + * Important note: although for efficiency the manager does try to send reads to the underlying writer in coordinate + * sorted order, it does NOT guarantee that it will do so in every case! So unless there's a good reason not to, + * methods that instantiate this manager should pass in a writer that does not assume the reads are pre-sorted. + */ +public class OverhangFixingManager { + + protected static final Logger logger = Logger.getLogger(OverhangFixingManager.class); + private static final boolean DEBUG = false; + + // how many reads should we store in memory before flushing the queue? 
+ private final int MAX_RECORDS_IN_MEMORY; + + // how many mismatches do we tolerate in the overhangs? + private final int MAX_MISMATCHES_IN_OVERHANG; + + // how many bases do we tolerate in the overhang before deciding not to clip? + private final int MAX_BASES_IN_OVERHANG; + + // should we not bother fixing overhangs? + private final boolean doNotFixOverhangs; + + // where we ultimately write out our records + private final SAMFileWriter writer; + + // fasta reference reader to check overhanging edges in the exome reference sequence + private final CachingIndexedFastaSequenceFile referenceReader; + + // the genome loc parser + private final GenomeLocParser genomeLocParser; + + // the read cache + private final static int initialCapacity = 5000; + private PriorityQueue waitingReads = new PriorityQueue<>(initialCapacity, new SplitReadComparator()); + + // the set of current splices to use + private final Set splices = new TreeSet<>(new SpliceComparator()); + + protected static final int MAX_SPLICES_TO_KEEP = 1000; + + + /** + * + * @param writer actual writer + * @param genomeLocParser the GenomeLocParser object + * @param referenceReader the reference reader + * @param maxRecordsInMemory max records to keep in memory + * @param maxMismatchesInOverhangs max number of mismatches permitted in the overhangs before requiring clipping + * @param maxBasesInOverhangs max number of bases permitted in the overhangs before deciding not to clip + * @param doNotFixOverhangs if true, don't clip overhangs at all + */ + public OverhangFixingManager(final SAMFileWriter writer, + final GenomeLocParser genomeLocParser, + final CachingIndexedFastaSequenceFile referenceReader, + final int maxRecordsInMemory, + final int maxMismatchesInOverhangs, + final int maxBasesInOverhangs, + final boolean doNotFixOverhangs) { + this.writer = writer; + this.genomeLocParser = genomeLocParser; + this.referenceReader = referenceReader; + this.MAX_RECORDS_IN_MEMORY = maxRecordsInMemory; + 
this.MAX_MISMATCHES_IN_OVERHANG = maxMismatchesInOverhangs; + this.MAX_BASES_IN_OVERHANG = maxBasesInOverhangs; + this.doNotFixOverhangs = doNotFixOverhangs; + } + + public final int getNReadsInQueue() { return waitingReads.size(); } + + /** + * For testing purposes only + * + * @return the list of reads currently in the queue + */ + protected List getReadsInQueueForTesting() { + return new ArrayList<>(waitingReads); + } + + /** + * For testing purposes only + * + * @return the list of splices currently in the queue + */ + protected List getSplicesForTesting() { + return new ArrayList<>(splices); + } + + /** + * Add a new observed split to the list to use + * + * @param contig the contig + * @param start the start of the split, inclusive + * @param end the end of the split, inclusive + */ + public void addSplicePosition(final String contig, final int start, final int end) { + if ( doNotFixOverhangs ) + return; + + // is this a new splice? if not, we are done + final Splice splice = new Splice(contig, start, end); + if ( splices.contains(splice) ) + return; + + // initialize it with the reference context + // we don't want to do this until we know for sure that it's a new splice position + splice.initialize(referenceReader); + + // clear the set of old split positions seen if we hit a new contig + final boolean sameContig = splices.isEmpty() || splices.iterator().next().loc.getContig().equals(contig); + if ( !sameContig ) + splices.clear(); + + // run this position against the existing reads + for ( final SplitRead read : waitingReads ) + fixSplit(read, splice); + + splices.add(splice); + + if ( splices.size() > MAX_SPLICES_TO_KEEP ) + cleanSplices(); + } + + /** + * Add a read to the manager + * + * @param read the read to add + */ + public void addRead(final GATKSAMRecord read) { + if ( read == null ) throw new IllegalArgumentException("read added to manager is null, which is not allowed"); + + // if the new read is on a different contig or we have too many reads, 
then we need to flush the queue and clear the map + final boolean tooManyReads = getNReadsInQueue() >= MAX_RECORDS_IN_MEMORY; + final boolean encounteredNewContig = getNReadsInQueue() > 0 && !waitingReads.peek().read.getReferenceIndex().equals(read.getReferenceIndex()); + + if ( tooManyReads || encounteredNewContig ) { + if ( DEBUG ) logger.warn("Flushing queue on " + (tooManyReads ? "too many reads" : ("move to new contig: " + read.getReferenceName() + " from " + waitingReads.peek().read.getReferenceName())) + " at " + read.getAlignmentStart()); + + final int targetQueueSize = encounteredNewContig ? 0 : MAX_RECORDS_IN_MEMORY / 2; + + // write the required number of waiting reads to disk + while ( getNReadsInQueue() > targetQueueSize ) + writer.addAlignment(waitingReads.poll().read); + } + + final SplitRead splitRead = new SplitRead(read); + + // fix overhangs, as needed + for ( final Splice splice : splices) + fixSplit(splitRead, splice); + + // add the new read to the queue + waitingReads.add(splitRead); + } + + /** + * Clean up the list of splices + */ + private void cleanSplices() { + final int targetQueueSize = splices.size() / 2; + final Iterator iter = splices.iterator(); + for ( int i = 0; i < targetQueueSize; i++ ) { + iter.next(); + iter.remove(); + } + } + + /** + * Try to fix the given read using the given split + * + * @param read the read to fix + * @param splice the split (bad region to clip out) + */ + private void fixSplit(final SplitRead read, final Splice splice) { + // if the read doesn't even overlap the split position then we can just exit + if ( !splice.loc.overlapsP(read.loc) ) + return; + + if ( isLeftOverhang(read.loc, splice.loc) ) { + final int overhang = splice.loc.getStop() - read.loc.getStart() + 1; + if ( overhangingBasesMismatch(read.read.getReadBases(), 0, splice.reference, splice.reference.length - overhang, overhang) ) { + final GATKSAMRecord clippedRead = ReadClipper.hardClipByReadCoordinates(read.read, 0, overhang - 1); + 
read.setRead(clippedRead); + } + } + else if ( isRightOverhang(read.loc, splice.loc) ) { + final int overhang = read.loc.getStop() - splice.loc.getStart() + 1; + if ( overhangingBasesMismatch(read.read.getReadBases(), read.read.getReadLength() - overhang, splice.reference, 0, overhang) ) { + final GATKSAMRecord clippedRead = ReadClipper.hardClipByReadCoordinates(read.read, read.read.getReadLength() - overhang, read.read.getReadLength() - 1); + read.setRead(clippedRead); + } + } + } + + /** + * Is this a proper overhang on the left side of the read? + * + * @param readLoc the read's loc + * @param spliceLoc the split's loc + * @return true if it's a left side overhang + */ + protected static boolean isLeftOverhang(final GenomeLoc readLoc, final GenomeLoc spliceLoc) { + return readLoc.getStart() <= spliceLoc.getStop() && readLoc.getStart() > spliceLoc.getStart() && readLoc.getStop() > spliceLoc.getStop(); + } + + /** + * Is this a proper overhang on the right side of the read? + * + * @param readLoc the read's loc + * @param spliceLoc the split's loc + * @return true if it's a right side overhang + */ + protected static boolean isRightOverhang(final GenomeLoc readLoc, final GenomeLoc spliceLoc) { + return readLoc.getStop() >= spliceLoc.getStart() && readLoc.getStop() < spliceLoc.getStop() && readLoc.getStart() < spliceLoc.getStart(); + } + + /** + * Are there too many mismatches to the reference among the overhanging bases? 
+ * + * @param read the read bases + * @param readStartIndex where to start on the read + * @param reference the reference bases + * @param referenceStartIndex where to start on the reference + * @param spanToTest how many bases to test + * @return true if too many overhanging bases mismatch, false otherwise + */ + protected boolean overhangingBasesMismatch(final byte[] read, + final int readStartIndex, + final byte[] reference, + final int referenceStartIndex, + final int spanToTest) { + // don't process too small a span, too large a span, or a span that is most of a read + if ( spanToTest < 1 || spanToTest > MAX_BASES_IN_OVERHANG || spanToTest > read.length / 2 ) + return false; + + int numMismatchesSeen = 0; + for ( int i = 0; i < spanToTest; i++ ) { + if ( read[readStartIndex + i] != reference[referenceStartIndex + i] ) { + if ( ++numMismatchesSeen > MAX_MISMATCHES_IN_OVERHANG ) + return true; + } + } + + // we can still mismatch overall if at least half of the bases mismatch + return numMismatchesSeen >= ((spanToTest+1)/2); + } + + /** + * Close out the manager stream by clearing the read cache + */ + public void close() { + // write out all of the remaining reads + while ( ! 
waitingReads.isEmpty() ) + writer.addAlignment(waitingReads.poll().read); + } + + // class to represent the reads with their soft-clip-included GenomeLocs + protected final class SplitRead { + + public GATKSAMRecord read; + public GenomeLoc loc; + + public SplitRead(final GATKSAMRecord read) { + setRead(read); + } + + public void setRead(final GATKSAMRecord read) { + if ( !read.isEmpty() ) { + this.read = read; + loc = genomeLocParser.createGenomeLoc(read.getReferenceName(), read.getSoftStart(), read.getSoftEnd()); + } + } + } + + // class to represent the comparator for the split reads + private final class SplitReadComparator implements Comparator { + + private final SAMRecordCoordinateComparator readComparator; + + public SplitReadComparator() { + readComparator = new SAMRecordCoordinateComparator(); + } + + public int compare(final SplitRead read1, final SplitRead read2) { + return readComparator.compare(read1.read, read2.read); + } + } + + // class to represent the split positions + protected final class Splice { + + public final GenomeLoc loc; + public byte[] reference; + + public Splice(final String contig, final int start, final int end) { + loc = genomeLocParser.createGenomeLoc(contig, start, end); + } + + public void initialize(final CachingIndexedFastaSequenceFile referenceReader) { + reference = referenceReader.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); + } + + @Override + public boolean equals(final Object other) { + return other != null && (other instanceof Splice) && this.loc.equals(((Splice)other).loc); + } + + @Override + public int hashCode() { + return loc.hashCode(); + } + } + + // class to represent the comparator for the split reads + private final class SpliceComparator implements Comparator { + + public int compare(final Splice position1, final Splice position2) { + return position1.loc.compareTo(position2.loc); + } + } +} diff --git 
a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java index 223745431..6b9fca312 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReads.java @@ -47,30 +47,28 @@ package org.broadinstitute.sting.gatk.walkers.rnaseq; import net.sf.samtools.*; +import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.iterators.ReadTransformer; -import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.BAQMode; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.sam.CigarUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import 
java.io.PrintStream; -import java.util.*; +import java.io.FileNotFoundException; /** * @@ -88,119 +86,138 @@ import java.util.*; @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class SplitNCigarReads extends ReadWalker, SAMFileWriter> { +public class SplitNCigarReads extends ReadWalker { + + // The name that will go in the @PG tag + public static final String PROGRAM_RECORD_NAME = "GATK SplitNCigarReads"; + @Output(doc="Write output to this BAM filename instead of STDOUT") - StingSAMFileWriter out; - - @Argument(required = false) - PrintStream splitPositionsOutput = System.out; - - @Argument(fullName="outputAsBED", shortName="bed", doc="Output as BED file", required=false) - boolean outputAsBED = false; - - @Argument(fullName="printSplitPositions", shortName="splitPosition", doc="print the split positions", required=false) - boolean printSplitPositions = false; - - public static final String PROGRAM_RECORD_NAME = "GATK SplitNCigarReads"; // The name that will go in the @PG tag - // public static SplitPositions splitPositions = null; - public static String results = ""; + protected StingSAMFileWriter writer; /** - * The initialize function. + * For expert users only! To minimize memory consumption you can lower this number, but then the tool may skip + * overhang fixing in regions with too much coverage. Just make sure to give Java enough memory! 4Gb should be + * enough with the default value. */ + @Advanced + @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the BAM writer", required=false) + protected int MAX_RECORDS_IN_MEMORY = 150000; + + /** + * If there are more than this many mismatches within the overhang regions, the whole overhang will get hard-clipped out. 
+ * It is still possible in some cases that the overhang could get clipped even if the number of mismatches does not exceed this + * value, e.g. if at least half of the overhanging bases mismatch. + */ + @Advanced + @Argument(fullName="maxMismatchesInOverhang", shortName="maxMismatches", doc="max number of mismatches allowed in the overhang", required=false) + protected int MAX_MISMATCHES_IN_OVERHANG = 1; + + /** + * If there are more than this many bases in the overhang, we won't try to hard-clip them out + */ + @Advanced + @Argument(fullName="maxBasesInOverhang", shortName="maxOverhang", doc="max number of bases allowed in the overhang", required=false) + protected int MAX_BASES_TO_CLIP = 40; + + @Argument(fullName="doNotFixOverhangs", shortName="doNotFixOverhangs", doc="do not have the walker hard-clip overhanging sections of the reads", required=false) + protected boolean doNotFixOverhangs = false; + + @Hidden + @Argument(fullName = "no_pg_tag", shortName = "npt", doc = "Necessary for integration tests", required = false) + protected boolean NO_PG_TAG = false; + + /** + * This stores all of the already-split reads and manages any processing (e.g. clipping overhangs) that happens to them. + * It will emit reads to the underlying writer as needed so we don't need to worry about any of that in this class.
+ */ + protected OverhangFixingManager overhangManager; + + + @Override public void initialize() { final GenomeAnalysisEngine toolkit = getToolkit(); - final boolean preSorted = false; - if (getToolkit() != null) { - Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), preSorted, this, PROGRAM_RECORD_NAME); + if ( !NO_PG_TAG ) { + // we don't want to assume that reads will be written in order by the manager because in deep, deep pileups it won't work + Utils.setupWriter(writer, toolkit, toolkit.getSAMFileHeader(), false, this, PROGRAM_RECORD_NAME); + } + + try { + final CachingIndexedFastaSequenceFile referenceReader = new CachingIndexedFastaSequenceFile(toolkit.getArguments().referenceFile); + overhangManager = new OverhangFixingManager(writer, toolkit.getGenomeLocParser(), referenceReader, MAX_RECORDS_IN_MEMORY, MAX_MISMATCHES_IN_OVERHANG, MAX_BASES_TO_CLIP, doNotFixOverhangs); + } + catch (FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(toolkit.getArguments().referenceFile, ex); } - out.setPresorted(preSorted); - // splitPositions = new SplitPositions(); } - /** - * The reads map function. - * - * @param ref the reference bases that correspond to our read, if a reference was provided - * @param read the read itself, as a GATKSAMRecord - * - * @return a list of split read if there are N's in the cigar string, or the read itself. 
- */ - public List map(final ReferenceContext ref,final GATKSAMRecord read,final RefMetaDataTracker metaDataTracker) { - return splitNCigarRead(read); + @Override + public GATKSAMRecord map(final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker) { + return read; + } + + @Override + public OverhangFixingManager reduceInit() { + return overhangManager; + } + + @Override + public OverhangFixingManager reduce(final GATKSAMRecord read, final OverhangFixingManager manager) { + splitNCigarRead(read, manager); + return manager; + } + + @Override + public void onTraversalDone(final OverhangFixingManager manager) { + manager.close(); } /** * Goes through the cigar string of the read and create new reads for each consecutive non-N elements (while hard clipping the rest of the read). * For example: for a read with cigar '1H2M2D1M2N1M2I1N1M2S' 3 new reads will be created with cigar strings: 1H2M2D1M, 1M2I and 1M2S * - * @param read the read to split, as a GATKSAMRecord - * @return a list of split read if there are N's in the cigar string, or the read itself. 
+ * @param read the read to split + * @param manager the output manager */ + public static void splitNCigarRead(final GATKSAMRecord read, final OverhangFixingManager manager) { + final int numCigarElements = read.getCigar().numCigarElements(); - public static List splitNCigarRead(final GATKSAMRecord read){ - final List splitReads = new ArrayList<>(); int firstCigarIndex = 0; - for (int i = 0; i < read.getCigar().numCigarElements(); i ++){ + for ( int i = 0; i < numCigarElements; i++ ) { final CigarElement cigarElement = read.getCigar().getCigarElement(i); - if(cigarElement.getOperator() == CigarOperator.N){ - final boolean addToSplitPositions = true; - splitReads.add(splitReadBasedOnCigar(read,firstCigarIndex,i, addToSplitPositions)); + if (cigarElement.getOperator() == CigarOperator.N) { + manager.addRead(splitReadBasedOnCigar(read, firstCigarIndex, i, manager)); firstCigarIndex = i+1; } } - //if there are no N's in the read - if (firstCigarIndex == 0) - splitReads.add(read); + // if there are no N's in the read + if (firstCigarIndex == 0) { + manager.addRead(read); + } //add the last section of the read: from the last N to the the end of the read // (it will be done for all the usual cigar string that does not end with N) - else if(firstCigarIndex < read.getCigar().numCigarElements()){ - final boolean addToSplitPositions = false; - splitReads.add(splitReadBasedOnCigar(read,firstCigarIndex,read.getCigar().numCigarElements(), addToSplitPositions)); + else if (firstCigarIndex < numCigarElements) { + manager.addRead(splitReadBasedOnCigar(read, firstCigarIndex, numCigarElements, null)); } - return splitReads; - } - - - /** - * reduceInit is called once before any calls to the map function. 
We use it here to setup the splitPositionsOutput - * bam file, if it was specified on the command line - * - * @return SAMFileWriter, set to the BAM splitPositionsOutput file if the command line option was set, null otherwise - */ - public SAMFileWriter reduceInit() { - return out; } /** - * given a read and a splitPositionsOutput location, reduce by emitting the read + * Pull out an individual split position for a read * - * @param reads the split reads itself - * @param output the splitPositionsOutput source - * @return the SAMFileWriter, so that the next reduce can emit to the same source + * @param read the read being split + * @param cigarStartIndex the index of the first cigar element to keep + * @param cigarEndIndex the index of the last cigar element to keep + * @param forSplitPositions the manager for keeping track of split positions; can be null + * @return a non-null read representing the section of the original read being split out */ - public SAMFileWriter reduce(final List reads,final SAMFileWriter output ) { - for (final GATKSAMRecord read: reads) - output.addAlignment(read); - return output; - } - - public void onTraversalDone(SAMFileWriter readResult) { - super.onTraversalDone(readResult); - if(printSplitPositions) - splitPositionsOutput.println(results); - // splitPositionsOutput.println(splitPositions); - - } - private static GATKSAMRecord splitReadBasedOnCigar(final GATKSAMRecord read, final int cigarStartIndex, final int cigarEndIndex, final boolean addToSplitPositions){ + private static GATKSAMRecord splitReadBasedOnCigar(final GATKSAMRecord read, final int cigarStartIndex, final int cigarEndIndex, final OverhangFixingManager forSplitPositions) { int cigarFirstIndex = cigarStartIndex; int cigarSecondIndex = cigarEndIndex; - //in case a section of the read is end or start with D (for example the first section in 1M1D1N1M is 1M1D), we should trim this cigar element - // it can be if, but it was kept as while to make sure the code can work with 
Cigar string that were not "cleaned" + //in case a section of the read ends or starts with D (for example the first section in 1M1D1N1M is 1M1D), we should trim this cigar element + // it can be 'if', but it was kept as 'while' to make sure the code can work with Cigar strings that were not "cleaned" while(read.getCigar().getCigarElement(cigarFirstIndex).getOperator().equals(CigarOperator.D)) cigarFirstIndex++; while(read.getCigar().getCigarElement(cigarSecondIndex-1).getOperator().equals(CigarOperator.D)) @@ -213,54 +230,13 @@ public class SplitNCigarReads extends ReadWalker, SAMFileWri final int startRefIndex = read.getOriginalAlignmentStart() + CigarUtils.countRefBasesBasedOnCigar(read,0,cigarFirstIndex); //goes through the prefix of the cigar (up to cigarStartIndex) and move the reference index. final int stopRefIndex = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarSecondIndex)-1; //goes through a consecutive non-N section of the cigar (up to cigarEndIndex) and move the reference index. - if(addToSplitPositions){ - final int splitPosition = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarEndIndex); //we use cigarEndIndex instead of cigarSecondIndex so we won't take into account the D's at the end. + if ( forSplitPositions != null ) { final String contig = read.getReferenceName(); -// results += String.format("%s:%d-%d\n", contig, splitPosition, splitPosition ); -// splitPositions.addSplitPosition(contig,splitPosition); + final int splitStart = startRefIndex + CigarUtils.countRefBasesBasedOnCigar(read,cigarFirstIndex,cigarEndIndex); //we use cigarEndIndex instead of cigarSecondIndex so we won't take into account the D's at the end. 
+ final int splitEnd = splitStart + read.getCigar().getCigarElement(cigarEndIndex).getLength() - 1; + forSplitPositions.addSplicePosition(contig, splitStart, splitEnd); } return ReadClipper.hardClipToRegionIncludingClippedBases(read, startRefIndex, stopRefIndex); - } - - private class SplitPosition { - public final String contig; - public final int start; - public final int end; - - public SplitPosition(final String c, final int position) { - contig = c; - start = position; - end = position; - } - } - - - private class SplitPositions { - private final HashSet splitPositions; - - public SplitPositions() { - splitPositions = new HashSet<>(); - } - - public void addSplitPosition(final String contig, final int position) { - final SplitPosition newSplitPosition = new SplitPosition(contig, position); - splitPositions.add(newSplitPosition); - } - - public String toString() { - String result = ""; // = "Contig\tstart\tstop\n"; - - for (SplitPosition position: splitPositions) { - if (outputAsBED) - result += String.format("%s\t%d\t%d\n", position.contig, position.start-1, position.end ); - else - result += String.format("%s:%d-%d\n", position.contig, position.start, position.end ); - } - return result; - } - } - - } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManagerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManagerUnitTest.java new file mode 100644 index 000000000..62f6bcfbd --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/OverhangFixingManagerUnitTest.java @@ -0,0 +1,172 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. 
with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.rnaseq; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class OverhangFixingManagerUnitTest extends BaseTest { + + private CachingIndexedFastaSequenceFile referenceReader; + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void setup() throws FileNotFoundException { + referenceReader = new CachingIndexedFastaSequenceFile(new File(b37KGReference)); + genomeLocParser = new GenomeLocParser(referenceReader.getSequenceDictionary()); + } + + @Test + public void testCleanSplices() { + + final OverhangFixingManager manager = new OverhangFixingManager(null, genomeLocParser, referenceReader, 10000, 1, 40, false); + + final int offset = 10; + for ( int i = 0; i < OverhangFixingManager.MAX_SPLICES_TO_KEEP + 1; i++ ) + 
manager.addSplicePosition("20", offset + i, offset + 1 + i); + + final List splices = manager.getSplicesForTesting(); + + Assert.assertEquals(splices.size(), (OverhangFixingManager.MAX_SPLICES_TO_KEEP / 2) + 1); + + final int minStartPos = (OverhangFixingManager.MAX_SPLICES_TO_KEEP / 2) + offset; + + for ( final OverhangFixingManager.Splice splice : splices ) + Assert.assertTrue(splice.loc.getStart() >= minStartPos); + } + + @DataProvider(name = "OverhangTest") + public Object[][] makeOverhangData() { + final List tests = new ArrayList<>(); + for ( int leftRead : Arrays.asList(10, 20, 30, 40) ) { + for ( int rightRead : Arrays.asList(20, 30, 40, 50) ) { + if ( leftRead >= rightRead ) + continue; + for ( int leftSplice : Arrays.asList(10, 20, 30) ) { + for ( int rightSplice : Arrays.asList(20, 30, 40) ) { + if ( leftSplice >= rightSplice ) + continue; + + final GenomeLoc readLoc = genomeLocParser.createGenomeLoc("1", leftRead, rightRead); + final GenomeLoc spliceLoc = genomeLocParser.createGenomeLoc("1", leftSplice, rightSplice); + tests.add(new Object[]{readLoc, spliceLoc}); + } + } + } + } + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "OverhangTest") + public void testLeftOverhangs(final GenomeLoc readLoc, final GenomeLoc spliceLoc) { + final boolean isValidOverhang = readLoc.getStart() <= spliceLoc.getStop() && + readLoc.getStop() > spliceLoc.getStop() && + readLoc.getStart() > spliceLoc.getStart(); + Assert.assertEquals(OverhangFixingManager.isLeftOverhang(readLoc, spliceLoc), isValidOverhang, readLoc + " vs. " + spliceLoc); + } + + @Test(dataProvider = "OverhangTest") + public void testRightOverhangs(final GenomeLoc readLoc, final GenomeLoc spliceLoc) { + final boolean isValidOverhang = readLoc.getStop() >= spliceLoc.getStart() && + readLoc.getStop() < spliceLoc.getStop() && + readLoc.getStart() < spliceLoc.getStart(); + Assert.assertEquals(OverhangFixingManager.isRightOverhang(readLoc, spliceLoc), isValidOverhang, readLoc + " vs. 
" + spliceLoc); + } + + @DataProvider(name = "MismatchEdgeConditionTest") + public Object[][] makeMismatchEdgeConditionData() { + final List tests = new ArrayList<>(); + tests.add(new Object[]{null, 1, null, 1, 0}); + tests.add(new Object[]{null, 1, null, 1, 100}); + tests.add(new Object[]{new byte[4], 1, null, 1, 3}); + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MismatchEdgeConditionTest") + public void testMismatchEdgeCondition(final byte[] read, final int readStart, final byte[] ref, final int refStart, final int overhang) { + final OverhangFixingManager manager = new OverhangFixingManager(null, genomeLocParser, referenceReader, 10000, 1, 40, false); + Assert.assertFalse(manager.overhangingBasesMismatch(read, readStart, ref, refStart, overhang)); + } + + @DataProvider(name = "MismatchTest") + public Object[][] makeMismatchData() { + final List tests = new ArrayList<>(); + + final byte[] AAAA = new byte[]{(byte)'A', (byte)'A', (byte)'A', (byte)'A'}; + final byte[] AAAC = new byte[]{(byte)'A', (byte)'A', (byte)'A', (byte)'C'}; + final byte[] AAAAAA = new byte[]{(byte)'A', (byte)'A', (byte)'A', (byte)'A', (byte)'A', (byte)'A'}; + final byte[] AAAACA = new byte[]{(byte)'A', (byte)'A', (byte)'A', (byte)'A', (byte)'C', (byte)'A'}; + final byte[] AAAACC = new byte[]{(byte)'A', (byte)'A', (byte)'A', (byte)'A', (byte)'C', (byte)'C'}; + + tests.add(new Object[]{AAAA, 2, AAAA, 2, 2, false}); + tests.add(new Object[]{AAAA, 2, AAAC, 2, 2, true}); + tests.add(new Object[]{AAAAAA, 3, AAAACA, 3, 3, false}); + tests.add(new Object[]{AAAAAA, 3, AAAACC, 3, 3, true}); + tests.add(new Object[]{AAAAAA, 4, AAAACC, 4, 2, true}); + tests.add(new Object[]{AAAAAA, 2, AAAACC, 2, 3, false}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "MismatchTest") + public void testMismatch(final byte[] read, final int readStart, final byte[] ref, final int refStart, final int overhang, final boolean expected) { + final OverhangFixingManager manager 
= new OverhangFixingManager(null, genomeLocParser, referenceReader, 10000, 1, 40, false); + Assert.assertEquals(manager.overhangingBasesMismatch(read, readStart, ref, refStart, overhang), expected, new String(read) + " vs. " + new String(ref) + " @" + overhang); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTests.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java similarity index 79% rename from protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTests.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java index 398fe221c..87af68fc4 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTests.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java @@ -48,7 +48,6 @@ package org.broadinstitute.sting.gatk.walkers.rnaseq; import org.broadinstitute.sting.WalkerTest; import org.testng.annotations.Test; -import org.broadinstitute.sting.BaseTest; import java.util.Arrays; /** @@ -57,27 +56,57 @@ import java.util.Arrays; * Date: 12/5/13 * Time: 1:04 PM */ -public class SplitNCigarReadsIntegrationTests extends WalkerTest { +public class SplitNCigarReadsIntegrationTest extends WalkerTest { - @Test + @Test(enabled = false) // contain reads without N's, with N's and with N's and I's + // TODO -- Ami: please put the bam file in the repo public void testSplitWithInsertions() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T SplitNCigarReads -R " + BaseTest.b37KGReference + " -I " + BaseTest.privateTestDir + "SplitNCigarReads.integrationTest.unsplitReads.withI.bam -o %s -U ALLOW_N_CIGAR_READS", 1, + "-T SplitNCigarReads 
-R " + b37KGReference + " -I " + privateTestDir + "SplitNCigarReads.integrationTest.unsplitReads.withI.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, Arrays.asList("037c72fe1572efb63cccbe0a8dda3cb1")); executeTest("test split N cigar reads with insertions", spec); } - @Test + @Test(enabled = false) // contain reads without N's, with N's and with N's and D's, and also with more then one N element in the cigar. + // TODO -- Ami: please put the bam file in the repo public void testSplitWithDeletions() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T SplitNCigarReads -R " + BaseTest.b37KGReference + " -I " + BaseTest.privateTestDir + "SplitNCigarReads.integrationTest.unsplitReads.withD.bam -o %s -U ALLOW_N_CIGAR_READS", 1, + "-T SplitNCigarReads -R " + b37KGReference + " -I " + privateTestDir + "SplitNCigarReads.integrationTest.unsplitReads.withD.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, Arrays.asList("8472005c16353715025353d6d453faf4")); executeTest("test split N cigar reads with deletions", spec); } + @Test + public void testSplitsWithOverhangs() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("2832abc680c6b5a0219702ad5bf22f01")); + executeTest("test splits with overhangs", spec); + } + @Test + public void testSplitsWithOverhangsNotClipping() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads --doNotFixOverhangs -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("59783610006bf7a1ccae57ee2016123b")); + executeTest("test splits with overhangs not clipping", spec); + } + @Test + public void testSplitsWithOverhangs0Mismatches() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads --maxMismatchesInOverhang 0 -R 
" + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("7547a5fc41ebfd1bbe62ce854b37b6ef")); + executeTest("test splits with overhangs 0 mismatches", spec); + } + @Test + public void testSplitsWithOverhangs5BasesInOverhang() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads --maxBasesInOverhang 5 -R " + b37KGReference + " -I " + privateTestDir + "NA12878.RNAseq.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS", 1, + Arrays.asList("f222eb02b003c08d4a606ab1bcb7931b")); + executeTest("test splits with overhangs 5 bases in overhang", spec); + } } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java index 7eb95877d..d0f8280af 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/rnaseq/SplitNCigarReadsUnitTest.java @@ -49,12 +49,17 @@ package org.broadinstitute.sting.gatk.walkers.rnaseq; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.gatk.walkers.rnaseq.SplitNCigarReads; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.clipping.ReadClipperTestUtils; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; +import java.io.File; +import java.io.FileNotFoundException; import java.util.Arrays; import java.util.LinkedList; import java.util.List; @@ -69,7 +74,7 @@ import 
java.util.List; * Date: 11/14/13 * Time: 6:49 PM */ -public class SplitNCigarReadsUnitTest { +public class SplitNCigarReadsUnitTest extends BaseTest { final static CigarElement[] cigarElements = { new CigarElement(1, CigarOperator.HARD_CLIP), new CigarElement(1, CigarOperator.SOFT_CLIP), @@ -79,6 +84,21 @@ public class SplitNCigarReadsUnitTest { new CigarElement(1, CigarOperator.SKIPPED_REGION) }; + private CachingIndexedFastaSequenceFile referenceReader; + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void setup() throws FileNotFoundException { + referenceReader = new CachingIndexedFastaSequenceFile(new File(hg18Reference)); + genomeLocParser = new GenomeLocParser(referenceReader.getSequenceDictionary()); + } + + private final class TestManager extends OverhangFixingManager { + public TestManager() { + super(null, genomeLocParser, referenceReader, 10000, 1, 40, false); + } + } + @Test(enabled = true) public void splitReadAtN() { final int cigarStringLength = 10; @@ -94,26 +114,27 @@ public class SplitNCigarReadsUnitTest { for(Cigar cigar: cigarList){ - final int numOfSplits = numOfNElements(cigar.getCigarElements()); if(numOfSplits != 0 && isCigarDoesNotHaveEmptyRegionsBetweenNs(cigar)){ + final TestManager manager = new TestManager(); GATKSAMRecord read = ReadClipperTestUtils.makeReadFromCigar(cigar); - List splitReads = SplitNCigarReads.splitNCigarRead(read); + SplitNCigarReads.splitNCigarRead(read, manager); + List splitReads = manager.getReadsInQueueForTesting(); final int expectedReads = numOfSplits+1; Assert.assertEquals(splitReads.size(),expectedReads,"wrong number of reads after split read with cigar: "+cigar+" at Ns [expected]: "+expectedReads+" [actual value]: "+splitReads.size()); final List readLengths = consecutiveNonNElements(read.getCigar().getCigarElements()); int index = 0; int offsetFromStart = 0; - for(GATKSAMRecord splitRead: splitReads){ + for(final OverhangFixingManager.SplitRead splitRead: splitReads){ int expectedLength 
= readLengths.get(index); - Assert.assertTrue(splitRead.getReadLength() == expectedLength, + Assert.assertTrue(splitRead.read.getReadLength() == expectedLength, "the "+index+" (starting with 0) split read has a wrong length.\n" + "cigar of original read: "+cigar+"\n"+ "expected length: "+expectedLength+"\n"+ - "actual length: "+splitRead.getReadLength()+"\n"); - assertBases(splitRead.getReadBases(), read.getReadBases(), offsetFromStart); + "actual length: "+splitRead.read.getReadLength()+"\n"); + assertBases(splitRead.read.getReadBases(), read.getReadBases(), offsetFromStart); index++; offsetFromStart += expectedLength; } From fa65716fe944112ea6f05271dce3e14ea19795a1 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 2 Feb 2014 04:16:31 -0500 Subject: [PATCH 08/18] Added code to retrieve dangling heads from the read threading graph (previously we were rescuing just the tails). The purpose of this is to be able to call SNPs that fall at the beginning of a capture region (or exon). Before, the read threading code would only start threading from the first kmer that matched the reference. But that means that, in the case of a SNP at the beginning of an exome, it wouldn't start threading the read until after the SNP position - so we'd lose the SNP. For now, this is still very experimental. It works well for RNAseq data, but does introduce FPs in normal exomes. I know why this is and how to fix it, but it requires a much larger fix to the HC: the HC needs to pass all reads and bases to the annotation engine (like UG does) instead of just the high quality ones. So for now, the head merging is disabled by default. As per reviewer comments, I moved the head and tail merging code out into their own class. 
--- .../haplotypecaller/HaplotypeCaller.java | 8 + .../haplotypecaller/LocalAssemblyEngine.java | 9 + .../haplotypecaller/graphs/BaseGraph.java | 18 - .../graphs/DeBruijnVertex.java | 8 - .../DanglingChainMergingGraph.java | 520 ++++++++++++++++++ .../readthreading/HaplotypeGraph.java | 5 +- .../readthreading/ReadThreadingAssembler.java | 3 +- .../readthreading/ReadThreadingGraph.java | 300 +--------- .../VariantAnnotatorIntegrationTest.java | 2 +- ...lexAndSymbolicVariantsIntegrationTest.java | 4 +- .../HaplotypeCallerIntegrationTest.java | 10 +- .../graphs/BaseGraphUnitTest.java | 15 - .../DanglingChainMergingGraphUnitTest.java | 230 ++++++++ .../ReadThreadingAssemblerUnitTest.java | 23 + .../ReadThreadingGraphUnitTest.java | 83 --- 15 files changed, 829 insertions(+), 409 deletions(-) create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraph.java create mode 100644 protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraphUnitTest.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index ad05e87d9..13403c0ac 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -278,6 +278,13 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="numPruningSamples", shortName="numPruningSamples", doc="The number of samples that must pass the minPuning factor in order for the path to be kept", required = false) protected int numPruningSamples = 1; + /** + * This mode is currently experimental and should 
only be used in the RNA-seq calling pipeline. + */ + @Advanced + @Argument(fullName="recoverDanglingHeads", shortName="recoverDanglingHeads", doc="Should we enable dangling head recovery in the read threading assembler?", required = false) + protected boolean recoverDanglingHeads = false; + @Hidden @Argument(fullName="dontRecoverDanglingTails", shortName="dontRecoverDanglingTails", doc="Should we disable dangling tail recovery in the read threading assembler?", required = false) protected boolean dontRecoverDanglingTails = false; @@ -634,6 +641,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In assemblyEngine.setDebugGraphTransformations(debugGraphTransformations); assemblyEngine.setAllowCyclesInKmerGraphToGeneratePaths(allowCyclesInKmerGraphToGeneratePaths); assemblyEngine.setRecoverDanglingTails(!dontRecoverDanglingTails); + assemblyEngine.setRecoverDanglingHeads(recoverDanglingHeads); assemblyEngine.setMinBaseQualityToUseInAssembly(MIN_BASE_QUALTY_SCORE); MIN_TAIL_QUALITY = (byte)(MIN_BASE_QUALTY_SCORE - 1); diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index d0e28d878..d49827405 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -86,6 +86,7 @@ public abstract class LocalAssemblyEngine { protected boolean allowCyclesInKmerGraphToGeneratePaths = false; protected boolean debugGraphTransformations = false; protected boolean recoverDanglingTails = true; + protected boolean recoverDanglingHeads = true; protected byte minBaseQualityToUseInAssembly = DEFAULT_MIN_BASE_QUALITY_TO_USE; protected int pruneFactor = 2; @@ -456,4 +457,12 @@ public abstract 
class LocalAssemblyEngine { public void setRecoverDanglingTails(boolean recoverDanglingTails) { this.recoverDanglingTails = recoverDanglingTails; } + + public boolean isRecoverDanglingHeads() { + return recoverDanglingHeads; + } + + public void setRecoverDanglingHeads(boolean recoverDanglingHeads) { + this.recoverDanglingHeads = recoverDanglingHeads; + } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index edd8dbb16..c9d51b81b 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -646,24 +646,6 @@ public class BaseGraph extends Default '}'; } - /** - * The base sequence for the given path. - * Note, this assumes that the path does not start with a source node. 
- * - * @param path the list of vertexes that make up the path - * @return non-null sequence of bases corresponding to the given path - */ - @Ensures({"result != null"}) - public byte[] getBasesForPath(final List path) { - if ( path == null ) throw new IllegalArgumentException("Path cannot be null"); - - final StringBuffer sb = new StringBuffer(); - for ( final DeBruijnVertex v : path ) - sb.append((char)v.getSuffix()); - - return sb.toString().getBytes(); - } - /** * Get the set of vertices within distance edges of source, regardless of edge direction * diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java index ec2ccff20..cf95f6a5a 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeBruijnVertex.java @@ -65,14 +65,6 @@ public class DeBruijnVertex extends BaseVertex { super(sequence); } - /** - * For testing purposes only - * @param sequence - */ - protected DeBruijnVertex( final String sequence ) { - this(sequence.getBytes()); - } - /** * Get the kmer size for this DeBruijnVertex * @return integer >= 1 diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraph.java new file mode 100644 index 000000000..e59d39a97 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraph.java @@ -0,0 +1,520 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: 
+* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. 
LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import com.google.java.contract.Ensures; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.smithwaterman.*; +import org.jgrapht.EdgeFactory; + +import java.util.*; + +public abstract class DanglingChainMergingGraph extends BaseGraph { + + private static final int MAX_CIGAR_COMPLEXITY = 3; + private static final int MIN_DANGLING_TAIL_LENGTH = 5; // SNP + 3 stabilizing nodes + the LCA + private static final int MAXIMUM_MISMATCHES_IN_DANGLING_HEAD_MERGE = 1; + + protected boolean alreadyBuilt; + + /** + * Create a new DanglingChainMergingGraph using kmerSize for matching + * @param kmerSize must be >= 1 + */ + protected DanglingChainMergingGraph(final int kmerSize, final EdgeFactory edgeFactory) { + super(kmerSize, edgeFactory); + } + + /** + * Edge factory that encapsulates the numPruningSamples assembly parameter + */ + protected static class MyEdgeFactory implements EdgeFactory { + final int numPruningSamples; + + public MyEdgeFactory(int numPruningSamples) { +
this.numPruningSamples = numPruningSamples; + } + + @Override + public MultiSampleEdge createEdge(final MultiDeBruijnVertex sourceVertex, final MultiDeBruijnVertex targetVertex) { + return new MultiSampleEdge(false, 1, numPruningSamples); + } + + public MultiSampleEdge createEdge(final boolean isRef, final int multiplicity) { + return new MultiSampleEdge(isRef, multiplicity, numPruningSamples); + } + + } + + /** + * Class to keep track of the important dangling chain merging data + */ + protected final class DanglingChainMergeHelper { + final List danglingPath, referencePath; + final byte[] danglingPathString, referencePathString; + final Cigar cigar; + + public DanglingChainMergeHelper(final List danglingPath, + final List referencePath, + final byte[] danglingPathString, + final byte[] referencePathString, + final Cigar cigar) { + this.danglingPath = danglingPath; + this.referencePath = referencePath; + this.danglingPathString = danglingPathString; + this.referencePathString = referencePathString; + this.cigar = cigar; + } + } + + /** + * Try to recover dangling tails + * + * @param pruneFactor the prune factor to use in ignoring chain pieces + */ + public void recoverDanglingTails(final int pruneFactor) { + if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built"); + + int attempted = 0; + int nRecovered = 0; + for ( final MultiDeBruijnVertex v : vertexSet() ) { + if ( outDegreeOf(v) == 0 && ! isRefSink(v) ) { + attempted++; + nRecovered += recoverDanglingTail(v, pruneFactor); + } + } + + logger.debug("Recovered " + nRecovered + " of " + attempted + " dangling tails"); + } + + /** + * Try to recover dangling heads + * + * @param pruneFactor the prune factor to use in ignoring chain pieces + */ + public void recoverDanglingHeads(final int pruneFactor) { + if ( ! 
alreadyBuilt ) throw new IllegalStateException("recoverDanglingHeads requires the graph be already built"); + + // we need to build a list of dangling heads because that process can modify the graph (and otherwise generate + // a ConcurrentModificationException if we do it while iterating over the vertexes) + final List danglingHeads = new ArrayList<>(); + + int attempted = 0; + int nRecovered = 0; + for ( final MultiDeBruijnVertex v : vertexSet() ) { + if ( inDegreeOf(v) == 0 && ! isRefSource(v) ) + danglingHeads.add(v); + } + + // now we can try to recover the dangling heads + for ( final MultiDeBruijnVertex v : danglingHeads ) { + attempted++; + nRecovered += recoverDanglingHead(v, pruneFactor); + } + + logger.debug("Recovered " + nRecovered + " of " + attempted + " dangling heads"); + } + + /** + * Attempt to attach vertex with out-degree == 0 to the graph + * + * @param vertex the vertex to recover + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return 1 if we successfully recovered the vertex and 0 otherwise + */ + protected int recoverDanglingTail(final MultiDeBruijnVertex vertex, final int pruneFactor) { + if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0"); + + // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths + final DanglingChainMergeHelper danglingTailMergeResult = generateCigarAgainstDownwardsReferencePath(vertex, pruneFactor); + + // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path + if ( danglingTailMergeResult == null || ! 
cigarIsOkayToMerge(danglingTailMergeResult.cigar, false, true) ) + return 0; + + // merge + return mergeDanglingTail(danglingTailMergeResult); + } + + /** + * Attempt to attach vertex with in-degree == 0, or a vertex on its path, to the graph + * + * @param vertex the vertex to recover + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return 1 if we successfully recovered a vertex and 0 otherwise + */ + protected int recoverDanglingHead(final MultiDeBruijnVertex vertex, final int pruneFactor) { + if ( inDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling head for " + vertex + " but it has in-degree > 0"); + + // generate the CIGAR string from Smith-Waterman between the dangling head and reference paths + final DanglingChainMergeHelper danglingHeadMergeResult = generateCigarAgainstUpwardsReferencePath(vertex, pruneFactor); + + // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path + if ( danglingHeadMergeResult == null || !
cigarIsOkayToMerge(danglingHeadMergeResult.cigar, true, false) ) + return 0; + + // merge + return mergeDanglingHead(danglingHeadMergeResult); + } + + /** + * Determine whether the provided cigar is okay to merge into the reference path + * + * @param cigar the cigar to analyze + * @param requireFirstElementM if true, require that the first cigar element be an M operator in order for it to be okay + * @param requireLastElementM if true, require that the last cigar element be an M operator in order for it to be okay + * @return true if it's okay to merge, false otherwise + */ + protected boolean cigarIsOkayToMerge(final Cigar cigar, final boolean requireFirstElementM, final boolean requireLastElementM) { + + final List elements = cigar.getCigarElements(); + final int numElements = elements.size(); + + // don't allow more than a couple of different ops + if ( numElements == 0 || numElements > MAX_CIGAR_COMPLEXITY ) + return false; + + // the first element must be an M + if ( requireFirstElementM && elements.get(0).getOperator() != CigarOperator.M ) + return false; + + // the last element must be an M + if ( requireLastElementM && elements.get(numElements - 1).getOperator() != CigarOperator.M ) + return false; + + // TODO -- do we want to check whether the Ms mismatch too much also?
+ + return true; + } + + /** + * Actually merge the dangling tail if possible + * + * @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference + * @return 1 if merge was successful, 0 otherwise + */ + protected int mergeDanglingTail(final DanglingChainMergeHelper danglingTailMergeResult) { + + final List elements = danglingTailMergeResult.cigar.getCigarElements(); + final CigarElement lastElement = elements.get(elements.size() - 1); + if ( lastElement.getOperator() != CigarOperator.M ) + throw new IllegalArgumentException("The last Cigar element must be an M"); + + final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1; + final int matchingSuffix = Math.min(GraphUtils.longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength()); + if ( matchingSuffix == 0 ) + return 0; + + final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0); + + // there is an important edge condition that we need to handle here: Smith-Waterman correctly calculates that there is a + // deletion, that deletion is left-aligned such that the LCA node is part of that deletion, and the rest of the dangling + // tail is a perfect match to the suffix of the reference path. In this case we need to push the reference index to merge + // down one position so that we don't incorrectly cut a base off of the deletion. + final boolean firstElementIsDeletion = elements.get(0).getOperator() == CigarOperator.D; + final boolean mustHandleLeadingDeletionCase = firstElementIsDeletion && (elements.get(0).getLength() + matchingSuffix == lastRefIndex + 1); + final int refIndexToMerge = lastRefIndex - matchingSuffix + 1 + (mustHandleLeadingDeletionCase ? 
1 : 0); + + addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1)); + + return 1; + } + + /** + * Actually merge the dangling head if possible + * + * @param danglingHeadMergeResult the result from generating a Cigar for the dangling head against the reference + * @return 1 if merge was successful, 0 otherwise + */ + protected int mergeDanglingHead(final DanglingChainMergeHelper danglingHeadMergeResult) { + + final List elements = danglingHeadMergeResult.cigar.getCigarElements(); + final CigarElement firstElement = elements.get(0); + if ( firstElement.getOperator() != CigarOperator.M ) + throw new IllegalArgumentException("The first Cigar element must be an M"); + + final int indexesToMerge = bestPrefixMatch(danglingHeadMergeResult.referencePathString, danglingHeadMergeResult.danglingPathString, firstElement.getLength()); + if ( indexesToMerge <= 0 ) + return 0; + + // we can't push back the reference path + if ( indexesToMerge >= danglingHeadMergeResult.referencePath.size() - 1 ) + return 0; + + // but we can manipulate the dangling path if we need to + if ( indexesToMerge >= danglingHeadMergeResult.danglingPath.size() && + ! extendDanglingPathAgainstReference(danglingHeadMergeResult, indexesToMerge - danglingHeadMergeResult.danglingPath.size() + 2) ) + return 0; + + addEdge(danglingHeadMergeResult.referencePath.get(indexesToMerge+1), danglingHeadMergeResult.danglingPath.get(indexesToMerge), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1)); + + return 1; + } + + /** + * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the + * provided vertex is the sink) and the reference path. 
+ * + * @param vertex the sink of the dangling chain + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return a DanglingChainMergeHelper object which can be null if no proper alignment could be generated + */ + protected DanglingChainMergeHelper generateCigarAgainstDownwardsReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor) { + + // find the lowest common ancestor path between vertex and the reference sink if available + final List altPath = findPathUpwardsToLowestCommonAncestorOfReference(vertex, pruneFactor); + if ( altPath == null || isRefSource(altPath.get(0)) || altPath.size() < MIN_DANGLING_TAIL_LENGTH ) + return null; + + // now get the reference path from the LCA + final List refPath = getReferencePath(altPath.get(0), TraversalDirection.downwards); + + // create the Smith-Waterman strings to use + final byte[] refBases = getBasesForPath(refPath, false); + final byte[] altBases = getBasesForPath(altPath, false); + + // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting) + final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWParameterSet.STANDARD_NGS, SWPairwiseAlignment.OVERHANG_STRATEGY.LEADING_INDEL); + return new DanglingChainMergeHelper(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar())); + } + + /** + * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the + * provided vertex is the source) and the reference path. 
+ * + * @param vertex the source of the dangling head + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return a DanglingChainMergeHelper object which can be null if no proper alignment could be generated + */ + protected DanglingChainMergeHelper generateCigarAgainstUpwardsReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor) { + + // find the highest common descendant path between vertex and the reference source if available + final List altPath = findPathDownwardsToHighestCommonDescendantOfReference(vertex, pruneFactor); + if ( altPath == null || isRefSink(altPath.get(0)) ) + return null; + + // now get the reference path from the highest common descendant + final List refPath = getReferencePath(altPath.get(0), TraversalDirection.upwards); + + // create the Smith-Waterman strings to use + final byte[] refBases = getBasesForPath(refPath, true); + final byte[] altBases = getBasesForPath(altPath, true); + + // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting) + final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWParameterSet.STANDARD_NGS, SWPairwiseAlignment.OVERHANG_STRATEGY.LEADING_INDEL); + return new DanglingChainMergeHelper(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar())); + } + + /** + * Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex. + * Note that nodes are excluded if their pruning weight is less than the pruning factor. 
+ * + * @param vertex the original vertex + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or + * has an ancestor with multiple incoming edges before hitting the reference path + */ + protected List findPathUpwardsToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor) { + final LinkedList path = new LinkedList<>(); + + MultiDeBruijnVertex v = vertex; + while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) { + final MultiSampleEdge edge = incomingEdgeOf(v); + // if it has too low a weight, don't use it (or previous vertexes) for the path + if ( edge.getPruningMultiplicity() < pruneFactor ) + path.clear(); + // otherwise it is safe to use + else + path.addFirst(v); + v = getEdgeSource(edge); + } + path.addFirst(v); + + return isReferenceNode(v) ? path : null; + } + + /** + * Finds the path downwards in the graph from this vertex to the reference sequence, including the highest common descendant vertex. + * However note that the path is reversed so that this vertex ends up at the end of the path. + * Also note that nodes are excluded if their pruning weight is less than the pruning factor. + * + * @param vertex the original vertex + * @param pruneFactor the prune factor to use in ignoring chain pieces + * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or + * has a descendant with multiple outgoing edges before hitting the reference path + */ + protected List findPathDownwardsToHighestCommonDescendantOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor) { + final LinkedList path = new LinkedList<>(); + + MultiDeBruijnVertex v = vertex; + while ( ! 
isReferenceNode(v) && outDegreeOf(v) == 1 ) { + final MultiSampleEdge edge = outgoingEdgeOf(v); + // if it has too low a weight, don't use it (or previous vertexes) for the path + if ( edge.getPruningMultiplicity() < pruneFactor ) + path.clear(); + // otherwise it is safe to use + else + path.addFirst(v); + v = getEdgeTarget(edge); + } + path.addFirst(v); + + return isReferenceNode(v) ? path : null; + } + + private enum TraversalDirection { + downwards, + upwards + } + + /** + * Finds the path in the graph from this vertex to the reference sink, including this vertex + * + * @param start the reference vertex to start from + * @param direction describes which direction to move in the graph (i.e. down to the reference sink or up to the source) + * @return the path (non-null, non-empty) + */ + protected List getReferencePath(final MultiDeBruijnVertex start, final TraversalDirection direction) { + if ( ! isReferenceNode(start) ) throw new IllegalArgumentException("Cannot construct the reference path from a vertex that is not on that path"); + + final List path = new ArrayList<>(); + + MultiDeBruijnVertex v = start; + while ( v != null ) { + path.add(v); + v = (direction == TraversalDirection.downwards ? getNextReferenceVertex(v) : getPrevReferenceVertex(v)); + } + + return path; + } + + /** + * The base sequence for the given path. + * + * @param path the list of vertexes that make up the path + * @param reverseIfSource if true and if we encounter a source node, then reverse the character sequence for that node + * @return non-null sequence of bases corresponding to the given path + */ + @Ensures({"result != null"}) + public byte[] getBasesForPath(final List path, final boolean reverseIfSource) { + if ( path == null ) throw new IllegalArgumentException("Path cannot be null"); + + final StringBuilder sb = new StringBuilder(); + for ( final MultiDeBruijnVertex v : path ) { + if ( isSource(v) ) { + final String seq = v.getSequenceString(); + sb.append(reverseIfSource ? 
new StringBuilder(seq).reverse().toString() : seq); + } else { + sb.append((char)v.getSuffix()); + } + } + + return sb.toString().getBytes(); + } + + /** + * Finds the index of the best extent of the prefix match between the provided paths, for dangling head merging. + * Assumes that path1.length >= maxIndex and path2.length >= maxIndex. + * + * @param path1 the first path + * @param path2 the second path + * @param maxIndex the maximum index to traverse (not inclusive) + * @return the index of the ideal prefix match or -1 if it cannot find one, must be less than maxIndex + */ + protected static int bestPrefixMatch(final byte[] path1, final byte[] path2, final int maxIndex) { + int mismatches = 0; + int index = 0; + int lastGoodIndex = -1; + while ( index < maxIndex ) { + if ( path1[index] != path2[index] ) { + if ( ++mismatches > MAXIMUM_MISMATCHES_IN_DANGLING_HEAD_MERGE ) + return lastGoodIndex; + lastGoodIndex = index; + } + index++; + } + // if we got here then we hit the max index + return lastGoodIndex; + } + + protected boolean extendDanglingPathAgainstReference(final DanglingChainMergeHelper danglingHeadMergeResult, final int numNodesToExtend) { + + final int indexOfLastDanglingNode = danglingHeadMergeResult.danglingPath.size() - 1; + final int indexOfRefNodeToUse = indexOfLastDanglingNode + numNodesToExtend; + if ( indexOfRefNodeToUse >= danglingHeadMergeResult.referencePath.size() ) + return false; + + final MultiDeBruijnVertex danglingSource = danglingHeadMergeResult.danglingPath.remove(indexOfLastDanglingNode); + final StringBuilder sb = new StringBuilder(); + final byte[] refSourceSequence = danglingHeadMergeResult.referencePath.get(indexOfRefNodeToUse).getSequence(); + for ( int i = 0; i < numNodesToExtend; i++ ) + sb.append((char)refSourceSequence[i]); + sb.append(danglingSource.getSequenceString()); + final byte[] sequenceToExtend = sb.toString().getBytes(); + + // clean up the source and edge + final MultiSampleEdge sourceEdge = 
outgoingEdgeOf(danglingSource); + MultiDeBruijnVertex prevV = getEdgeTarget(sourceEdge); + removeEdge(danglingSource, prevV); + + // extend the path + for ( int i = numNodesToExtend; i > 0; i-- ) { + final MultiDeBruijnVertex newV = new MultiDeBruijnVertex(Arrays.copyOfRange(sequenceToExtend, i, i+kmerSize)); + addVertex(newV); + final MultiSampleEdge newE = addEdge(newV, prevV); + newE.setMultiplicity(sourceEdge.getMultiplicity()); + danglingHeadMergeResult.danglingPath.add(newV); + prevV = newV; + } + + return true; + } +} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java index 6574e8295..150cdc826 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/HaplotypeGraph.java @@ -391,10 +391,9 @@ public class HaplotypeGraph extends ReadThreadingGraph { graphWriter.println("}"); } - @Override - public Pair findStart(final SequenceForKmers seqForKmers) { - return getOrCreateKmerVertex(seqForKmers.sequence, 0, true); + protected int findStart(final SequenceForKmers seqForKmers) { + return 0; } /** diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index e158ef613..a932f8a96 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -186,9 +186,10 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1 rtgraph.pruneLowWeightChains(pruneFactor); - // look at all chains in the graph that terminate in a non-ref node (dangling sinks) and see if + // look at all chains in the graph that terminate in a non-ref node (dangling sources and sinks) and see if // we can recover them by merging some N bases from the chain back into the reference if ( recoverDanglingTails ) rtgraph.recoverDanglingTails(pruneFactor); + if ( recoverDanglingHeads ) rtgraph.recoverDanglingHeads(pruneFactor); // remove all heading and trailing paths if ( removePathsNotConnectedToRef ) rtgraph.removePathsNotConnectedToRef(); diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java index 7fdfa4301..a7989ac2c 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraph.java @@ -46,21 +46,12 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; -import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.KMerCounter; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.Kmer; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import 
org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import org.broadinstitute.sting.utils.smithwaterman.SWParameterSet; -import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman; -import org.jgrapht.EdgeFactory; import org.jgrapht.alg.CycleDetector; import java.io.File; @@ -68,28 +59,7 @@ import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -public class ReadThreadingGraph extends BaseGraph implements KmerSearchableGraph { - - /** - * Edge factory that encapsulates the numPruningSamples assembly parameter - */ - private static class MyEdgeFactory implements EdgeFactory { - final int numPruningSamples; - - public MyEdgeFactory(int numPruningSamples) { - this.numPruningSamples = numPruningSamples; - } - - @Override - public MultiSampleEdge createEdge(final MultiDeBruijnVertex sourceVertex, final MultiDeBruijnVertex targetVertex) { - return new MultiSampleEdge(false, 1, numPruningSamples); - } - - public MultiSampleEdge createEdge(final boolean isRef, final int multiplicity) { - return new MultiSampleEdge(isRef, multiplicity, numPruningSamples); - } - - } +public class ReadThreadingGraph extends DanglingChainMergingGraph implements KmerSearchableGraph { private final static Logger logger = Logger.getLogger(ReadThreadingGraph.class); @@ -97,9 +67,6 @@ public class ReadThreadingGraph extends BaseGraph startingInfo = findStart(seqForKmers); - if ( startingInfo == null ) + final int uniqueStartPos = findStart(seqForKmers); + if ( uniqueStartPos == -1 ) return; - final MultiDeBruijnVertex startingVertex = startingInfo.getFirst(); - final int uniqueStartPos = startingInfo.getSecond(); + final MultiDeBruijnVertex startingVertex = getOrCreateKmerVertex(seqForKmers.sequence, uniqueStartPos); // increase 
the counts of all edges incoming into the starting vertex supported by going back in sequence if ( increaseCountsBackwards ) @@ -278,177 +243,22 @@ public class ReadThreadingGraph extends BaseGraph danglingPath, referencePath; - final byte[] danglingPathString, referencePathString; - final Cigar cigar; - - public DanglingTailMergeResult(final List danglingPath, - final List referencePath, - final byte[] danglingPathString, - final byte[] referencePathString, - final Cigar cigar) { - this.danglingPath = danglingPath; - this.referencePath = referencePath; - this.danglingPathString = danglingPathString; - this.referencePathString = referencePathString; - this.cigar = cigar; - } - } - - /** - * Attempt to attach vertex with out-degree == 0 to the graph + * Find vertex and its position in seqForKmers where we should start assembling seqForKmers * - * @param vertex the vertex to recover - * @param pruneFactor the prune factor to use in ignoring chain pieces - * @return 1 if we successfully recovered the vertex and 0 otherwise + * @param seqForKmers the sequence we want to thread into the graph + * @return the position of the starting vertex in seqForKmer, or -1 if it cannot find one */ - protected int recoverDanglingChain(final MultiDeBruijnVertex vertex, final int pruneFactor) { - if ( outDegreeOf(vertex) != 0 ) throw new IllegalStateException("Attempting to recover a dangling tail for " + vertex + " but it has out-degree > 0"); - - // generate the CIGAR string from Smith-Waterman between the dangling tail and reference paths - final DanglingTailMergeResult danglingTailMergeResult = generateCigarAgainstReferencePath(vertex, pruneFactor); - - // if the CIGAR is too complex (or couldn't be computed) then we do not allow the merge into the reference path - if ( danglingTailMergeResult == null || ! 
cigarIsOkayToMerge(danglingTailMergeResult.cigar) ) + protected int findStart(final SequenceForKmers seqForKmers) { + if ( seqForKmers.isRef ) return 0; - // merge - return mergeDanglingTail(danglingTailMergeResult); - } - - /** - * Determine whether the provided cigar is okay to merge into the reference path - * - * @param cigar the cigar to analyze - * @return true if it's okay to merge, false otherwise - */ - protected boolean cigarIsOkayToMerge(final Cigar cigar) { - - final List elements = cigar.getCigarElements(); - final int numElements = elements.size(); - - // don't allow more than a couple of different ops - if ( numElements > MAX_CIGAR_COMPLEXITY ) - return false; - - // the last element must be an M - if ( elements.get(numElements - 1).getOperator() != CigarOperator.M ) - return false; - - // TODO -- do we want to check whether the Ms mismatch too much also? - - return true; - } - - /** - * Actually merge the dangling tail if possible - * - * @param danglingTailMergeResult the result from generating a Cigar for the dangling tail against the reference - * @return 1 if merge was successful, 0 otherwise - */ - protected int mergeDanglingTail(final DanglingTailMergeResult danglingTailMergeResult) { - - final List elements = danglingTailMergeResult.cigar.getCigarElements(); - final CigarElement lastElement = elements.get(elements.size() - 1); - if ( lastElement.getOperator() != CigarOperator.M ) - throw new IllegalArgumentException("The last Cigar element must be an M"); - - final int lastRefIndex = danglingTailMergeResult.cigar.getReferenceLength() - 1; - final int matchingSuffix = Math.min(GraphUtils.longestSuffixMatch(danglingTailMergeResult.referencePathString, danglingTailMergeResult.danglingPathString, lastRefIndex), lastElement.getLength()); - if ( matchingSuffix == 0 ) - return 0; - - final int altIndexToMerge = Math.max(danglingTailMergeResult.cigar.getReadLength() - matchingSuffix - 1, 0); - - // there is an important edge condition that we need to 
handle here: Smith-Waterman correctly calculates that there is a - // deletion, that deletion is left-aligned such that the LCA node is part of that deletion, and the rest of the dangling - // tail is a perfect match to the suffix of the reference path. In this case we need to push the reference index to merge - // down one position so that we don't incorrectly cut a base off of the deletion. - final boolean firstElementIsDeletion = elements.get(0).getOperator() == CigarOperator.D; - final boolean mustHandleLeadingDeletionCase = firstElementIsDeletion && (elements.get(0).getLength() + matchingSuffix == lastRefIndex + 1); - final int refIndexToMerge = lastRefIndex - matchingSuffix + 1 + (mustHandleLeadingDeletionCase ? 1 : 0); - - addEdge(danglingTailMergeResult.danglingPath.get(altIndexToMerge), danglingTailMergeResult.referencePath.get(refIndexToMerge), ((MyEdgeFactory)getEdgeFactory()).createEdge(false, 1)); - - return 1; - } - - /** - * Generates the CIGAR string from the Smith-Waterman alignment of the dangling path (where the - * provided vertex is the sink) and the reference path. 
- * - * @param vertex the sink of the dangling tail - * @param pruneFactor the prune factor to use in ignoring chain pieces - * @return a SmithWaterman object which can be null if no proper alignment could be generated - */ - protected DanglingTailMergeResult generateCigarAgainstReferencePath(final MultiDeBruijnVertex vertex, final int pruneFactor) { - - // find the lowest common ancestor path between vertex and the reference sink if available - final List altPath = findPathToLowestCommonAncestorOfReference(vertex, pruneFactor); - if ( altPath == null || isRefSource(altPath.get(0)) || altPath.size() < MIN_DANGLING_TAIL_LENGTH ) - return null; - - // now get the reference path from the LCA - final List refPath = getReferencePath(altPath.get(0)); - - // create the Smith-Waterman strings to use - final byte[] refBases = getBasesForPath(refPath); - final byte[] altBases = getBasesForPath(altPath); - - // run Smith-Waterman to determine the best alignment (and remove trailing deletions since they aren't interesting) - final SmithWaterman alignment = new SWPairwiseAlignment(refBases, altBases, SWParameterSet.STANDARD_NGS, SWPairwiseAlignment.OVERHANG_STRATEGY.LEADING_INDEL); - return new DanglingTailMergeResult(altPath, refPath, altBases, refBases, AlignmentUtils.removeTrailingDeletions(alignment.getCigar())); - } - - /** - * Finds the path upwards in the graph from this vertex to the reference sequence, including the lowest common ancestor vertex. - * Note that nodes are excluded if their pruning weight is less than the pruning factor. 
- * - * @param vertex the original vertex - * @param pruneFactor the prune factor to use in ignoring chain pieces - * @return the path if it can be determined or null if this vertex either doesn't merge onto the reference path or - * has an ancestor with multiple incoming edges before hitting the reference path - */ - protected List findPathToLowestCommonAncestorOfReference(final MultiDeBruijnVertex vertex, final int pruneFactor) { - final LinkedList path = new LinkedList<>(); - - MultiDeBruijnVertex v = vertex; - while ( ! isReferenceNode(v) && inDegreeOf(v) == 1 ) { - final MultiSampleEdge edge = incomingEdgeOf(v); - // if it has too low a weight, don't use it (or previous vertexes) for the path - if ( edge.getPruningMultiplicity() < pruneFactor ) - path.clear(); - // otherwise it is safe to use - else - path.addFirst(v); - v = getEdgeSource(edge); - } - path.addFirst(v); - - return isReferenceNode(v) ? path : null; - } - - /** - * Finds the path downwards in the graph from this vertex to the reference sink, including this vertex - * - * @param start the reference vertex to start from - * @return the path (non-null, non-empty) - */ - protected List getReferencePath(final MultiDeBruijnVertex start) { - if ( ! 
isReferenceNode(start) ) throw new IllegalArgumentException("Cannot construct the reference path from a vertex that is not on that path"); - - final List path = new ArrayList<>(); - - MultiDeBruijnVertex v = start; - while ( v != null ) { - path.add(v); - v = getNextReferenceVertex(v); + for ( int i = seqForKmers.start; i < seqForKmers.stop - kmerSize; i++ ) { + final Kmer kmer1 = new Kmer(seqForKmers.sequence, i, kmerSize); + if ( !nonUniqueKmers.contains(kmer1) ) + return i; } - return path; + return -1; } /** @@ -526,26 +336,6 @@ public class ReadThreadingGraph extends BaseGraph uniqueKmers.size(); } - /** - * Try to recover dangling tails - * - * @param pruneFactor the prune factor to use in ignoring chain pieces - */ - public void recoverDanglingTails(final int pruneFactor) { - if ( ! alreadyBuilt ) throw new IllegalStateException("recoverDanglingTails requires the graph be already built"); - - int attempted = 0; - int nRecovered = 0; - for ( final MultiDeBruijnVertex v : vertexSet() ) { - if ( outDegreeOf(v) == 0 && ! 
isRefSink(v) ) { - attempted++; - nRecovered += recoverDanglingChain(v, pruneFactor); - } - } - - if ( debugGraphTransformations ) logger.info("Recovered " + nRecovered + " of " + attempted + " dangling tails"); - } - /** structure that keeps track of the non-unique kmers for a given kmer size */ private static class NonUniqueResult { final Set nonUniques; @@ -568,7 +358,7 @@ public class ReadThreadingGraph extends BaseGraph withNonUniques = getAllPendingSequences(); - final Set nonUniqueKmers = new HashSet(); + final Set nonUniqueKmers = new HashSet<>(); // go through the sequences and determine which kmers aren't unique within each read int kmerSize = minKmerSize; @@ -606,7 +396,7 @@ public class ReadThreadingGraph extends BaseGraph getAllPendingSequences() { - final LinkedList result = new LinkedList(); + final LinkedList result = new LinkedList<>(); for ( final List oneSampleWorth : pending.values() ) result.addAll(oneSampleWorth); return result; } @@ -642,7 +432,7 @@ public class ReadThreadingGraph extends BaseGraph vertexMap = new HashMap(); + final Map vertexMap = new HashMap<>(); // create all of the equivalent seq graph vertices @@ -683,52 +473,16 @@ public class ReadThreadingGraph extends BaseGraph findStart(final SequenceForKmers seqForKmers) { - final int uniqueStartPos = seqForKmers.isRef ? 
0 : findUniqueStartPosition(seqForKmers.sequence, seqForKmers.start, seqForKmers.stop); - - if ( uniqueStartPos == -1 ) - return null; - - return getOrCreateKmerVertex(seqForKmers.sequence, uniqueStartPos, true); - } - - /** - * Find a starting point in sequence that begins a unique kmer among all kmers in the graph - * @param sequence the sequence of bases - * @param start the first base to use in sequence - * @param stop the last base to use in sequence - * @return the index into sequence that begins a unique kmer of size kmerSize, or -1 if none could be found - */ - private int findUniqueStartPosition(final byte[] sequence, final int start, final int stop) { - for ( int i = start; i < stop - kmerSize; i++ ) { - final Kmer kmer1 = new Kmer(sequence, i, kmerSize); - if ( uniqueKmers.containsKey(kmer1) ) - return i; - } - return -1; - } - /** * Get the vertex for the kmer in sequence starting at start * @param sequence the sequence * @param start the position of the kmer start - * @param allowRefSource if true, we will allow matches to the kmer that represents the reference starting kmer * @return a non-null vertex */ - protected Pair getOrCreateKmerVertex(final byte[] sequence, final int start, final boolean allowRefSource) { + private MultiDeBruijnVertex getOrCreateKmerVertex(final byte[] sequence, final int start) { final Kmer kmer = new Kmer(sequence, start, kmerSize); - final MultiDeBruijnVertex vertex = getUniqueKmerVertex(kmer, allowRefSource); - if ( vertex != null ) { - return new Pair<>(vertex, start); - } else { - return new Pair<>(createVertex(kmer), start); - } + final MultiDeBruijnVertex vertex = getUniqueKmerVertex(kmer, true); + return ( vertex != null ) ? vertex : createVertex(kmer); } /** @@ -878,11 +632,11 @@ public class ReadThreadingGraph extends BaseGraph * Note: only used for testing. - * Checkout {@link HaplotypeGraphUnitTest} for examples. + * Checkout {@see HaplotypeGraphUnitTest} for examples. *

* @param s the string representation of the graph {@code null}. */ @@ -913,7 +667,7 @@ public class ReadThreadingGraph extends BaseGraph * Note: this is done just for testing purposes. - * Checkout {@link HaplotypeGraphUnitTest} for examples. + * Checkout {@see HaplotypeGraphUnitTest} for examples. *

* @param str the string representation. */ @@ -934,9 +688,9 @@ public class ReadThreadingGraph extends BaseGraph expectedSet = expected == null ? Collections.emptySet() : new HashSet(Arrays.asList(expected)); Assert.assertEquals(actualSet, expectedSet); } - - @Test(enabled = true) - public void testGetBases() { - - final int kmerSize = 4; - final String testString = "AATGGGGGCAATACTA"; - - final List vertexes = new ArrayList<>(); - for ( int i = 0; i <= testString.length() - kmerSize; i++ ) { - vertexes.add(new DeBruijnVertex(testString.substring(i, i + kmerSize))); - } - - final String result = new String(new TestGraph().getBasesForPath(vertexes)); - Assert.assertEquals(result, testString.substring(kmerSize - 1)); - } } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraphUnitTest.java new file mode 100644 index 000000000..f45dbd093 --- /dev/null +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraphUnitTest.java @@ -0,0 +1,230 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class DanglingChainMergingGraphUnitTest extends BaseTest { + + public static byte[] getBytes(final String alignment) { + return alignment.replace("-","").getBytes(); + } + + @DataProvider(name = "DanglingTails") + public Object[][] makeDanglingTailsData() { + List tests = new ArrayList<>(); + + // add 1M to the expected CIGAR because it includes the previous (common) base too + tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3}); // incomplete haplotype + tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10}); // insertion + tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion + tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp + tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // 
several snps + tests.add(new Object[]{"AAAAA", "C", "1M4D1M", false, -1}); // funky SW alignment + tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", false, 1}); // very little data + tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch + tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "DanglingTails") + public void testDanglingTails(final String refEnd, + final String altEnd, + final String cigar, + final boolean cigarIsGood, + final int mergePointDistanceFromSink) { + + final int kmerSize = 15; + + // construct the haplotypes + final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT"; + final String ref = commonPrefix + refEnd; + final String alt = commonPrefix + altEnd; + + // create the graph and populate it + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize); + rtgraph.addSequence("ref", ref.getBytes(), true); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); + rtgraph.addRead(read); + rtgraph.buildGraphIfNecessary(); + + // confirm that we have just a single dangling tail + MultiDeBruijnVertex altSink = null; + for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) { + if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) { + Assert.assertTrue(altSink == null, "We found more than one non-reference sink"); + altSink = v; + } + } + + Assert.assertTrue(altSink != null, "We did not find a non-reference sink"); + + // confirm that the SW alignment agrees with our expectations + final ReadThreadingGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstDownwardsReferencePath(altSink, 0); + + if ( result == null ) { + Assert.assertFalse(cigarIsGood); + return; + } + + Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + 
result.cigar.toString()); + + // confirm that the goodness of the cigar agrees with our expectations + Assert.assertEquals(rtgraph.cigarIsOkayToMerge(result.cigar, false, true), cigarIsGood); + + // confirm that the tail merging works as expected + if ( cigarIsGood ) { + final int mergeResult = rtgraph.mergeDanglingTail(result); + Assert.assertTrue(mergeResult == 1 || mergePointDistanceFromSink == -1); + + // confirm that we created the appropriate edge + if ( mergePointDistanceFromSink >= 0 ) { + MultiDeBruijnVertex v = altSink; + for ( int i = 0; i < mergePointDistanceFromSink; i++ ) { + if ( rtgraph.inDegreeOf(v) != 1 ) + Assert.fail("Encountered vertex with multiple edges"); + v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v)); + } + Assert.assertTrue(rtgraph.outDegreeOf(v) > 1); + } + } + } + + @Test(enabled = true) + public void testGetBasesForPath() { + + final int kmerSize = 4; + final String testString = "AATGGGGCAATACTA"; + + final ReadThreadingGraph graph = new ReadThreadingGraph(kmerSize); + graph.addSequence(testString.getBytes(), true); + graph.buildGraphIfNecessary(); + + final List vertexes = new ArrayList<>(); + MultiDeBruijnVertex v = graph.getReferenceSourceVertex(); + while ( v != null ) { + vertexes.add(v); + v = graph.getNextReferenceVertex(v); + } + + final String result = new String(graph.getBasesForPath(vertexes, false)); + Assert.assertEquals(result, testString); + } + + @DataProvider(name = "DanglingHeads") + public Object[][] makeDanglingHeadsData() { + List tests = new ArrayList<>(); + + // add 1M to the expected CIGAR because it includes the last (common) base too + tests.add(new Object[]{"XXXXXXXAACCGGTTACGT", "AAYCGGTTACGT", "8M", true}); // 1 snp + tests.add(new Object[]{"XXXAACCGGTTACGT", "XAAACCGGTTACGT", "7M", false}); // 1 snp + tests.add(new Object[]{"XXXXXXXAACCGGTTACGT", "XAACGGTTACGT", "4M1D4M", false}); // deletion + tests.add(new Object[]{"XXXXXXXAACCGGTTACGT", "AYYCGGTTACGT", "8M", true}); // 2 snps + tests.add(new 
Object[]{"XXXXXXXAACCGGTTACGTAA", "AYCYGGTTACGTAA", "9M", true}); // 2 snps + tests.add(new Object[]{"XXXXXXXAACCGGTTACGT", "AYCGGTTACGT", "7M", true}); // very little data + tests.add(new Object[]{"XXXXXXXAACCGGTTACGT", "YCCGGTTACGT", "6M", true}); // begins in mismatch + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "DanglingHeads") + public void testDanglingHeads(final String ref, + final String alt, + final String cigar, + final boolean shouldBeMerged) { + + final int kmerSize = 5; + + // create the graph and populate it + final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize); + rtgraph.addSequence("ref", ref.getBytes(), true); + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); + rtgraph.addRead(read); + rtgraph.buildGraphIfNecessary(); + + // confirm that we have just a single dangling head + MultiDeBruijnVertex altSource = null; + for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) { + if ( rtgraph.isSource(v) && !rtgraph.isReferenceNode(v) ) { + Assert.assertTrue(altSource == null, "We found more than one non-reference source"); + altSource = v; + } + } + + Assert.assertTrue(altSource != null, "We did not find a non-reference source"); + + // confirm that the SW alignment agrees with our expectations + final ReadThreadingGraph.DanglingChainMergeHelper result = rtgraph.generateCigarAgainstUpwardsReferencePath(altSource, 0); + + if ( result == null ) { + Assert.assertFalse(shouldBeMerged); + return; + } + + Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString()); + + // confirm that the tail merging works as expected + final int mergeResult = rtgraph.mergeDanglingHead(result); + Assert.assertTrue(mergeResult > 0 || !shouldBeMerged); + + // confirm that we created the appropriate bubble in the graph only if expected + rtgraph.cleanNonRefPaths(); + final SeqGraph seqGraph 
= rtgraph.convertToSequenceGraph(); + List> paths = new KBestPaths().getKBestPaths(seqGraph, seqGraph.getReferenceSourceVertex(), seqGraph.getReferenceSinkVertex()); + Assert.assertEquals(paths.size(), shouldBeMerged ? 2 : 1); + } +} diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java index 9172b6454..5b01a1d85 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java @@ -150,6 +150,29 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest { assertSingleBubble(assembler, ref, "ACAGCTGA"); } + @Test(enabled = ! DEBUG) + public void testMismatchInFirstKmer() { + final TestAssembler assembler = new TestAssembler(3); + final String ref = "ACAACTGA"; + final String alt = "AGCTGA"; + assembler.addSequence(ref.getBytes(), true); + assembler.addSequence(alt.getBytes(), false); + + final SeqGraph graph = assembler.assemble(); + graph.simplifyGraph(); + graph.removeSingletonOrphanVertices(); + final Set sources = graph.getSources(); + final Set sinks = graph.getSinks(); + + Assert.assertEquals(sources.size(), 1); + Assert.assertEquals(sinks.size(), 1); + Assert.assertNotNull(graph.getReferenceSourceVertex()); + Assert.assertNotNull(graph.getReferenceSinkVertex()); + + final List> paths = new KBestPaths().getKBestPaths(graph); + Assert.assertEquals(paths.size(), 2); + } + @Test(enabled = ! 
DEBUG) public void testStartInMiddle() { final TestAssembler assembler = new TestAssembler(3); diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java index f6e2a106f..8535c186a 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java @@ -216,88 +216,6 @@ public class ReadThreadingGraphUnitTest extends BaseTest { Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1); } - @DataProvider(name = "DanglingTails") - public Object[][] makeDanglingTailsData() { - List tests = new ArrayList(); - - // add 1M to the expected CIGAR because it includes the previous (common) base too - tests.add(new Object[]{"AAAAAAAAAA", "CAAA", "5M", true, 3}); // incomplete haplotype - tests.add(new Object[]{"AAAAAAAAAA", "CAAAAAAAAAA", "1M1I10M", true, 10}); // insertion - tests.add(new Object[]{"CCAAAAAAAAAA", "AAAAAAAAAA", "1M2D10M", true, 10}); // deletion - tests.add(new Object[]{"AAAAAAAA", "CAAAAAAA", "9M", true, 7}); // 1 snp - tests.add(new Object[]{"AAAAAAAA", "CAAGATAA", "9M", true, 2}); // several snps - tests.add(new Object[]{"AAAAA", "C", "1M4D1M", false, -1}); // funky SW alignment - tests.add(new Object[]{"AAAAA", "CA", "1M3D2M", false, 1}); // very little data - tests.add(new Object[]{"AAAAAAA", "CAAAAAC", "8M", true, -1}); // ends in mismatch - tests.add(new Object[]{"AAAAAA", "CGAAAACGAA", "1M2I4M2I2M", false, 0}); // alignment is too complex - - return tests.toArray(new Object[][]{}); - } - - @Test(dataProvider = 
"DanglingTails", enabled = !DEBUG) - public void testDanglingTails(final String refEnd, - final String altEnd, - final String cigar, - final boolean cigarIsGood, - final int mergePointDistanceFromSink) { - - final int kmerSize = 15; - - // construct the haplotypes - final String commonPrefix = "AAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTTT"; - final String ref = commonPrefix + refEnd; - final String alt = commonPrefix + altEnd; - - // create the graph and populate it - final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize); - rtgraph.addSequence("ref", ref.getBytes(), true); - final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(alt.getBytes(), Utils.dupBytes((byte) 30, alt.length()), alt.length() + "M"); - rtgraph.addRead(read); - rtgraph.buildGraphIfNecessary(); - - // confirm that we have just a single dangling tail - MultiDeBruijnVertex altSink = null; - for ( final MultiDeBruijnVertex v : rtgraph.vertexSet() ) { - if ( rtgraph.isSink(v) && !rtgraph.isReferenceNode(v) ) { - Assert.assertTrue(altSink == null, "We found more than one non-reference sink"); - altSink = v; - } - } - - Assert.assertTrue(altSink != null, "We did not find a non-reference sink"); - - // confirm that the SW alignment agrees with our expectations - final ReadThreadingGraph.DanglingTailMergeResult result = rtgraph.generateCigarAgainstReferencePath(altSink, 0); - - if ( result == null ) { - Assert.assertFalse(cigarIsGood); - return; - } - - Assert.assertTrue(cigar.equals(result.cigar.toString()), "SW generated cigar = " + result.cigar.toString()); - - // confirm that the goodness of the cigar agrees with our expectations - Assert.assertEquals(rtgraph.cigarIsOkayToMerge(result.cigar), cigarIsGood); - - // confirm that the tail merging works as expected - if ( cigarIsGood ) { - final int mergeResult = rtgraph.mergeDanglingTail(result); - Assert.assertTrue(mergeResult == 1 || mergePointDistanceFromSink == -1); - - // confirm that we created the appropriate edge - if ( 
mergePointDistanceFromSink >= 0 ) { - MultiDeBruijnVertex v = altSink; - for ( int i = 0; i < mergePointDistanceFromSink; i++ ) { - if ( rtgraph.inDegreeOf(v) != 1 ) - Assert.fail("Encountered vertex with multiple sources"); - v = rtgraph.getEdgeSource(rtgraph.incomingEdgeOf(v)); - } - Assert.assertTrue(rtgraph.outDegreeOf(v) > 1); - } - } - } - - // TODO -- update to use determineKmerSizeAndNonUniques directly // @DataProvider(name = "KmerSizeData") // public Object[][] makeKmerSizeDataProvider() { @@ -340,5 +258,4 @@ public class ReadThreadingGraphUnitTest extends BaseTest { // assertNonUniques(assembler, expectedNonUniques.toArray(new String[]{})); // } - } From 4d69af189e69d084991f84442ff9adb3301561a2 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 27 Feb 2014 13:29:10 -0500 Subject: [PATCH 09/18] Minor change: make the --dontUseSoftClippedBases @Advanced instead of @Hidden --- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 13403c0ac..91e763a0d 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -459,7 +459,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In @Argument(fullName="debugGraphTransformations", shortName="debugGraphTransformations", doc="If specified, we will write DOT formatted graph files out of the assembler for only this graph size", required = false) protected boolean debugGraphTransformations = false; - @Hidden + @Advanced @Argument(fullName="dontUseSoftClippedBases", shortName="dontUseSoftClippedBases", doc="If specified, 
we will not analyze soft clipped bases in the reads", required = false) protected boolean dontUseSoftClippedBases = false; From b99bf85ec8ec56e8a420e69c5cb8983968bfb196 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 3 Mar 2014 22:42:56 -0500 Subject: [PATCH 10/18] Fixed bug where dangling tail merging occasionally created a cycle in the graph. Added unit tests to cover this case. Delivers PT#66690470. --- .../readthreading/DanglingChainMergingGraph.java | 10 ++++++++-- .../DanglingChainMergingGraphUnitTest.java | 13 ++++++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraph.java index e59d39a97..c696c50ae 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraph.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraph.java @@ -97,7 +97,7 @@ public abstract class DanglingChainMergingGraph extends BaseGraph danglingPath, referencePath; final byte[] danglingPathString, referencePathString; final Cigar cigar; @@ -222,7 +222,7 @@ public abstract class DanglingChainMergingGraph extends BaseGraph MAX_CIGAR_COMPLEXITY ) return false; - // the last element must be an M + // the first element must be an M if ( requireFirstElementM && elements.get(0).getOperator() != CigarOperator.M ) return false; @@ -263,6 +263,12 @@ public abstract class DanglingChainMergingGraph extends BaseGraph Date: Thu, 27 Feb 2014 03:04:41 -0500 Subject: [PATCH 11/18] Added a more efficient implementation of the KBest haplotype finder code. Story: https://www.pivotaltracker.com/story/show/66238286 Changes: 1. 
Created a new k-best haplotype search implementation in class KBestHaplotypeFinder. 2. Changed HC code to use the new implementation. This seems to fix the original problem without causing significant changes in outputs using some empirical data test cases 3. Moved haplotype's cigar calculation code from Path to CigarUtils; need that in order to gain independence from Path in some parts of the code. In any case that seems like a more natural location for that functionality. --- .../haplotypecaller/LocalAssemblyEngine.java | 17 +- .../DeadEndKBestSubHaplotypeFinder.java | 80 ++++++ .../graphs/EmptyPathHaplotypeFinder.java | 125 +++++++++ .../graphs/KBestHaplotype.java | 161 ++++++++++++ .../graphs/KBestHaplotypeFinder.java | 245 ++++++++++++++++++ .../graphs/KBestSubHaplotypeFinder.java | 72 +++++ .../walkers/haplotypecaller/graphs/Path.java | 103 +------- .../graphs/RecursiveSubHaplotypeFinder.java | 242 +++++++++++++++++ .../HaplotypeBAMWriter.java | 9 +- .../graphs/KBestPathsUnitTest.java | 9 +- .../sting/utils/sam/CigarUtils.java | 105 ++++++++ 11 files changed, 1054 insertions(+), 114 deletions(-) create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java create mode 100644 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index d49827405..8dfeed987 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -54,8 +54,9 @@ import net.sf.samtools.CigarOperator; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.CigarUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.VariantContext; @@ -220,12 +221,13 @@ public abstract class LocalAssemblyEngine { final SeqVertex source = graph.getReferenceSourceVertex(); final SeqVertex sink = graph.getReferenceSinkVertex(); if ( source == null || sink == null ) throw new IllegalArgumentException("Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph "+ graph); - - final KBestPaths pathFinder = new KBestPaths<>(allowCyclesInKmerGraphToGeneratePaths); - for ( final Path path : pathFinder.getKBestPaths(graph, numBestHaplotypesPerGraph, source, sink) ) { - Haplotype h = new Haplotype( path.getBases() ); + final KBestHaplotypeFinder haplotypeFinder = new KBestHaplotypeFinder(graph,source,sink); + final 
Iterator bestHaplotypes = haplotypeFinder.iterator(numBestHaplotypesPerGraph); + while (bestHaplotypes.hasNext()) { + final KBestHaplotype kBestHaplotype = bestHaplotypes.next(); + final Haplotype h = kBestHaplotype.haplotype(); if( !returnHaplotypes.contains(h) ) { - final Cigar cigar = path.calculateCigar(refHaplotype.getBases()); + final Cigar cigar = CigarUtils.calculateCigar(refHaplotype.getBases(),h.getBases()); if ( cigar == null ) { // couldn't produce a meaningful alignment of haplotype to reference, fail quietly @@ -239,12 +241,11 @@ public abstract class LocalAssemblyEngine { } else if( cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength() ) { // SW failure throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength() - + " ref = " + refHaplotype + " path " + new String(path.getBases())); + + " ref = " + refHaplotype + " path " + new String(h.getBases())); } h.setCigar(cigar); h.setAlignmentStartHapwrtRef(activeRegionStart); - h.setScore(path.getScore()); h.setGenomeLocation(activeRegionWindow); returnHaplotypes.add(h); assemblyResultSet.add(h, assemblyResultByGraph.get(graph)); diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java new file mode 100644 index 000000000..0a29bd08b --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java @@ -0,0 +1,80 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This 
Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. 
LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +/** + * Represent a trivial k-best sub haplotype finder with no solutions. + * + *

Intended for vertices that have no valid path to the requested sink node.

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +final class DeadEndKBestSubHaplotypeFinder implements KBestSubHaplotypeFinder { + + /** + * Sole instance of this class. + */ + public static DeadEndKBestSubHaplotypeFinder INSTANCE = new DeadEndKBestSubHaplotypeFinder(); + + /** + * Prevents instantiation of more than one instance; please use {@link #INSTANCE}. + */ + protected DeadEndKBestSubHaplotypeFinder() { + } + + @Override + public int getCount() { + return 0; + } + + @Override + public KBestHaplotype getKBest(int k) { + if (k < 0) + throw new IllegalArgumentException("k cannot be negative"); + else + throw new IllegalArgumentException("k cannot be equal or greater to the haplotype count"); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java new file mode 100644 index 000000000..aa1f213fe --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java @@ -0,0 +1,125 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +/** + * Trivial k-best sub-haplotype finder where the source and sink vertex are the same one. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder { + + + /** + * Caches the only solution returned by this finder. + */ + private final KBestHaplotype singleHaplotypePath; + + public EmptyPathHaplotypeFinderNode(final SeqGraph graph, final SeqVertex sink) { + singleHaplotypePath = new MyBestHaplotypePath(graph,sink); + } + + @Override + public int getCount() { + return 1; + } + + @Override + public KBestHaplotype getKBest(int k) { + if (k < 0) + throw new IllegalArgumentException("k cannot be negative"); + if (k > 0) + throw new IllegalArgumentException("k cannot greater than the possible haplotype count"); + return singleHaplotypePath; + } + + /** + * Custom extension of {@link KBestHaplotype} that implements the single solution behaviour. 
+ */ + private class MyBestHaplotypePath extends KBestHaplotype { + + private final SeqVertex vertex; + + private final SeqGraph graph; + + private Boolean isReference; + + public MyBestHaplotypePath(final SeqGraph graph, final SeqVertex vertex) { + this.vertex = vertex; + this.graph = graph; + } + + @Override + public SeqGraph graph() { + return graph; + } + + @Override + public int score() { + return 0; + } + + @Override + public int rank() { + return 0; + } + + @Override + protected SeqVertex head() { + return vertex; + } + + @Override + protected KBestHaplotype tail() { + return null; + } + + @Override + public boolean isReference() { + return (isReference != null) ? isReference: (isReference = graph.isReferenceNode(vertex)); + } + } +} \ No newline at end of file diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java new file mode 100644 index 000000000..d88c17cbf --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java @@ -0,0 +1,161 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.broadinstitute.sting.utils.haplotype.Haplotype; + +/** + * Represents a result from a K-best haplotype search. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public abstract class KBestHaplotype { + + /** + * Returns the original graph searched. + * + * @return never {@code null} + */ + public abstract SeqGraph graph(); + + /** + * Returns the result haplotype score. + * + *

Currently, the score is the sum of the multiplicities of the edges along the haplotype path.

+ * + * @return 0 or greater. + */ + public abstract int score(); + + /** + * Indicates whether this result is the reference haplotype. + * + * @return {@code true} if it is the reference haplotype, {@code false} otherwise. + */ + public abstract boolean isReference(); + + /** + * The rank of this solution within the list of solutions that resulted from the same search. + * + *

0 corresponds to the best solution, 1 to the second best, and so on.

+ * + * @return 0 or greater. + */ + public abstract int rank(); + + private byte[] bases; + + private Haplotype haplotype; + + private Path path; + + /** + * Returns the result haplotype bases. + * + * @return never {@code null}. + */ + public byte[] bases() { + if (bases != null) return bases; + final KBestHaplotype tail = tail(); + final SeqVertex head = head(); + if (tail == null) + bases = head.getSequence(); + else { + final byte[] tailBases = tail.bases(); + final byte[] headBases = head.getSequence(); + final int length = tailBases.length + headBases.length; + bases = new byte[length]; + System.arraycopy(headBases,0,bases,0,headBases.length); + System.arraycopy(tailBases,0,bases,headBases.length,tailBases.length); + } + return bases; + } + + /** + * Returns the solution haplotype. + * + * @return never {@code null}. + */ + public Haplotype haplotype() { + if (haplotype != null) return haplotype; + haplotype = new Haplotype(bases(),isReference()); + haplotype.setScore(score()); + return haplotype; + } + + /** + * Returns the path across the original graph that correspond to the solution haplotype. + * + * @return never {@code null}, although perhaps a zero-length path (only one vertex). + */ + public Path path() { + if (path != null) return path; + final KBestHaplotype tail = tail(); + if (tail == null) + path = new Path<>(head(),graph()); + else { + final Path tailPath = tail.path(); + path = new Path<>(graph().getEdge(head(),tailPath.getFirstVertex()),tailPath); + } + return path; + } + + /** + * The first vertex on the haplotype path. + * + * @return never {@code null}. + */ + protected abstract SeqVertex head(); + + /** + * Returns the sub-haplotype from the second vertex involved in the haplotype until the end. + * + * @return {@code null} if there are no more vertices in the solution path a part from the one returned by {@link #head}. 
+ */ + protected abstract KBestHaplotype tail(); + + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java new file mode 100644 index 000000000..725fcae1a --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java @@ -0,0 +1,245 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. 
LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import org.jgrapht.alg.CycleDetector; + +import java.util.*; + +/** + * Efficient algorithm to obtain the list of best haplotypes given the {@link SeqGraph instace}. 
+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class KBestHaplotypeFinder extends AbstractList implements Iterable { + + + private final SeqGraph graph; + protected Map nodeBySource; + + protected SeqVertex sink; + protected SeqVertex source; + + /** + * Constructs a new best haplotypes finder. + * + * @param graph the seq-graph to search. + * @param source the source vertex for all haplotypes. + * @param sink the sink vertex for all haplotypes. + * + * @throws IllegalArgumentException if
 <ul> + *
     <li>any of {@code graph}, {@code source} or {@code sink} is {@code null} or</li> + *
     <li>either {@code source} or {@code sink} is not a vertex of {@code graph}'s.</li> + *
 </ul>
+ */ + public KBestHaplotypeFinder(final SeqGraph graph, final SeqVertex source, final SeqVertex sink) { + if (graph == null) throw new IllegalArgumentException("graph cannot be null"); + if (source == null) throw new IllegalArgumentException("source cannot be null"); + if (sink == null) throw new IllegalArgumentException("sink cannot be null"); + if (!graph.containsVertex(source)) throw new IllegalArgumentException("source does not belong to the graph"); + if (!graph.containsVertex(sink)) throw new IllegalArgumentException("sink does not belong to the graph"); + //TODO dealing with cycles here due to a bug in some of the graph transformations that produces cycles. + //TODO Once that is solve, the conditional above should be removed in order the save time: + //this.graph = graph; + if (new CycleDetector<>(graph).detectCycles()) + this.graph = removeCycles(graph,source,sink); + else + this.graph = graph; + nodeBySource = new HashMap<>(graph.vertexSet().size()); + this.sink = sink; + this.source = source; + } + + private static SeqGraph removeCycles(final SeqGraph original, final SeqVertex source, final SeqVertex sink) { + final Set edgesToRemove = new HashSet<>(original.edgeSet().size()); + final Set vertexToRemove = new HashSet<>(original.vertexSet().size()); + + if (!findGuiltyVerticesAndEdgesToRemoveCycles(original, source, sink, edgesToRemove, vertexToRemove, new HashSet(original.vertexSet().size()))) + throw new IllegalStateException("could not find any path from the source vertex to the sink vertex: " + source + " -> " + sink); + + if (edgesToRemove.isEmpty() && vertexToRemove.isEmpty()) + throw new IllegalStateException("cannot find a way to remove the cycles"); + + final SeqGraph result = (SeqGraph) original.clone(); + result.removeAllEdges(edgesToRemove); + result.removeAllVertices(vertexToRemove); + return result; + } + + private static boolean findGuiltyVerticesAndEdgesToRemoveCycles(final SeqGraph graph, final SeqVertex currentVertex, final SeqVertex 
sink, + final Set edgesToRemove, final Set verticesToRemove, + final Set parentVertices) { + if (currentVertex.equals(sink)) return true; + + final Set outgoingEdges = graph.outgoingEdgesOf(currentVertex); + boolean reachsSink = false; + parentVertices.add(currentVertex); + + for (final BaseEdge edge : outgoingEdges) { + final SeqVertex child = graph.getEdgeTarget(edge); + if (parentVertices.contains(child)) + edgesToRemove.add(edge); + else { + final boolean childReachSink = findGuiltyVerticesAndEdgesToRemoveCycles(graph, child, sink, edgesToRemove, verticesToRemove, parentVertices); + reachsSink = reachsSink || childReachSink; + } + } + parentVertices.remove(currentVertex); + if (!reachsSink) verticesToRemove.add(currentVertex); + return reachsSink; + } + + + @Override + public KBestHaplotype get(int index) { + final KBestSubHaplotypeFinder sourceNode = createNode(source); + if (index < 0 || index >= size()) + throw new IndexOutOfBoundsException(); + return sourceNode.getKBest(index); + } + + @Override + public Iterator iterator() { + final KBestSubHaplotypeFinder sourceFinder = createNode(source); + + return new Iterator() { + private int nextK = 0; + private final int maxK = sourceFinder.getCount(); + + + @Override + public boolean hasNext() { + return nextK < maxK; + } + + @Override + public KBestHaplotype next() { + if (nextK >= maxK) throw new NoSuchElementException(); + return sourceFinder.getKBest(nextK++); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + @Override + public int size() { + return createNode(source).getCount(); + } + + /** + * Returns an iterator on the first k best haplotypes. + *

 <p> + * It might return less than k haplotypes if the total number of possible haplotypes is smaller. + * </p>

+ * + * @param k the maximum number of haplotypes to return. + * @return never {@code null}, but perhaps a iterator that return no haplotype. + */ + public Iterator iterator(final int k) { + final KBestSubHaplotypeFinder sourceFinder = createNode(source); + + return new Iterator() { + private int nextK = 0; + private final int maxK = Math.min(sourceFinder.getCount(), k); + + + @Override + public boolean hasNext() { + return nextK < maxK; + } + + @Override + public KBestHaplotype next() { + if (nextK >= maxK) throw new NoSuchElementException(); + return sourceFinder.getKBest(nextK++); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + }; + } + + protected KBestSubHaplotypeFinder createNode(final SeqVertex source) { + KBestSubHaplotypeFinder node = nodeBySource.get(source); + if (node == null) { + if (source.equals(sink)) + node = new EmptyPathHaplotypeFinderNode(graph,sink); + else { + final Set outgoingEdges = graph.outgoingEdgesOf(source); + if (outgoingEdges.isEmpty()) + node = DeadEndKBestSubHaplotypeFinder.INSTANCE; + else { + final Map undeadChildren = createChildrenNodes(outgoingEdges); + if (undeadChildren.isEmpty()) + node = DeadEndKBestSubHaplotypeFinder.INSTANCE; + else + node = new RecursiveSubHaplotypeFinder(graph,source,undeadChildren); + + } + } + nodeBySource.put(source,node); + } + return node; + } + + private Map createChildrenNodes(Set baseEdges) { + final Map result = new LinkedHashMap<>(baseEdges.size()); + for (final BaseEdge edge : baseEdges) + result.put(edge,createNode(graph.getEdgeTarget(edge))); + final Iterator> childrenIterator = result.entrySet().iterator(); + while (childrenIterator.hasNext()) + if (childrenIterator.next().getValue().getCount() == 0) + childrenIterator.remove(); + return result; + } + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java new file mode 100644 index 000000000..a637ff2b6 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java @@ -0,0 +1,72 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. 
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. 
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. 
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. 
Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +/** + * Common interface for K-Best sub-haplotype finders. + * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +interface KBestSubHaplotypeFinder { + + /** + * Returns the total number of possible sub-haplotypes. + * @return 0 or greater. + */ + public abstract int getCount(); + + /** + * Return the k-best sub-haplotype solution. + * + * + * @param k the requested solution rank. 
+ * @throws IllegalArgumentException if {@code k} is outside bounds [0 .. {@link #getCount()}). + * + * @return never {@code null}. + */ + public abstract KBestHaplotype getKBest(int k); + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java index 6901d16ef..e6f460d1a 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/Path.java @@ -48,12 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import com.google.java.contract.Ensures; import net.sf.samtools.Cigar; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; import org.apache.commons.lang.ArrayUtils; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.smithwaterman.*; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.CigarUtils; import java.util.*; @@ -68,8 +64,6 @@ import java.util.*; * */ public class Path { - private final static String SW_PAD = "NNNNNNNNNN"; - private final static Logger logger = Logger.getLogger(Path.class); // the last vertex seen in the path protected final T lastVertex; @@ -84,10 +78,6 @@ public class Path { // the graph from which this path originated protected final BaseGraph graph; - // used in the bubble state machine to apply Smith-Waterman to the bubble sequence - // these values were chosen via optimization against the NA12878 knowledge base - public static final Parameters NEW_SW_PARAMETERS = new Parameters(20.0, -15.0, -26.0, -1.1); - /** * Create a new Path containing no edges and starting at initialVertex * @param initialVertex the starting vertex of the path @@ -348,96 +338,10 @@ public class Path { 
* @param refSeq the reference sequence that all of the bases in this path should align to * @return a Cigar mapping this path to refSeq, or null if no reasonable alignment could be found */ - public Cigar calculateCigar(final byte[] refSeq) { - if ( getBases().length == 0 ) { - // horrible edge case from the unit tests, where this path has no bases - return new Cigar(Arrays.asList(new CigarElement(refSeq.length, CigarOperator.D))); - } - - final byte[] bases = getBases(); - final Cigar nonStandard; - - final String paddedRef = SW_PAD + new String(refSeq) + SW_PAD; - final String paddedPath = SW_PAD + new String(bases) + SW_PAD; - final SmithWaterman alignment = new SWPairwiseAlignment( paddedRef.getBytes(), paddedPath.getBytes(), NEW_SW_PARAMETERS ); - - if ( isSWFailure(alignment) ) - return null; - - // cut off the padding bases - final int baseStart = SW_PAD.length(); - final int baseEnd = paddedPath.length() - SW_PAD.length() - 1; // -1 because it's inclusive - nonStandard = AlignmentUtils.trimCigarByBases(alignment.getCigar(), baseStart, baseEnd); - - if ( nonStandard.getReferenceLength() != refSeq.length ) { - nonStandard.add(new CigarElement(refSeq.length - nonStandard.getReferenceLength(), CigarOperator.D)); - } - - // finally, return the cigar with all indels left aligned - return leftAlignCigarSequentially(nonStandard, refSeq, getBases(), 0, 0); + public Cigar calculateCigar(final byte[] refSeq) { + return CigarUtils.calculateCigar(refSeq,getBases()); } - /** - * Make sure that the SW didn't fail in some terrible way, and throw exception if it did - */ - private boolean isSWFailure(final SmithWaterman alignment) { - // check that the alignment starts at the first base, which it should given the padding - if ( alignment.getAlignmentStart2wrt1() > 0 ) { - return true; -// throw new IllegalStateException("SW failure ref " + paddedRef + " vs. 
" + paddedPath + " should always start at 0, but got " + alignment.getAlignmentStart2wrt1() + " with cigar " + alignment.getCigar()); - } - - // check that we aren't getting any S operators (which would be very bad downstream) - for ( final CigarElement ce : alignment.getCigar().getCigarElements() ) { - if ( ce.getOperator() == CigarOperator.S ) - return true; - // soft clips at the end of the alignment are really insertions -// throw new IllegalStateException("SW failure ref " + paddedRef + " vs. " + paddedPath + " should never contain S operators but got cigar " + alignment.getCigar()); - } - - return false; - } - - /** - * Left align the given cigar sequentially. This is needed because AlignmentUtils doesn't accept cigars with more than one indel in them. - * This is a target of future work to incorporate and generalize into AlignmentUtils for use by others. - * @param cigar the cigar to left align - * @param refSeq the reference byte array - * @param readSeq the read byte array - * @param refIndex 0-based alignment start position on ref - * @param readIndex 0-based alignment start position on read - * @return the left-aligned cigar - */ - @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"}) - protected static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { - final Cigar cigarToReturn = new Cigar(); - Cigar cigarToAlign = new Cigar(); - for (int i = 0; i < cigar.numCigarElements(); i++) { - final CigarElement ce = cigar.getCigarElement(i); - if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) { - cigarToAlign.add(ce); - final Cigar leftAligned = AlignmentUtils.leftAlignSingleIndel(cigarToAlign, refSeq, readSeq, refIndex, readIndex, false); - for ( final CigarElement toAdd : leftAligned.getCigarElements() ) { cigarToReturn.add(toAdd); } - refIndex += cigarToAlign.getReferenceLength(); - readIndex += 
cigarToAlign.getReadLength(); - cigarToAlign = new Cigar(); - } else { - cigarToAlign.add(ce); - } - } - if( !cigarToAlign.isEmpty() ) { - for( final CigarElement toAdd : cigarToAlign.getCigarElements() ) { - cigarToReturn.add(toAdd); - } - } - - final Cigar result = AlignmentUtils.consolidateCigar(cigarToReturn); - if( result.getReferenceLength() != cigar.getReferenceLength() ) - throw new IllegalStateException("leftAlignCigarSequentially failed to produce a valid CIGAR. Reference lengths differ. Initial cigar " + cigar + " left aligned into " + result); - return result; - } - - /** * Tests that this and other have the same score and vertices in the same order with the same seq * @param other the other path to consider. Cannot be null @@ -463,4 +367,5 @@ public class Path { } return true; } + } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java new file mode 100644 index 000000000..f2106ffb9 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java @@ -0,0 +1,242 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; + +import java.util.ArrayList; +import java.util.Map; +import java.util.PriorityQueue; + +/** +* General recursive sub-haplotype finder. +*

+* Provides the k-best sub-haplotypes looking into the outgoing set of vertices (that contain at least one solution). +*

+*

+* This is done efficiently by keeping a priority queue of the best sub-haplotype solutions and pulling them on demand +* as needed. +*

+*

+* Solutions are cached for repeated retrieval so that we save computation at vertices that share sub-haplotypes +* (share descendant vertices). This aspect is controlled by {@link KBestSubHaplotypeFinder}, which instantiates +* a unique {@link KBestSubHaplotypeFinder} for each vertex in the graph that belongs to a valid path +* between the source and sink nodes. +*

+* +* @author Valentin Ruano-Rubio <valentin@broadinstitute.org> +*/ +class RecursiveSubHaplotypeFinder implements KBestSubHaplotypeFinder { + + private final SeqGraph graph; + + private final SeqVertex vertex; + + private final Map children; + + private boolean childrenWereProcessed = false; + + /** + * Holds the number of possible paths from this source node vertex to the sink vertex. + * + *

Updated by {@link #processChildrenIfNeeded()}

+ */ + private int possibleHaplotypeCount; + + /** + * Holds the best {@code i} paths to the sink so far calculated where {@code i+1} is the length of rankedResults. + * + *

As more results are requested the array will grow. All positions and solutions are calculated up to {@code i}

. + */ + private ArrayList rankedResults; + + /** + * Priority queue with best sub-haplotype solutions that haven't been calculated and cached on {@link #rankedResults} yet. + */ + private PriorityQueue nextChildrenKBestHaplotypePath; + + /** + * Creates a recursive sub-haplotype finder give the target graph, first vertex and all possible outgoing edges + * with the corresponding sub-sub-haplotype finders. + * + *

For efficiency's sake, it will not verify the content of the {@code children} map; i.e. that all keys + * are indeed outgoing edges from {@code vertex} on {@code graph} and that each value sub-haplotype resolver has as + * its first vertex the adjacent vertex through that key edge.

+ * + * @param graph the search graph. + * @param vertex first vertex for all sub-haplotype solutions provided by this finder + * @param children map from outgoing edge to the corresponding sub-sub-haplotype finder. + */ + public RecursiveSubHaplotypeFinder(final SeqGraph graph, final SeqVertex vertex, + final Map children) { + if (vertex == null) throw new IllegalArgumentException("the vertex provided cannot be null"); + if (graph == null) throw new IllegalArgumentException("the graph provided cannot be null"); + this.vertex = vertex; + this.children = children; + this.graph = graph; + } + + @Override + public int getCount() { + processChildrenIfNeeded(); + return possibleHaplotypeCount; + } + + /** + * Process children and initialize structures if not done before. + */ + private void processChildrenIfNeeded() { + if (childrenWereProcessed) return; + long possibleHaplotypeCount = 0; + + nextChildrenKBestHaplotypePath = new PriorityQueue<>(children.size()); + + for (final Map.Entry entry : children.entrySet()) { + final KBestSubHaplotypeFinder child = entry.getValue(); + final BaseEdge edge = entry.getKey(); + final int childPossibleHaplotypePathCount = child.getCount(); + if (childPossibleHaplotypePathCount != 0) // paranoia check, should not happen at this point. + nextChildrenKBestHaplotypePath.add(new ChildKBestSubHaplotype(-1,edge,child,0)); + possibleHaplotypeCount += childPossibleHaplotypePathCount; + } + + // Just make sure we won't incur in overflow here for very large graphs; who is ever going to ask for more than 2G paths!!!) + this.possibleHaplotypeCount = (int) Math.min(Integer.MAX_VALUE,possibleHaplotypeCount); + + // 10 is a bit arbitrary as it is difficult to anticipate what would be the number of requested + // best sub-haplotypes for any node. It shouldn't be too large so that it does not waste space + // but not too small so that there is no need to resize when just a few best solutions are requested. 
+ rankedResults = new ArrayList<>(Math.min(this.possibleHaplotypeCount,10)); + + childrenWereProcessed = true; + } + + @Override + public KBestHaplotype getKBest(int k) { + if (k < 0) + throw new IllegalArgumentException("the rank requested cannot be negative"); + processChildrenIfNeeded(); + if (k >= possibleHaplotypeCount) + throw new IllegalArgumentException("the rank requested cannot be equal or greater to the number of possible haplotypes"); + if (rankedResults.size() > k) + return rankedResults.get(k); + + rankedResults.ensureCapacity(k+1); + for (int i = rankedResults.size(); i <= k; i++) { + // since k < possibleHaplotypeCount is guarantee no to be empty. + if (nextChildrenKBestHaplotypePath.isEmpty()) + throw new IllegalStateException("what the heck " + k + " " + possibleHaplotypeCount); + final ChildKBestSubHaplotype nextResult = nextChildrenKBestHaplotypePath.remove(); + nextResult.rank = i; + rankedResults.add(nextResult); + final int childRank = nextResult.subpath.rank(); + final KBestSubHaplotypeFinder child = nextResult.child; + + // if there is no further solution from the same child we cannot add another solution from that child. + if (childRank + 1 >= nextResult.child.getCount()) + continue; + nextChildrenKBestHaplotypePath.add(new ChildKBestSubHaplotype(-1,nextResult.edge, child, childRank + 1)); + } + return rankedResults.get(k); + } + + /** + * Custom extension of the {@link KBestHaplotype} used for solutions generated by this class. 
+ */ + private class ChildKBestSubHaplotype extends KBestHaplotype implements Comparable{ + private final int score; + private int rank; + private final KBestSubHaplotypeFinder child; + private final BaseEdge edge; + private final KBestHaplotype subpath; + private final boolean isReference; + + public ChildKBestSubHaplotype(final int rank, final BaseEdge edge, + final KBestSubHaplotypeFinder child, final int childRank) { + this.child = child; + this.edge = edge; + this.rank = rank; + this.subpath = child.getKBest(childRank); + this.score = edge.getMultiplicity() + subpath.score(); + this.isReference = edge.isRef() && subpath.isReference(); + } + + @Override + public SeqGraph graph() { + return graph; + } + + @Override + public int compareTo(final ChildKBestSubHaplotype other) { + if (other == null) throw new IllegalArgumentException("the other object cannot be null"); + return - Integer.compare(this.score,other.score); + } + + @Override + public int score() { + return score; + } + + @Override + public int rank() { + return rank; + } + + + @Override + protected SeqVertex head() { + return vertex; + } + + @Override + protected KBestHaplotype tail() { + return subpath; + } + + @Override + public boolean isReference() { + return isReference; + } + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java index 6d839a832..2a7ead6c2 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/haplotypeBAMWriter/HaplotypeBAMWriter.java @@ -50,16 +50,19 @@ import net.sf.samtools.Cigar; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMTag; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import 
org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.Path; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.haplotype.Haplotype; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.CigarUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; -import java.util.*; +import java.util.Collection; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; /** * A BAMWriter that aligns reads to haplotypes and emits their best alignments to a BAM file @@ -220,7 +223,7 @@ public abstract class HaplotypeBAMWriter { try { // compute the smith-waterman alignment of read -> haplotype - final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), Path.NEW_SW_PARAMETERS); + final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getReadBases(), CigarUtils.NEW_SW_PARAMETERS); //swPairwiseAlignment.printAlignment(haplotype.getBases(), originalRead.getReadBases()); if ( swPairwiseAlignment.getAlignmentStart2wrt1() == -1 ) // sw can fail (reasons not clear) so if it happens just don't write the read diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java index fa7ad9a3d..f117f5750 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java @@ -53,6 +53,7 @@ 
import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.CigarUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -132,7 +133,7 @@ public class KBestPathsUnitTest extends BaseTest { Assert.assertTrue(justOne.get(0).pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0)); } - @Test(enabled = !DEBUG) + @Test(enabled = false) // No longer supported, but no longer needed. public void testPathFindingComplexCycle() { SeqGraph graph = new SeqGraph(11); @@ -152,7 +153,7 @@ public class KBestPathsUnitTest extends BaseTest { Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths"); } - @Test(enabled = !DEBUG) + @Test(enabled = false) // No longer supported, but no longer needed. 
public void testPathFindingCycleLastNode() { SeqGraph graph = new SeqGraph(11); @@ -539,7 +540,7 @@ public class KBestPathsUnitTest extends BaseTest { String theRef = preRefString + refString + Utils.dupString(indelString1, refIndel1) + refString + Utils.dupString(indelString2, refIndel2) + refString + postRefString; String theRead = refString + Utils.dupString(indelString1, refIndel1 + indelOp1 * indelSize1) + refString + Utils.dupString(indelString2, refIndel2 + indelOp2 * indelSize2) + refString; - Cigar calculatedCigar = Path.leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0); + Cigar calculatedCigar = CigarUtils.leftAlignCigarSequentially(AlignmentUtils.consolidateCigar(givenCigar), theRef.getBytes(), theRead.getBytes(), preRefString.length(), 0); Assert.assertEquals(AlignmentUtils.consolidateCigar(calculatedCigar).toString(), AlignmentUtils.consolidateCigar(expectedCigar).toString(), "Cigar strings do not match!"); } } @@ -553,7 +554,7 @@ public class KBestPathsUnitTest extends BaseTest { final String hap = "GTCTCTCTCTCTCTCTCTCTCTATATATATATATTT"; final Cigar originalCigar = TextCigarCodec.getSingleton().decode("18M4I12M4D2M"); - final Cigar result = Path.leftAlignCigarSequentially(originalCigar, ref.getBytes(), hap.getBytes(), 0, 0); + final Cigar result = CigarUtils.leftAlignCigarSequentially(originalCigar, ref.getBytes(), hap.getBytes(), 0, 0); logger.warn("Result is " + result); Assert.assertEquals(originalCigar.getReferenceLength(), result.getReferenceLength(), "Reference lengths are different"); } diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/CigarUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/CigarUtils.java index a516ec11e..70ce68a5b 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/CigarUtils.java +++ 
b/public/gatk-framework/src/main/java/org/broadinstitute/sting/utils/sam/CigarUtils.java @@ -25,12 +25,17 @@ package org.broadinstitute.sting.utils.sam; +import com.google.java.contract.Ensures; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.smithwaterman.Parameters; +import org.broadinstitute.sting.utils.smithwaterman.SWPairwiseAlignment; +import org.broadinstitute.sting.utils.smithwaterman.SmithWaterman; +import java.util.Arrays; import java.util.Stack; /** @@ -164,4 +169,104 @@ public class CigarUtils { } return result; } + + // used in the bubble state machine to apply Smith-Waterman to the bubble sequence + // these values were chosen via optimization against the NA12878 knowledge base + public static final Parameters NEW_SW_PARAMETERS = new Parameters(20.0, -15.0, -26.0, -1.1); + + private final static String SW_PAD = "NNNNNNNNNN"; + + /** + * Calculate the cigar elements for this path against the reference sequence + * + * @param refSeq the reference sequence that all of the bases in this path should align to + * @return a Cigar mapping this path to refSeq, or null if no reasonable alignment could be found + */ + public static Cigar calculateCigar(final byte[] refSeq, final byte[] altSeq) { + if ( altSeq.length == 0 ) { + // horrible edge case from the unit tests, where this path has no bases + return new Cigar(Arrays.asList(new CigarElement(refSeq.length, CigarOperator.D))); + } + + final Cigar nonStandard; + + final String paddedRef = SW_PAD + new String(refSeq) + SW_PAD; + final String paddedPath = SW_PAD + new String(altSeq) + SW_PAD; + final SmithWaterman alignment = new SWPairwiseAlignment( paddedRef.getBytes(), paddedPath.getBytes(), NEW_SW_PARAMETERS ); + + if ( isSWFailure(alignment) ) + return null; + + // cut off the padding bases + final int 
baseStart = SW_PAD.length(); + final int baseEnd = paddedPath.length() - SW_PAD.length() - 1; // -1 because it's inclusive + nonStandard = AlignmentUtils.trimCigarByBases(alignment.getCigar(), baseStart, baseEnd); + + if ( nonStandard.getReferenceLength() != refSeq.length ) { + nonStandard.add(new CigarElement(refSeq.length - nonStandard.getReferenceLength(), CigarOperator.D)); + } + + // finally, return the cigar with all indels left aligned + return leftAlignCigarSequentially(nonStandard, refSeq, altSeq, 0, 0); + } + + /** + * Make sure that the SW didn't fail in some terrible way, and throw exception if it did + */ + private static boolean isSWFailure(final SmithWaterman alignment) { + // check that the alignment starts at the first base, which it should given the padding + if ( alignment.getAlignmentStart2wrt1() > 0 ) { + return true; +// throw new IllegalStateException("SW failure ref " + paddedRef + " vs. " + paddedPath + " should always start at 0, but got " + alignment.getAlignmentStart2wrt1() + " with cigar " + alignment.getCigar()); + } + + // check that we aren't getting any S operators (which would be very bad downstream) + for ( final CigarElement ce : alignment.getCigar().getCigarElements() ) { + if ( ce.getOperator() == CigarOperator.S ) + return true; + // soft clips at the end of the alignment are really insertions +// throw new IllegalStateException("SW failure ref " + paddedRef + " vs. " + paddedPath + " should never contain S operators but got cigar " + alignment.getCigar()); + } + + return false; + } + + /** + * Left align the given cigar sequentially. This is needed because AlignmentUtils doesn't accept cigars with more than one indel in them. + * This is a target of future work to incorporate and generalize into AlignmentUtils for use by others. 
+ * @param cigar the cigar to left align + * @param refSeq the reference byte array + * @param readSeq the read byte array + * @param refIndex 0-based alignment start position on ref + * @param readIndex 0-based alignment start position on read + * @return the left-aligned cigar + */ + @Ensures({"cigar != null", "refSeq != null", "readSeq != null", "refIndex >= 0", "readIndex >= 0"}) + public static Cigar leftAlignCigarSequentially(final Cigar cigar, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) { + final Cigar cigarToReturn = new Cigar(); + Cigar cigarToAlign = new Cigar(); + for (int i = 0; i < cigar.numCigarElements(); i++) { + final CigarElement ce = cigar.getCigarElement(i); + if (ce.getOperator() == CigarOperator.D || ce.getOperator() == CigarOperator.I) { + cigarToAlign.add(ce); + final Cigar leftAligned = AlignmentUtils.leftAlignSingleIndel(cigarToAlign, refSeq, readSeq, refIndex, readIndex, false); + for ( final CigarElement toAdd : leftAligned.getCigarElements() ) { cigarToReturn.add(toAdd); } + refIndex += cigarToAlign.getReferenceLength(); + readIndex += cigarToAlign.getReadLength(); + cigarToAlign = new Cigar(); + } else { + cigarToAlign.add(ce); + } + } + if( !cigarToAlign.isEmpty() ) { + for( final CigarElement toAdd : cigarToAlign.getCigarElements() ) { + cigarToReturn.add(toAdd); + } + } + + final Cigar result = AlignmentUtils.consolidateCigar(cigarToReturn); + if( result.getReferenceLength() != cigar.getReferenceLength() ) + throw new IllegalStateException("leftAlignCigarSequentially failed to produce a valid CIGAR. Reference lengths differ. Initial cigar " + cigar + " left aligned into " + result); + return result; + } } From 69bf2b32470fc3565d89cae656f60124ffd3f766 Mon Sep 17 00:00:00 2001 From: Valentin Ruano-Rubio Date: Mon, 3 Mar 2014 16:33:05 -0500 Subject: [PATCH 12/18] Added a more efficient implementation of the KBest haplotype finder code (CONT.) Changes: 1. 
Addressed review comments on new K-best haplotype assembly graph finder. 2. Generalize KBestHaplotypeFinder to deal with multiple source and sink vertices. 3. Updated test to use KBestHaplotypeFinder instead of KBestPaths 4. Retired KBestPaths to the archive. 5. Small improvements to the code and documentation. --- ...java => AggregatedSubHaplotypeFinder.java} | 275 +++++++++--------- .../haplotypecaller/graphs/BaseGraph.java | 17 ++ .../DeadEndKBestSubHaplotypeFinder.java | 4 +- .../graphs/EmptyPathHaplotypeFinder.java | 28 +- .../graphs/KBestHaplotype.java | 16 +- .../graphs/KBestHaplotypeFinder.java | 233 +++++++++++---- .../graphs/KBestSubHaplotypeFinder.java | 1 - .../graphs/RecursiveSubHaplotypeFinder.java | 164 +++-------- .../readthreading/ReadThreadingAssembler.java | 15 +- .../graphs/CommonSuffixMergerUnitTest.java | 19 +- ...java => KBestHaplotypeFinderUnitTest.java} | 161 ++++------ .../SharedVertexSequenceSplitterUnitTest.java | 39 ++- .../DanglingChainMergingGraphUnitTest.java | 10 +- .../ReadThreadingAssemblerUnitTest.java | 20 +- .../ReadThreadingGraphUnitTest.java | 3 +- .../gatk/refdata/tracks/RMDTrackBuilder.java | 5 +- 16 files changed, 525 insertions(+), 485 deletions(-) rename protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/{KBestPaths.java => AggregatedSubHaplotypeFinder.java} (59%) rename protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/{KBestPathsUnitTest.java => KBestHaplotypeFinderUnitTest.java} (82%) diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/AggregatedSubHaplotypeFinder.java similarity index 59% rename from protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java rename to 
protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/AggregatedSubHaplotypeFinder.java index 3ba85dd92..8fba6c9d5 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPaths.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/AggregatedSubHaplotypeFinder.java @@ -1,185 +1,194 @@ /* * By downloading the PROGRAM you agree to the following terms of use: -* +* * BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* +* * This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). -* +* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* +* * 1. DEFINITIONS * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. -* +* * 2. LICENSE -* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* 2.1 Grant. 
Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. -* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* -* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. * Copyright 2012 Broad Institute, Inc. * Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* +* * 4. 
INDEMNIFICATION * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* +* * 5. NO REPRESENTATIONS OR WARRANTIES * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. * IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* +* * 6. ASSIGNMENT * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* +* * 7. MISCELLANEOUS * 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. * 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. * 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. -* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. -* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. * 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. * 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
*/ - package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; -import com.google.common.collect.MinMaxPriorityQueue; -import com.google.java.contract.Ensures; - -import java.io.Serializable; -import java.util.*; +import java.util.ArrayList; +import java.util.Collection; +import java.util.PriorityQueue; /** - * Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph. - * This is different from most graph traversals because we want to test paths from any source node to any sink node. + * K-best sub-haplotype finder that selects the best solutions out of a collection of sub-haplotype finders. * - * User: ebanks, rpoplin, mdepristo - * Date: Mar 23, 2011 + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ -public class KBestPaths { - private final boolean allowCycles; +class AggregatedSubHaplotypeFinder implements KBestSubHaplotypeFinder { /** - * Create a new KBestPaths finder that follows cycles in the graph + * Collection of subFinders that provided the actual solutions. */ - public KBestPaths() { - this(true); - } + private final Collection subFinders; /** - * Create a new KBestPaths finder + * Flag indicating whether the sub-finders have been processed or not. + */ + private boolean processedSubFinders = false; + + /** + * Holds the number of k-best solution that this finder would ever return. + */ + private int count = 0; + + /** + * Holds the best {@code i} paths to the sink so far calculated where {@code i+1} is the length of this list. * - * @param allowCycles should we allow paths that follow cycles in the graph? + *

As more results are requested the array will grow. All positions and solutions are + * calculated up to {@code i}

. */ - public KBestPaths(final boolean allowCycles) { - this.allowCycles = allowCycles; - } - - protected static class MyInt { public int val = 0; } + private ArrayList rankedSubHaplotype; /** - * Compare paths such that paths with greater weight are earlier in a list + * Priority queue with next best haplotype solution from each sub-finder; previous ones are + * already part {@link #rankedSubHaplotype}. */ - protected static class PathComparatorTotalScore implements Comparator, Serializable { + private PriorityQueue nextBestSubHaplotypes; + + /** + * Creates a new aggregated sub-haplotype finder given its sub-finders. + * @param finders set of sub-finders. + */ + public AggregatedSubHaplotypeFinder(final Collection finders) { + if (finders == null) throw new IllegalArgumentException("finder collection cannot be null"); + this.subFinders = finders; + } + + @Override + public int getCount() { + processSubFindersIfNeeded(); + return count; + } + + private void processSubFindersIfNeeded() { + if (processedSubFinders) return; + + long count = 0; + nextBestSubHaplotypes = new PriorityQueue<>(subFinders.size()); + for (final KBestSubHaplotypeFinder finder : subFinders) { + final int finderCount = finder.getCount(); + if (finderCount == 0) continue; + count += finderCount; + nextBestSubHaplotypes.add(new MyKBestHaplotypeResult(finder,0)); + } + + this.count = (int) Math.min(Integer.MAX_VALUE,count); + + rankedSubHaplotype = new ArrayList<>(10); + processedSubFinders = true; + } + + @Override + public KBestHaplotype getKBest(int k) { + if (k < 0) throw new IllegalArgumentException("k cannot be negative"); + processSubFindersIfNeeded(); + if (k >= count) throw new IllegalArgumentException("k cannot be equal or greater than the count"); + if (k < rankedSubHaplotype.size()) + return rankedSubHaplotype.get(k); + + rankedSubHaplotype.ensureCapacity(k+1); + for (int i = rankedSubHaplotype.size(); i <= k; i++) { + // since k < possibleHaplotypeCount is guarantee no to be empty. 
+ if (nextBestSubHaplotypes.isEmpty()) + throw new IllegalStateException("what the heck " + k + " " + count); + final MyKBestHaplotypeResult nextResult = nextBestSubHaplotypes.remove(); + nextResult.rank = i; + rankedSubHaplotype.add(nextResult); + final int subRank = nextResult.result.rank(); + + // if there is no further solution from the same child we cannot add another solution from that child. + if (subRank + 1 >= nextResult.subFinder.getCount()) + continue; + nextBestSubHaplotypes.add(new MyKBestHaplotypeResult(nextResult.subFinder, subRank + 1)); + } + return rankedSubHaplotype.get(k); + } + + /** + * Custom implementation of {@link KBestHaplotype} to encapsulate sub-finder results. + */ + private class MyKBestHaplotypeResult extends KBestHaplotype { + + private KBestSubHaplotypeFinder subFinder; + + private final KBestHaplotype result; + + private int rank; + + private MyKBestHaplotypeResult(final KBestSubHaplotypeFinder finder, final int rank) { + this.subFinder = finder; + this.result = finder.getKBest(rank); + this.rank = -1; + } + @Override - public int compare(final Path path1, final Path path2) { - return path2.getScore() - path1.getScore(); - } - } - - /** - * @see #getKBestPaths(BaseGraph, int) retriving the best 1000 paths - */ - public List> getKBestPaths( final BaseGraph graph ) { - return getKBestPaths(graph, 1000); - } - - /** - * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) retriving the first 1000 paths - * starting from all source vertices and ending with all sink vertices - */ - public List> getKBestPaths( final BaseGraph graph, final int k ) { - return getKBestPaths(graph, k, graph.getSources(), graph.getSinks()); - } - - /** - * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with k=1000 - */ - public List> getKBestPaths( final BaseGraph graph, final Set sources, final Set sinks ) { - return getKBestPaths(graph, 1000, sources, sinks); - } - - /** - * @see #getKBestPaths(BaseGraph, int, java.util.Set, 
java.util.Set) with k=1000 - */ - public List> getKBestPaths( final BaseGraph graph, final T source, final T sink ) { - return getKBestPaths(graph, 1000, source, sink); - } - - /** - * @see #getKBestPaths(BaseGraph, int, java.util.Set, java.util.Set) with singleton source and sink sets - */ - public List> getKBestPaths( final BaseGraph graph, final int k, final T source, final T sink ) { - return getKBestPaths(graph, k, Collections.singleton(source), Collections.singleton(sink)); - } - - /** - * Traverse the graph and pull out the best k paths. - * Paths are scored via their comparator function. The default being PathComparatorTotalScore() - * @param graph the graph from which to pull paths - * @param k the number of paths to find - * @param sources a set of vertices we want to start paths with - * @param sinks a set of vertices we want to end paths with - * @return a list with at most k top-scoring paths from the graph - */ - @Ensures({"result != null", "result.size() <= k"}) - public List> getKBestPaths( final BaseGraph graph, final int k, final Set sources, final Set sinks ) { - if( graph == null ) { throw new IllegalArgumentException("Attempting to traverse a null graph."); } - - // a min max queue that will collect the best k paths - final MinMaxPriorityQueue> bestPaths = MinMaxPriorityQueue.orderedBy(new PathComparatorTotalScore()).maximumSize(k).create(); - - // run a DFS for best paths - for ( final T source : sources ) { - final Path startingPath = new Path(source, graph); - findBestPaths(startingPath, sinks, bestPaths, new MyInt()); + public SeqGraph graph() { + return result.graph(); } - // the MinMaxPriorityQueue iterator returns items in an arbitrary order, so we need to sort the final result - final List> toReturn = new ArrayList>(bestPaths); - Collections.sort(toReturn, new PathComparatorTotalScore()); - return toReturn; - } + @Override + public int score() { + return result.score(); + } - /** - * Recursive algorithm to find the K best paths in the 
graph from the current path to any of the sinks - * @param path the current path progress - * @param sinks a set of nodes that are sinks. Will terminate and add a path if the last vertex of path is in this set - * @param bestPaths a path to collect completed paths. - * @param n used to limit the search by tracking the number of vertices visited across all paths - */ - private void findBestPaths( final Path path, final Set sinks, final Collection> bestPaths, final MyInt n ) { - if ( sinks.contains(path.getLastVertex())) { - bestPaths.add(path); - } else if( n.val > 10000 ) { - // do nothing, just return, as we've done too much work already - } else { - // recursively run DFS - final ArrayList edgeArrayList = new ArrayList(path.getOutgoingEdgesOfLastVertex()); - Collections.sort(edgeArrayList, new BaseEdge.EdgeWeightComparator()); - for ( final E edge : edgeArrayList ) { - final T target = path.getGraph().getEdgeTarget(edge); - // make sure the edge is not already in the path - final boolean alreadyVisited = allowCycles ? path.containsEdge(edge) : path.containsVertex(target); - if ( ! 
alreadyVisited ) { - final Path newPath = new Path(path, edge); - n.val++; - findBestPaths(newPath, sinks, bestPaths, n); - } - } + @Override + public boolean isReference() { + return result.isReference(); + } + + @Override + public int rank() { + return rank; + } + + @Override + protected SeqVertex head() { + return result.head(); + } + + @Override + protected KBestHaplotype tail() { + return result.tail(); } } -} \ No newline at end of file +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java index c9d51b81b..36216bdd2 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/BaseGraph.java @@ -695,4 +695,21 @@ public class BaseGraph extends Default public BaseGraph subsetToRefSource() { return subsetToNeighbors(getReferenceSourceVertex(), 10); } + + /** + * Checks whether the graph contains all the vertices in a collection. + * + * @param vertices the vertices to check. + * + * @throws IllegalArgumentException if {@code vertices} is {@code null}. + * + * @return {@code true} if all the vertices in the input collection are present in this graph. + * Also if the input collection is empty. Otherwise it returns {@code false}. 
+ */ + public boolean containsAllVertices(final Collection vertices) { + if (vertices == null) throw new IllegalArgumentException("the input vertices collection cannot be null"); + for (final V vertex : vertices) + if (!containsVertex(vertex)) return false; + return true; + } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java index 0a29bd08b..ae270ed7b 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/DeadEndKBestSubHaplotypeFinder.java @@ -46,9 +46,9 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; /** - * Represent a trivial k-best sub haplotype finder with no solutions. + * Represents a trivial k-best sub haplotype finder with no solutions. * - *

To be used at vertices that do not have any valid path to the requested sink node

+ *

To be used at vertices that do not have any valid path to the requested sink vertices

* * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java index aa1f213fe..0e50ec02b 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/EmptyPathHaplotypeFinder.java @@ -52,14 +52,19 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; */ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder { - /** * Caches the only solution returned by this finder. */ private final KBestHaplotype singleHaplotypePath; - public EmptyPathHaplotypeFinderNode(final SeqGraph graph, final SeqVertex sink) { - singleHaplotypePath = new MyBestHaplotypePath(graph,sink); + /** + * Constructs a new empty k-best haplotype finder. + * + * @param graph the search graph. + * @param vertex the source and sink vertex of the only solution returned by this finder. + */ + public EmptyPathHaplotypeFinderNode(final SeqGraph graph, final SeqVertex vertex) { + singleHaplotypePath = new MyBestHaplotypePath(graph,vertex); } @Override @@ -81,12 +86,29 @@ class EmptyPathHaplotypeFinderNode implements KBestSubHaplotypeFinder { */ private class MyBestHaplotypePath extends KBestHaplotype { + /** + * The solution's only vertex. + */ private final SeqVertex vertex; + /** + * The search graph. + */ private final SeqGraph graph; + /** + * Whether the vertex is a reference vertex. + * + *

Initialized lazily.

+ */ private Boolean isReference; + /** + * Constructs a new empty k-best haplotype solution. + * + * @param graph the search graph. + * @param vertex the source and sink vertex of the only solution returned by the outer finder. + */ public MyBestHaplotypePath(final SeqGraph graph, final SeqVertex vertex) { this.vertex = vertex; this.graph = graph; diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java index d88c17cbf..ca22f17ec 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotype.java @@ -52,7 +52,7 @@ import org.broadinstitute.sting.utils.haplotype.Haplotype; * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ -public abstract class KBestHaplotype { +public abstract class KBestHaplotype implements Comparable { /** * Returns the original graph searched. @@ -143,6 +143,18 @@ public abstract class KBestHaplotype { return path; } + /** + * Compares k-best haplotypes based on the score where the one with larger score comes first (descending orther). + * + * @param other the other haplotype to compare to. + * @return {@code -1} if the current score is larger than {@code other}'s, {@code 0} if they are the same, {@code 1} + * if {@code other}'s score is larger. + */ + public int compareTo(final KBestHaplotype other) { + if (other == null) throw new IllegalArgumentException("the other object cannot be null"); + return - 1 * (score() - other.score()); + } + /** * The first vertex on the haplotype path. 
* @@ -156,6 +168,4 @@ public abstract class KBestHaplotype { * @return {@code null} if there are no more vertices in the solution path a part from the one returned by {@link #head}. */ protected abstract KBestHaplotype tail(); - - } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java index 725fcae1a..f27cca12c 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinder.java @@ -56,49 +56,131 @@ import java.util.*; */ public class KBestHaplotypeFinder extends AbstractList implements Iterable { - + /** + * The search graph. + */ private final SeqGraph graph; - protected Map nodeBySource; - protected SeqVertex sink; - protected SeqVertex source; + /** + * Map of sub-haplotype finder by their source vertex. + */ + protected Map finderByVertex; + + /** + * Possible haplotype sink vertices. + */ + protected Set sinks; + + /** + * Possible haplotype source vertices. + */ + protected Set sources; + + /** + * The top finder. + * + *

If there is only a single source vertex, its finder is the top finder. However, when there + * is more than one possible source, we create a composite finder that alternates between individual source vertices + * for their best haplotypes.

+ */ + private final KBestSubHaplotypeFinder topFinder; /** * Constructs a new best haplotypes finder. * * @param graph the seq-graph to search. * @param source the source vertex for all haplotypes. - * @param sink the sink vertex for all haplotypes. + * @param sink sink vertices for all haplotypes. * * @throws IllegalArgumentException if
    *
  • any of {@code graph}, {@code source} or {@code sink} is {@code null} or
  • - *
  • either {@code source} or {@code sink} is not a vertex of {@code graph}'s.
  • + *
  • either {@code source} or {@code sink} is not a vertex in {@code graph}.
  • *
*/ public KBestHaplotypeFinder(final SeqGraph graph, final SeqVertex source, final SeqVertex sink) { - if (graph == null) throw new IllegalArgumentException("graph cannot be null"); - if (source == null) throw new IllegalArgumentException("source cannot be null"); - if (sink == null) throw new IllegalArgumentException("sink cannot be null"); - if (!graph.containsVertex(source)) throw new IllegalArgumentException("source does not belong to the graph"); - if (!graph.containsVertex(sink)) throw new IllegalArgumentException("sink does not belong to the graph"); - //TODO dealing with cycles here due to a bug in some of the graph transformations that produces cycles. - //TODO Once that is solve, the conditional above should be removed in order the save time: - //this.graph = graph; - if (new CycleDetector<>(graph).detectCycles()) - this.graph = removeCycles(graph,source,sink); - else - this.graph = graph; - nodeBySource = new HashMap<>(graph.vertexSet().size()); - this.sink = sink; - this.source = source; + this(graph,Collections.singleton(source),Collections.singleton(sink)); } - private static SeqGraph removeCycles(final SeqGraph original, final SeqVertex source, final SeqVertex sink) { + /** + * Constructs a new best haplotypes finder. + * + * @param graph the seq-graph to search. + * @param sources source vertices for all haplotypes. + * @param sinks sink vertices for all haplotypes. + * + * @throws IllegalArgumentException if
    + *
  • any of {@code graph}, {@code sources} or {@code sinks} is {@code null} or
  • + *
  • any of {@code sources}' or any {@code sinks}' member is not a vertex in {@code graph}.
  • + *
+ */ + public KBestHaplotypeFinder(final SeqGraph graph, final Set sources, final Set sinks) { + if (graph == null) throw new IllegalArgumentException("graph cannot be null"); + if (sources == null) throw new IllegalArgumentException("source cannot be null"); + if (sinks == null) throw new IllegalArgumentException("sink cannot be null"); + if (!graph.containsAllVertices(sources)) throw new IllegalArgumentException("source does not belong to the graph"); + if (!graph.containsAllVertices(sinks)) throw new IllegalArgumentException("sink does not belong to the graph"); + + //TODO dealing with cycles here due to a bug in some of the graph transformations that produces cycles. + //TODO Once that is solve, the if-else below should be substituted by a throw if there is any cycles, + //TODO just the line commented out below if you want to trade early-bug-fail for speed. + //this.graph = graph; + this.graph = new CycleDetector<>(graph).detectCycles() ? removeCycles(graph,sources,sinks) : graph; + + finderByVertex = new HashMap<>(this.graph.vertexSet().size()); + this.sinks = sinks; + this.sources = sources; + if (sinks.size() == 0 || sources.size() == 0) + topFinder = DeadEndKBestSubHaplotypeFinder.INSTANCE; + else if (sources.size() == 1) + topFinder = createVertexFinder(sources.iterator().next()); + else + topFinder = createAggregatedFinder(); + } + + /** + * Constructs a new best haplotype finder. + *

+ * It will consider all source and sink vertices when looking for haplotypes. + *

+ * + * @param graph the seq-graph to search for the best haplotypes. + */ + public KBestHaplotypeFinder(SeqGraph graph) { + this(graph,graph.getSources(),graph.getSinks()); + } + + /** + * Creates an aggregated recursive finder to try all possible source vertices. + * + * @return never {@code null}. + */ + private KBestSubHaplotypeFinder createAggregatedFinder() { + final List sourceFinders = new ArrayList<>(sources.size()); + for (final SeqVertex source : sources) + sourceFinders.add(createVertexFinder(source)); + return new AggregatedSubHaplotypeFinder(sourceFinders); + } + + /** + * Removes edges that produces cycles and also dead vertices that do not lead to any sink vertex. + * + * @param original graph to modify. + * @param sources considered source vertices. + * @param sinks considered sink vertices. + * @return never {@code null}. + */ + private static SeqGraph removeCycles(final SeqGraph original, final Set sources, final Set sinks) { final Set edgesToRemove = new HashSet<>(original.edgeSet().size()); final Set vertexToRemove = new HashSet<>(original.vertexSet().size()); - if (!findGuiltyVerticesAndEdgesToRemoveCycles(original, source, sink, edgesToRemove, vertexToRemove, new HashSet(original.vertexSet().size()))) - throw new IllegalStateException("could not find any path from the source vertex to the sink vertex: " + source + " -> " + sink); + boolean foundSomePath = false; + for (final SeqVertex source : sources) + foundSomePath = findGuiltyVerticesAndEdgesToRemoveCycles(original, source, sinks, edgesToRemove, + vertexToRemove, new HashSet(original.vertexSet().size())) | foundSomePath; + + if (!foundSomePath) + throw new IllegalStateException("could not find any path from the source vertex to the sink vertex after removing cycles: " + + Arrays.toString(sources.toArray()) + " => " + Arrays.toString(sinks.toArray())); if (edgesToRemove.isEmpty() && vertexToRemove.isEmpty()) throw new IllegalStateException("cannot find a way to remove the cycles"); @@ 
-109,13 +191,30 @@ public class KBestHaplotypeFinder extends AbstractList implement return result; } - private static boolean findGuiltyVerticesAndEdgesToRemoveCycles(final SeqGraph graph, final SeqVertex currentVertex, final SeqVertex sink, - final Set edgesToRemove, final Set verticesToRemove, + /** + * Recursive call that looks for edges and vertices that need to be removed to get rid of cycles. + * + * @param graph the original graph. + * @param currentVertex current search vertex. + * @param sinks considered sink vertices. + * @param edgesToRemove collection of edges that need to be removed in order to get rid of cycles. + * @param verticesToRemove collection of vertices that can be removed. + * @param parentVertices collection of vertices that preceded the {@code currentVertex}; i.e. the it can be + * reached from those vertices using edges existing in {@code graph}. + * + * @return {@code true} to indicate that the some sink vertex is reachable by {@code currentVertex}, + * {@code false} otherwise. 
+ */ + private static boolean findGuiltyVerticesAndEdgesToRemoveCycles(final SeqGraph graph, + final SeqVertex currentVertex, + final Set sinks, + final Set edgesToRemove, + final Set verticesToRemove, final Set parentVertices) { - if (currentVertex.equals(sink)) return true; + if (sinks.contains(currentVertex)) return true; final Set outgoingEdges = graph.outgoingEdgesOf(currentVertex); - boolean reachsSink = false; + boolean reachesSink = false; parentVertices.add(currentVertex); for (final BaseEdge edge : outgoingEdges) { @@ -123,31 +222,28 @@ public class KBestHaplotypeFinder extends AbstractList implement if (parentVertices.contains(child)) edgesToRemove.add(edge); else { - final boolean childReachSink = findGuiltyVerticesAndEdgesToRemoveCycles(graph, child, sink, edgesToRemove, verticesToRemove, parentVertices); - reachsSink = reachsSink || childReachSink; + final boolean childReachSink = findGuiltyVerticesAndEdgesToRemoveCycles(graph, child, sinks, + edgesToRemove, verticesToRemove, parentVertices); + reachesSink = reachesSink || childReachSink; } } parentVertices.remove(currentVertex); - if (!reachsSink) verticesToRemove.add(currentVertex); - return reachsSink; + if (!reachesSink) verticesToRemove.add(currentVertex); + return reachesSink; } - @Override public KBestHaplotype get(int index) { - final KBestSubHaplotypeFinder sourceNode = createNode(source); if (index < 0 || index >= size()) throw new IndexOutOfBoundsException(); - return sourceNode.getKBest(index); + return topFinder.getKBest(index); } @Override public Iterator iterator() { - final KBestSubHaplotypeFinder sourceFinder = createNode(source); - return new Iterator() { private int nextK = 0; - private final int maxK = sourceFinder.getCount(); + private final int maxK = topFinder.getCount(); @Override @@ -158,7 +254,7 @@ public class KBestHaplotypeFinder extends AbstractList implement @Override public KBestHaplotype next() { if (nextK >= maxK) throw new NoSuchElementException(); - return 
sourceFinder.getKBest(nextK++); + return topFinder.getKBest(nextK++); } @Override @@ -170,7 +266,7 @@ public class KBestHaplotypeFinder extends AbstractList implement @Override public int size() { - return createNode(source).getCount(); + return topFinder.getCount(); } /** @@ -183,12 +279,10 @@ public class KBestHaplotypeFinder extends AbstractList implement * @return never {@code null}, but perhaps a iterator that return no haplotype. */ public Iterator iterator(final int k) { - final KBestSubHaplotypeFinder sourceFinder = createNode(source); return new Iterator() { private int nextK = 0; - private final int maxK = Math.min(sourceFinder.getCount(), k); - + private final int maxK = Math.min(size(), k); @Override public boolean hasNext() { @@ -198,7 +292,7 @@ public class KBestHaplotypeFinder extends AbstractList implement @Override public KBestHaplotype next() { if (nextK >= maxK) throw new NoSuchElementException(); - return sourceFinder.getKBest(nextK++); + return topFinder.getKBest(nextK++); } @Override @@ -208,38 +302,51 @@ public class KBestHaplotypeFinder extends AbstractList implement }; } - protected KBestSubHaplotypeFinder createNode(final SeqVertex source) { - KBestSubHaplotypeFinder node = nodeBySource.get(source); + /** + * Creates a finder from a vertex. + * + * @param source the source vertex for the finder. + * + * @return never {@code null}, perhaps a finder that returns no haplotypes though. 
+ */ + protected KBestSubHaplotypeFinder createVertexFinder(final SeqVertex source) { + KBestSubHaplotypeFinder node = finderByVertex.get(source); if (node == null) { - if (source.equals(sink)) - node = new EmptyPathHaplotypeFinderNode(graph,sink); + if (sinks.contains(source)) + node = new EmptyPathHaplotypeFinderNode(graph,source); else { final Set outgoingEdges = graph.outgoingEdgesOf(source); if (outgoingEdges.isEmpty()) node = DeadEndKBestSubHaplotypeFinder.INSTANCE; else { - final Map undeadChildren = createChildrenNodes(outgoingEdges); - if (undeadChildren.isEmpty()) - node = DeadEndKBestSubHaplotypeFinder.INSTANCE; - else - node = new RecursiveSubHaplotypeFinder(graph,source,undeadChildren); - + final Map undeadChildren = createChildrenFinders(outgoingEdges); + node = undeadChildren.isEmpty() ? DeadEndKBestSubHaplotypeFinder.INSTANCE : + new RecursiveSubHaplotypeFinder(source,undeadChildren); } } - nodeBySource.put(source,node); + finderByVertex.put(source, node); } return node; } - private Map createChildrenNodes(Set baseEdges) { + /** + * Creates finder for target vertices of a collection of edges. + *

+ * This peculiar signature is convenient for when we want to create finders for the children of a vertex. + *

+ * + * @param baseEdges target collection of edges. + * + * @return never {@code null}, perhaps an empty map if there is no children with valid paths to any sink for this + * finder. + */ + private Map createChildrenFinders(Set baseEdges) { final Map result = new LinkedHashMap<>(baseEdges.size()); - for (final BaseEdge edge : baseEdges) - result.put(edge,createNode(graph.getEdgeTarget(edge))); - final Iterator> childrenIterator = result.entrySet().iterator(); - while (childrenIterator.hasNext()) - if (childrenIterator.next().getValue().getCount() == 0) - childrenIterator.remove(); + for (final BaseEdge edge : baseEdges) { + final KBestSubHaplotypeFinder targetFinder = createVertexFinder(graph.getEdgeTarget(edge)); + if (targetFinder.getCount() == 0) continue; + result.put(edge, targetFinder); + } return result; } - } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java index a637ff2b6..9c185b52c 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestSubHaplotypeFinder.java @@ -68,5 +68,4 @@ interface KBestSubHaplotypeFinder { * @return never {@code null}. 
*/ public abstract KBestHaplotype getKBest(int k); - } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java index f2106ffb9..0fbbfdc64 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/RecursiveSubHaplotypeFinder.java @@ -46,18 +46,18 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs; import java.util.ArrayList; +import java.util.Collection; import java.util.Map; -import java.util.PriorityQueue; /** * General recursive sub-haplotype finder. *

-* Provides the k-best sub-haplotypes looking into the outgoing set of vertices (that contain at least one solution). +* Provides the k-best sub-haplotypes from a vertex provided map between outgoing edges and its target finders *

*

-* This is done efficiently by keeping an priority-queue on best subhaplotype solutions and pulling them on demand +* This is done efficiently by keeping an priority-queue on best sub-haplotype solutions and pulling them on demand * as needed. -*

+*

*

* Solutions are cached for repeated retrieval so that we save compute at vertices that share sub-haplotypes * (share descendant vertices). This aspect is controlled by {@link KBestSubHaplotypeFinder} that instantiate @@ -67,34 +67,7 @@ import java.util.PriorityQueue; * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ -class RecursiveSubHaplotypeFinder implements KBestSubHaplotypeFinder { - - private final SeqGraph graph; - - private final SeqVertex vertex; - - private final Map children; - - private boolean childrenWereProcessed = false; - - /** - * Holds the number of possible paths from this source node vertex to the sink vertex. - * - *

Updated by {@link #processChildrenIfNeeded()}

- */ - private int possibleHaplotypeCount; - - /** - * Holds the best {@code i} paths to the sink so far calculated where {@code i+1} is the length of rankedResults. - * - *

As more results are requested the array will grow. All positions and solutions are calculated up to {@code i}

. - */ - private ArrayList rankedResults; - - /** - * Priority queue with best sub-haplotype solutions that haven't been calculated and cached on {@link #rankedResults} yet. - */ - private PriorityQueue nextChildrenKBestHaplotypePath; +class RecursiveSubHaplotypeFinder extends AggregatedSubHaplotypeFinder { /** * Creates a recursive sub-haplotype finder give the target graph, first vertex and all possible outgoing edges @@ -104,113 +77,73 @@ class RecursiveSubHaplotypeFinder implements KBestSubHaplotypeFinder { * are outgoing edges from {@code vertex} on {@code graph} and that the value sub-haplotype resolver have as * the first vertex the adjacent vertex through that key edge.

* - * @param graph the search graph. * @param vertex first vertex for all sub-haplotype solutions provided by this finder * @param children map from outgoing edge to the corresponding sub-sub-haplotype finder. */ - public RecursiveSubHaplotypeFinder(final SeqGraph graph, final SeqVertex vertex, + public RecursiveSubHaplotypeFinder(final SeqVertex vertex, final Map children) { - if (vertex == null) throw new IllegalArgumentException("the vertex provided cannot be null"); - if (graph == null) throw new IllegalArgumentException("the graph provided cannot be null"); - this.vertex = vertex; - this.children = children; - this.graph = graph; + super(createChildFinderCollection(vertex, children)); } - @Override - public int getCount() { - processChildrenIfNeeded(); - return possibleHaplotypeCount; + private static Collection createChildFinderCollection(final SeqVertex vertex, final Map finders) { + if (finders == null) throw new IllegalArgumentException("the edge to child map cannot be null"); + final Collection result = new ArrayList<>(finders.size()); + for (final Map.Entry e : finders.entrySet()) + result.add(new EdgeSubHaplotypeFinder(vertex,e.getKey(), e.getValue())); + return result; } - /** - * Process children and initialize structures if not done before. - */ - private void processChildrenIfNeeded() { - if (childrenWereProcessed) return; - long possibleHaplotypeCount = 0; + private static class EdgeSubHaplotypeFinder implements KBestSubHaplotypeFinder { - nextChildrenKBestHaplotypePath = new PriorityQueue<>(children.size()); + private final KBestSubHaplotypeFinder childFinder; - for (final Map.Entry entry : children.entrySet()) { - final KBestSubHaplotypeFinder child = entry.getValue(); - final BaseEdge edge = entry.getKey(); - final int childPossibleHaplotypePathCount = child.getCount(); - if (childPossibleHaplotypePathCount != 0) // paranoia check, should not happen at this point. 
- nextChildrenKBestHaplotypePath.add(new ChildKBestSubHaplotype(-1,edge,child,0)); - possibleHaplotypeCount += childPossibleHaplotypePathCount; + private final SeqVertex vertex; + + private final BaseEdge edge; + + private EdgeSubHaplotypeFinder(final SeqVertex vertex, final BaseEdge edge, final KBestSubHaplotypeFinder childFinder) { + this.childFinder = childFinder; + this.edge = edge; + this.vertex = vertex; } - // Just make sure we won't incur in overflow here for very large graphs; who is ever going to ask for more than 2G paths!!!) - this.possibleHaplotypeCount = (int) Math.min(Integer.MAX_VALUE,possibleHaplotypeCount); - - // 10 is a bit arbitrary as it is difficult to anticipate what would be the number of requested - // best sub-haplotypes for any node. It shouldn't be too large so that it does not waste space - // but not too small so that there is no need to resize when just a few best solutions are requested. - rankedResults = new ArrayList<>(Math.min(this.possibleHaplotypeCount,10)); - - childrenWereProcessed = true; - } - - @Override - public KBestHaplotype getKBest(int k) { - if (k < 0) - throw new IllegalArgumentException("the rank requested cannot be negative"); - processChildrenIfNeeded(); - if (k >= possibleHaplotypeCount) - throw new IllegalArgumentException("the rank requested cannot be equal or greater to the number of possible haplotypes"); - if (rankedResults.size() > k) - return rankedResults.get(k); - - rankedResults.ensureCapacity(k+1); - for (int i = rankedResults.size(); i <= k; i++) { - // since k < possibleHaplotypeCount is guarantee no to be empty. 
- if (nextChildrenKBestHaplotypePath.isEmpty()) - throw new IllegalStateException("what the heck " + k + " " + possibleHaplotypeCount); - final ChildKBestSubHaplotype nextResult = nextChildrenKBestHaplotypePath.remove(); - nextResult.rank = i; - rankedResults.add(nextResult); - final int childRank = nextResult.subpath.rank(); - final KBestSubHaplotypeFinder child = nextResult.child; - - // if there is no further solution from the same child we cannot add another solution from that child. - if (childRank + 1 >= nextResult.child.getCount()) - continue; - nextChildrenKBestHaplotypePath.add(new ChildKBestSubHaplotype(-1,nextResult.edge, child, childRank + 1)); + @Override + public int getCount() { + return childFinder.getCount(); + } + + @Override + public KBestHaplotype getKBest(int k) { + return new ChildKBestSubHaplotype(vertex,edge,childFinder.getKBest(k)); } - return rankedResults.get(k); } /** * Custom extension of the {@link KBestHaplotype} used for solutions generated by this class. + * + *

+ * These by delegating on the encapsulated solution from outgoing edge's finder by adding + * the edge score and prefixing this outer finder + * source vertex. + *

*/ - private class ChildKBestSubHaplotype extends KBestHaplotype implements Comparable{ + private static class ChildKBestSubHaplotype extends KBestHaplotype { + private final int score; - private int rank; - private final KBestSubHaplotypeFinder child; - private final BaseEdge edge; - private final KBestHaplotype subpath; + private final KBestHaplotype child; + private final SeqVertex vertex; private final boolean isReference; - public ChildKBestSubHaplotype(final int rank, final BaseEdge edge, - final KBestSubHaplotypeFinder child, final int childRank) { + public ChildKBestSubHaplotype(final SeqVertex vertex, final BaseEdge edge, final KBestHaplotype child) { + this.score = edge.getMultiplicity() + child.score(); + this.vertex = vertex; this.child = child; - this.edge = edge; - this.rank = rank; - this.subpath = child.getKBest(childRank); - this.score = edge.getMultiplicity() + subpath.score(); - this.isReference = edge.isRef() && subpath.isReference(); + this.isReference = edge.isRef() && child.isReference(); } @Override public SeqGraph graph() { - return graph; - } - - @Override - public int compareTo(final ChildKBestSubHaplotype other) { - if (other == null) throw new IllegalArgumentException("the other object cannot be null"); - return - Integer.compare(this.score,other.score); + return child.graph(); } @Override @@ -220,10 +153,9 @@ class RecursiveSubHaplotypeFinder implements KBestSubHaplotypeFinder { @Override public int rank() { - return rank; + return child.rank(); } - @Override protected SeqVertex head() { return vertex; @@ -231,7 +163,7 @@ class RecursiveSubHaplotypeFinder implements KBestSubHaplotypeFinder { @Override protected KBestHaplotype tail() { - return subpath; + return child; } @Override diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java 
b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index a932f8a96..30b677fe9 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -73,7 +73,6 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { private final boolean dontIncreaseKmerSizesForCycles; private final int numPruningSamples; - private boolean requireReasonableNumberOfPaths = false; protected boolean removePathsNotConnectedToRef = true; private boolean justReturnRawGraph = false; @@ -207,24 +206,12 @@ public class ReadThreadingAssembler extends LocalAssemblyEngine { initialSeqGraph.cleanNonRefPaths(); // TODO -- I don't this is possible by construction final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph); - final AssemblyResult.Status status = cleaned.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION && requireReasonableNumberOfPaths && !reasonableNumberOfPaths(cleaned.getGraph()) ? AssemblyResult.Status.FAILED : cleaned.getStatus(); + final AssemblyResult.Status status = cleaned.getStatus(); final AssemblyResult result = new AssemblyResult(status, cleaned.getGraph()); result.setThreadingGraph(rtgraph); return result; } - /** - * Did we find a reasonable number of paths in this graph? 
- * @param graph - * @return - */ - private boolean reasonableNumberOfPaths(final SeqGraph graph) { - final KBestPaths pathFinder = new KBestPaths<>(false); - final List> allPaths = pathFinder.getKBestPaths(graph, 100000); - logger.info("Found " + allPaths.size() + " paths through " + graph + " with maximum " + maxAllowedPathsForReadThreadingAssembler); - return allPaths.size() <= maxAllowedPathsForReadThreadingAssembler; - } - @Override public String toString() { return "ReadThreadingAssembler{" + diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java index 63fd21d8f..0ddf7544d 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/CommonSuffixMergerUnitTest.java @@ -137,19 +137,20 @@ public class CommonSuffixMergerUnitTest extends BaseTest { public static void assertSameHaplotypes(final String name, final SeqGraph actual, final SeqGraph original) { try { final Set haplotypes = new HashSet(); - final List> originalPaths = new KBestPaths().getKBestPaths(original); - for ( final Path path : originalPaths ) - haplotypes.add(new String(path.getBases())); + final List originalKBestHaplotypes = new KBestHaplotypeFinder(original,original.getSources(),original.getSinks()); + final List actualKBestHaplotypes = new KBestHaplotypeFinder(actual,actual.getSources(),actual.getSinks()); - final List> splitPaths = new KBestPaths().getKBestPaths(actual); - for ( final Path path : splitPaths ) { - final String h = new String(path.getBases()); + for (final KBestHaplotype kbh : originalKBestHaplotypes) + haplotypes.add(new String(kbh.bases())); + + for ( final 
KBestHaplotype kbh : actualKBestHaplotypes ) { + final String h = new String(kbh.bases()); Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); } - if ( splitPaths.size() == originalPaths.size() ) { - for ( int i = 0; i < originalPaths.size(); i++ ) { - Assert.assertTrue(splitPaths.get(i).equalSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i)); + if ( actualKBestHaplotypes.size() == originalKBestHaplotypes.size() ) { + for ( int i = 0; i < originalKBestHaplotypes.size(); i++ ) { + Assert.assertTrue(actualKBestHaplotypes.get(i).haplotype().getBaseString().equals(originalKBestHaplotypes.get(i).haplotype().getBaseString()), "Paths not equal " + actualKBestHaplotypes.get(i).haplotype() + " vs. original " + originalKBestHaplotypes.get(i).haplotype()); } } } catch ( AssertionError e ) { diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java similarity index 82% rename from protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java rename to protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java index f117f5750..6dc3d5d67 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestPathsUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/KBestHaplotypeFinderUnitTest.java @@ -66,31 +66,25 @@ import java.util.*; * Date: 1/31/13 */ -public class KBestPathsUnitTest extends BaseTest { - private final static boolean DEBUG = false; +public class KBestHaplotypeFinderUnitTest extends BaseTest { @DataProvider(name = 
"BasicPathFindingData") public Object[][] makeBasicPathFindingData() { - List tests = new ArrayList(); - for ( final boolean allowCycles : Arrays.asList(false, true)) { - for ( final int nStartNodes : Arrays.asList(1, 2, 3) ) { - for ( final int nBranchesPerBubble : Arrays.asList(2, 3) ) { - for ( final int nEndNodes : Arrays.asList(1, 2, 3) ) { - for ( final boolean addCycle : Arrays.asList(true, false) ) { - tests.add(new Object[]{nStartNodes, nBranchesPerBubble, nEndNodes, addCycle, allowCycles}); - } - } + final List tests = new ArrayList<>(); + for ( final int nStartNodes : Arrays.asList(1, 2, 3) ) { + for ( final int nBranchesPerBubble : Arrays.asList(2, 3) ) { + for ( final int nEndNodes : Arrays.asList(1, 2, 3) ) { + tests.add(new Object[]{nStartNodes, nBranchesPerBubble, nEndNodes}); } } } - return tests.toArray(new Object[][]{}); } private static int weight = 1; final Set createVertices(final SeqGraph graph, final int n, final SeqVertex source, final SeqVertex target) { final List seqs = Arrays.asList("A", "C", "G", "T"); - final Set vertices = new LinkedHashSet(); + final Set vertices = new LinkedHashSet<>(); for ( int i = 0; i < n; i++ ) { final SeqVertex v = new SeqVertex(seqs.get(i)); graph.addVertex(v); @@ -101,77 +95,42 @@ public class KBestPathsUnitTest extends BaseTest { return vertices; } - @Test(dataProvider = "BasicPathFindingData", enabled = !DEBUG) - public void testBasicPathFinding(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes, final boolean addCycle, final boolean allowCycles) { - SeqGraph graph = new SeqGraph(11); + @Test(dataProvider = "BasicPathFindingData") + public void testBasicPathFinding(final int nStartNodes, final int nBranchesPerBubble, final int nEndNodes) { + final SeqGraph graph = new SeqGraph(11); final SeqVertex middleTop = new SeqVertex("GTAC"); final SeqVertex middleBottom = new SeqVertex("ACTG"); graph.addVertices(middleTop, middleBottom); final Set starts = createVertices(graph, nStartNodes, 
null, middleTop); + @SuppressWarnings("unused") final Set bubbles = createVertices(graph, nBranchesPerBubble, middleTop, middleBottom); final Set ends = createVertices(graph, nEndNodes, middleBottom, null); - if ( addCycle ) graph.addEdge(middleBottom, middleBottom); - // enumerate all possible paths - final List> paths = new KBestPaths(allowCycles).getKBestPaths(graph, starts, ends); + final List paths = new KBestHaplotypeFinder(graph, starts, ends); - final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * (addCycle && allowCycles ? 2 : 1) * nEndNodes; + final int expectedNumOfPaths = nStartNodes * nBranchesPerBubble * nEndNodes; Assert.assertEquals(paths.size(), expectedNumOfPaths, "Didn't find the expected number of paths"); int lastScore = Integer.MAX_VALUE; - for ( final Path path : paths ) { + for ( final KBestHaplotype kbh : paths ) { + final Path path = kbh.path(); Assert.assertTrue(path.getScore() <= lastScore, "Paths out of order. Path " + path + " has score above previous " + lastScore); lastScore = path.getScore(); } // get the best path, and make sure it's the same as our optimal path overall - final Path best = paths.get(0); - final List> justOne = new KBestPaths(allowCycles).getKBestPaths(graph, 1, starts, ends); + final Path best = paths.get(0).path(); + final List justOne = new KBestHaplotypeFinder(graph,starts, ends).subList(0,1); Assert.assertEquals(justOne.size(), 1); - Assert.assertTrue(justOne.get(0).pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0)); - } - @Test(enabled = false) // No longer supported, but no longer needed. 
- public void testPathFindingComplexCycle() { - SeqGraph graph = new SeqGraph(11); - - final SeqVertex v1 = new SeqVertex("A"); - final SeqVertex v2 = new SeqVertex("C"); - final SeqVertex v3 = new SeqVertex("G"); - final SeqVertex v4 = new SeqVertex("T"); - final SeqVertex v5 = new SeqVertex("AA"); - graph.addVertices(v1, v2, v3, v4, v5); - graph.addEdges(v1, v2, v3, v4, v5); - graph.addEdges(v3, v3); - graph.addEdges(v4, v2); - - // enumerate all possible paths - final List> paths = new KBestPaths(false).getKBestPaths(graph, v1, v5); - - Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths"); - } - - @Test(enabled = false) // No longer supported, but no longer needed. - public void testPathFindingCycleLastNode() { - SeqGraph graph = new SeqGraph(11); - - final SeqVertex v1 = new SeqVertex("A"); - final SeqVertex v2 = new SeqVertex("C"); - final SeqVertex v3 = new SeqVertex("G"); - graph.addVertices(v1, v2, v3); - graph.addEdges(v1, v2, v3, v3); - - // enumerate all possible paths - final List> paths = new KBestPaths(false).getKBestPaths(graph, v1, v3); - - Assert.assertEquals(paths.size(), 1, "Didn't find the expected number of paths"); + Assert.assertTrue(justOne.get(0).path().pathsAreTheSame(best), "Best path from complete enumerate " + best + " not the same as from k = 1 search " + justOne.get(0)); } @DataProvider(name = "BasicBubbleDataProvider") public Object[][] makeBasicBubbleDataProvider() { - List tests = new ArrayList(); + final List tests = new ArrayList<>(); for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) { for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) { tests.add(new Object[]{refBubbleLength, altBubbleLength}); @@ -180,7 +139,7 @@ public class KBestPathsUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "BasicBubbleDataProvider", enabled = !DEBUG) + @Test(dataProvider = "BasicBubbleDataProvider") public void testBasicBubbleData(final int refBubbleLength, 
final int altBubbleLength) { // Construct the assembly graph SeqGraph graph = new SeqGraph(3); @@ -202,9 +161,9 @@ public class KBestPathsUnitTest extends BaseTest { graph.addEdge(v2Alt, v3, new BaseEdge(false, 5)); // Construct the test path - Path path = new Path(v, graph); - path = new Path(path, graph.getEdge(v, v2Alt)); - path = new Path(path, graph.getEdge(v2Alt, v3)); + Path path = new Path<>(v, graph); + path = new Path<>(path, graph.getEdge(v, v2Alt)); + path = new Path<>(path, graph.getEdge(v2Alt, v3)); // Construct the actual cigar string implied by the test path Cigar expectedCigar = new Cigar(); @@ -226,7 +185,7 @@ public class KBestPathsUnitTest extends BaseTest { @DataProvider(name = "GetBasesData") public Object[][] makeGetBasesData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); final List frags = Arrays.asList("ACT", "GAC", "CAT"); @@ -238,14 +197,14 @@ public class KBestPathsUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "GetBasesData", enabled = !DEBUG) + @Test(dataProvider = "GetBasesData") public void testGetBases(final List frags) { // Construct the assembly graph SeqGraph graph = new SeqGraph(3); SeqVertex prev = null; - for ( int i = 0; i < frags.size(); i++ ) { - SeqVertex v = new SeqVertex(frags.get(i)); + for (final String s : frags) { + SeqVertex v = new SeqVertex(s); graph.addVertex(v); if ( prev != null ) graph.addEdge(prev, v); @@ -253,15 +212,15 @@ public class KBestPathsUnitTest extends BaseTest { } // enumerate all possible paths - final List> paths = new KBestPaths().getKBestPaths(graph); + final List paths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks()); Assert.assertEquals(paths.size(), 1); - final Path path = paths.get(0); + final Path path = paths.get(0).path(); Assert.assertEquals(new String(path.getBases()), Utils.join("", frags), "Path doesn't have the expected sequence"); } @DataProvider(name = "TripleBubbleDataProvider") public 
Object[][] makeTripleBubbleDataProvider() { - List tests = new ArrayList(); + final List tests = new ArrayList<>(); for ( final int refBubbleLength : Arrays.asList(1, 5, 10) ) { for ( final int altBubbleLength : Arrays.asList(1, 5, 10) ) { for ( final boolean offRefEnding : Arrays.asList(true, false) ) { @@ -274,7 +233,7 @@ public class KBestPathsUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "TripleBubbleDataProvider", enabled = !DEBUG) + @Test(dataProvider = "TripleBubbleDataProvider") public void testTripleBubbleData(final int refBubbleLength, final int altBubbleLength, final boolean offRefBeginning, final boolean offRefEnding) { // Construct the assembly graph SeqGraph graph = new SeqGraph(11); @@ -328,19 +287,17 @@ public class KBestPathsUnitTest extends BaseTest { graph.addEdge(v7, postV, new BaseEdge(false, 1)); // Construct the test path - Path path = new Path( (offRefBeginning ? preV : v), graph); - if( offRefBeginning ) { - path = new Path(path, graph.getEdge(preV, v)); - } - path = new Path(path, graph.getEdge(v, v2Alt)); - path = new Path(path, graph.getEdge(v2Alt, v3)); - path = new Path(path, graph.getEdge(v3, v4Ref)); - path = new Path(path, graph.getEdge(v4Ref, v5)); - path = new Path(path, graph.getEdge(v5, v6Alt)); - path = new Path(path, graph.getEdge(v6Alt, v7)); - if( offRefEnding ) { - path = new Path(path, graph.getEdge(v7,postV)); - } + Path path = new Path<>( (offRefBeginning ? 
preV : v), graph); + if( offRefBeginning ) + path = new Path<>(path, graph.getEdge(preV, v)); + path = new Path<>(path, graph.getEdge(v, v2Alt)); + path = new Path<>(path, graph.getEdge(v2Alt, v3)); + path = new Path<>(path, graph.getEdge(v3, v4Ref)); + path = new Path<>(path, graph.getEdge(v4Ref, v5)); + path = new Path<>(path, graph.getEdge(v5, v6Alt)); + path = new Path<>(path, graph.getEdge(v6Alt, v7)); + if( offRefEnding ) + path = new Path<>(path, graph.getEdge(v7,postV)); // Construct the actual cigar string implied by the test path Cigar expectedCigar = new Cigar(); @@ -382,10 +339,10 @@ public class KBestPathsUnitTest extends BaseTest { "Cigar string mismatch: ref = " + ref + " alt " + new String(path.getBases())); } - @Test(enabled = !DEBUG) + @Test public void testIntraNodeInsertionDeletion() { // Construct the assembly graph - SeqGraph graph = new SeqGraph(11); + final SeqGraph graph = new SeqGraph(11); final SeqVertex top = new SeqVertex("T"); final SeqVertex bot = new SeqVertex("T"); final SeqVertex alt = new SeqVertex("AAACCCCC"); @@ -395,38 +352,38 @@ public class KBestPathsUnitTest extends BaseTest { graph.addEdges(new BaseEdge(true, 1), top, ref, bot); graph.addEdges(new BaseEdge(false, 1), top, alt, bot); - final KBestPaths pathFinder = new KBestPaths(); - final List> paths = pathFinder.getKBestPaths(graph, top, bot); + @SuppressWarnings("all") + final KBestHaplotypeFinder bestPathFinder = new KBestHaplotypeFinder(graph,top,bot); - Assert.assertEquals(paths.size(), 2); + Assert.assertEquals(bestPathFinder.size(), 2); - final Path refPath = paths.get(0); - final Path altPath = paths.get(1); + final Path refPath = bestPathFinder.get(0).path(); + final Path altPath = bestPathFinder.get(1).path(); final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString(); Assert.assertEquals(refPath.calculateCigar(refString.getBytes()).toString(), "10M"); 
Assert.assertEquals(altPath.calculateCigar(refString.getBytes()).toString(), "1M3I5M3D1M"); } - @Test(enabled = !DEBUG) + @Test public void testHardSWPath() { // Construct the assembly graph - SeqGraph graph = new SeqGraph(11); + final SeqGraph graph = new SeqGraph(11); final SeqVertex top = new SeqVertex( "NNN" ); final SeqVertex bot = new SeqVertex( "NNN" ); - final SeqVertex alt = new SeqVertex( "ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); + final SeqVertex alt = new SeqVertex( "ACAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); final SeqVertex ref = new SeqVertex( "TGTGTGTGTGTGTGACAGAGAGAGAGAGAGAGAGAGAGAGAGAGA" ); graph.addVertices(top, bot, alt, ref); graph.addEdges(new BaseEdge(true, 1), top, ref, bot); graph.addEdges(new BaseEdge(false, 1), top, alt, bot); - final KBestPaths pathFinder = new KBestPaths(); - final List> paths = pathFinder.getKBestPaths(graph, top, bot); + @SuppressWarnings("all") + final List paths = new KBestHaplotypeFinder(graph, top, bot); Assert.assertEquals(paths.size(), 2); - final Path refPath = paths.get(0); - final Path altPath = paths.get(1); + final Path refPath = paths.get(0).path(); + final Path altPath = paths.get(1).path(); final String refString = top.getSequenceString() + ref.getSequenceString() + bot.getSequenceString(); @@ -446,7 +403,7 @@ public class KBestPathsUnitTest extends BaseTest { @DataProvider(name = "SystematicRefAltSWTestData") public Object[][] makeSystematicRefAltSWTestData() { - List tests = new ArrayList(); + final List tests = new ArrayList<>(); final List> allDiffs = Arrays.asList( Arrays.asList("G", "C", "1M"), @@ -470,7 +427,7 @@ public class KBestPathsUnitTest extends BaseTest { return tests.toArray(new Object[][]{}); } - @Test(dataProvider = "SystematicRefAltSWTestData", enabled = !DEBUG) + @Test(dataProvider = "SystematicRefAltSWTestData") public void testRefAltSW(final String prefix, final String end, final String refMid, final String altMid, final String midCigar) { // Construct the 
assembly graph SeqGraph graph = new SeqGraph(11); @@ -506,7 +463,7 @@ public class KBestPathsUnitTest extends BaseTest { Assert.assertEquals(pathCigar, expected, "Cigar mismatch: ref = " + refString + " vs alt = " + new String(path.getBases())); } - @Test(enabled = !DEBUG) + @Test public void testLeftAlignCigarSequentially() { String preRefString = "GATCGATCGATC"; String postRefString = "TTT"; diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java index bb504b78c..2f44129d8 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/graphs/SharedVertexSequenceSplitterUnitTest.java @@ -61,7 +61,7 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { @DataProvider(name = "PrefixSuffixData") public Object[][] makePrefixSuffixData() { - List tests = new ArrayList(); + final List tests = new ArrayList<>(); tests.add(new Object[]{Arrays.asList("A", "C"), 0, 0}); tests.add(new Object[]{Arrays.asList("C", "C"), 1, 0}); @@ -91,7 +91,7 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { @Test(dataProvider = "PrefixSuffixData") public void testPrefixSuffix(final List strings, int expectedPrefixLen, int expectedSuffixLen) { - final List bytes = new ArrayList(); + final List bytes = new ArrayList<>(); int min = Integer.MAX_VALUE; for ( final String s : strings ) { bytes.add(s.getBytes()); @@ -107,7 +107,7 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { @Test(dataProvider = "PrefixSuffixData") public void testPrefixSuffixVertices(final List strings, int expectedPrefixLen, int 
expectedSuffixLen) { - final List v = new ArrayList(); + final List v = new ArrayList<>(); for ( final String s : strings ) { v.add(new SeqVertex(s)); } @@ -127,19 +127,18 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { public void testSplitter(final List strings, int expectedPrefixLen, int expectedSuffixLen) { final SeqGraph graph = new SeqGraph(11); - final List v = new ArrayList(); + final List v = new ArrayList<>(); for ( final String s : strings ) { v.add(new SeqVertex(s)); } - graph.addVertices(v.toArray(new SeqVertex[]{})); + graph.addVertices(v.toArray(new SeqVertex[v.size()])); final String expectedPrefix = strings.get(0).substring(0, expectedPrefixLen); final String expectedSuffix = strings.get(0).substring(strings.get(0).length() - expectedSuffixLen); final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); splitter.split(); -// splitter.splitGraph.printGraph(new File(Utils.join("_", strings) + ".dot"), 0); Assert.assertEquals(splitter.prefixV.getSequenceString(), expectedPrefix); Assert.assertEquals(splitter.suffixV.getSequenceString(), expectedSuffix); @@ -158,7 +157,7 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { @DataProvider(name = "CompleteCycleData") public Object[][] makeCompleteCycleData() { - List tests = new ArrayList(); + List tests = new ArrayList<>(); for ( final boolean hasTop : Arrays.asList(true, false) ) { for ( final boolean hasBot : Arrays.asList(true, false) ) { @@ -207,11 +206,11 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { int edgeWeight = 1; final SeqVertex top = hasTop ? new SeqVertex("AAAAAAAA") : null; final SeqVertex bot = hasBot ? 
new SeqVertex("GGGGGGGG") : null; - final List v = new ArrayList(); + final List v = new ArrayList<>(); for ( final String s : strings ) { v.add(new SeqVertex(s)); } - graph.addVertices(v.toArray(new SeqVertex[]{})); + graph.addVertices(v.toArray(new SeqVertex[v.size()])); final SeqVertex first = v.get(0); if ( hasTop ) { @@ -226,10 +225,10 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { graph.addEdge(vi, bot, new BaseEdge(vi == first, edgeWeight++)); } - final Set haplotypes = new HashSet(); - final List> originalPaths = new KBestPaths().getKBestPaths((SeqGraph)graph.clone()); - for ( final Path path : originalPaths ) - haplotypes.add(new String(path.getBases())); + final Set haplotypes = new HashSet<>(); + final List originalPaths = new KBestHaplotypeFinder((SeqGraph) graph.clone(),graph.getSources(),graph.getSinks()); + for ( final KBestHaplotype path : originalPaths ) + haplotypes.add(new String(path.bases())); final SharedVertexSequenceSplitter splitter = new SharedVertexSequenceSplitter(graph, v); splitter.split(); @@ -238,22 +237,22 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { splitter.updateGraph(top, bot); if ( PRINT_GRAPHS ) graph.printGraph(new File(Utils.join("_", strings) + ".updated.dot"), 0); - final List> splitPaths = new KBestPaths().getKBestPaths(graph); - for ( final Path path : splitPaths ) { - final String h = new String(path.getBases()); + final List splitPaths = new KBestHaplotypeFinder(graph,graph.getSources(),graph.getSinks()); + for ( final KBestHaplotype path : splitPaths ) { + final String h = new String(path.bases()); Assert.assertTrue(haplotypes.contains(h), "Failed to find haplotype " + h); } if ( splitPaths.size() == originalPaths.size() ) { for ( int i = 0; i < originalPaths.size(); i++ ) { - Assert.assertTrue(splitPaths.get(i).equalScoreAndSequence(originalPaths.get(i)), "Paths not equal " + splitPaths.get(i) + " vs. 
original " + originalPaths.get(i)); + Assert.assertTrue(splitPaths.get(i).path().equalScoreAndSequence(originalPaths.get(i).path()), "Paths not equal " + splitPaths.get(i) + " vs. original " + originalPaths.get(i)); } } } @DataProvider(name = "MeetsMinSequenceData") public Object[][] makeMeetsMinSequenceData() { - List tests = new ArrayList(); + final List tests = new ArrayList<>(); final boolean prefixBiased = SharedVertexSequenceSplitter.prefersPrefixMerging(); tests.add(new Object[]{Arrays.asList("AC", "AC"), 0, true, true}); @@ -280,9 +279,9 @@ public class SharedVertexSequenceSplitterUnitTest extends BaseTest { final SeqVertex top = new SeqVertex("AAAAAAAA"); final SeqVertex bot = new SeqVertex("GGGGGGGG"); - final List v = new ArrayList(); + final List v = new ArrayList<>(); for ( final String s : mids ) { v.add(new SeqVertex(s)); } - graph.addVertices(v.toArray(new SeqVertex[]{})); + graph.addVertices(v.toArray(new SeqVertex[v.size()])); graph.addVertices(top, bot); for ( final SeqVertex vi : v ) { graph.addEdge(top, vi); graph.addEdge(vi, bot); } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraphUnitTest.java index bab952e2a..a13bc4754 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraphUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/DanglingChainMergingGraphUnitTest.java @@ -46,10 +46,11 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller.readthreading; -import net.sf.samtools.Cigar; import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.*; 
+import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.KBestHaplotype; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.KBestHaplotypeFinder; +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.graphs.SeqGraph; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -57,7 +58,8 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.ArrayList; +import java.util.List; public class DanglingChainMergingGraphUnitTest extends BaseTest { @@ -235,7 +237,7 @@ public class DanglingChainMergingGraphUnitTest extends BaseTest { // confirm that we created the appropriate bubble in the graph only if expected rtgraph.cleanNonRefPaths(); final SeqGraph seqGraph = rtgraph.convertToSequenceGraph(); - List> paths = new KBestPaths().getKBestPaths(seqGraph, seqGraph.getReferenceSourceVertex(), seqGraph.getReferenceSinkVertex()); + final List paths = new KBestHaplotypeFinder(seqGraph, seqGraph.getReferenceSourceVertex(), seqGraph.getReferenceSinkVertex()); Assert.assertEquals(paths.size(), shouldBeMerged ? 
2 : 1); } } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java index 5b01a1d85..769026f2b 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingAssemblerUnitTest.java @@ -59,13 +59,14 @@ import java.io.File; import java.util.*; public class ReadThreadingAssemblerUnitTest extends BaseTest { + private final static boolean DEBUG = false; private static class TestAssembler { final ReadThreadingAssembler assembler; Haplotype refHaplotype; - final List reads = new LinkedList(); + final List reads = new LinkedList<>(); private TestAssembler(final int kmerSize) { this.assembler = new ReadThreadingAssembler(100000, Arrays.asList(kmerSize)); @@ -102,11 +103,11 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest { private void assertSingleBubble(final TestAssembler assembler, final String one, final String two) { final SeqGraph graph = assembler.assemble(); graph.simplifyGraph(); - List> paths = new KBestPaths().getKBestPaths(graph); + final List paths = new KBestHaplotypeFinder(graph); Assert.assertEquals(paths.size(), 2); - final Set expected = new HashSet(Arrays.asList(one, two)); - for ( final Path path : paths ) { - final String seq = new String(path.getBases()); + final Set expected = new HashSet<>(Arrays.asList(one, two)); + for ( final KBestHaplotype path : paths ) { + final String seq = new String(path.bases()); Assert.assertTrue(expected.contains(seq)); expected.remove(seq); } @@ -169,7 +170,7 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest { 
Assert.assertNotNull(graph.getReferenceSourceVertex()); Assert.assertNotNull(graph.getReferenceSinkVertex()); - final List> paths = new KBestPaths().getKBestPaths(graph); + final List paths = new KBestHaplotypeFinder(graph); Assert.assertEquals(paths.size(), 2); } @@ -226,11 +227,10 @@ public class ReadThreadingAssemblerUnitTest extends BaseTest { assembler.addSequence(ReadThreadingGraphUnitTest.getBytes(read2), false); final SeqGraph graph = assembler.assemble(); - final KBestPaths pathFinder = new KBestPaths(); - final List> paths = pathFinder.getKBestPaths(graph); + final List paths = new KBestHaplotypeFinder(graph); Assert.assertEquals(paths.size(), 2); - final byte[] refPath = paths.get(0).getBases().length == ref.length() ? paths.get(0).getBases() : paths.get(1).getBases(); - final byte[] altPath = paths.get(0).getBases().length == ref.length() ? paths.get(1).getBases() : paths.get(0).getBases(); + final byte[] refPath = paths.get(0).bases().length == ref.length() ? paths.get(0).bases() : paths.get(1).bases(); + final byte[] altPath = paths.get(0).bases().length == ref.length() ? 
paths.get(1).bases() : paths.get(0).bases(); Assert.assertEquals(refPath, ReadThreadingGraphUnitTest.getBytes(ref)); Assert.assertEquals(altPath, ReadThreadingGraphUnitTest.getBytes(read1)); } diff --git a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java index 8535c186a..c95f4002e 100644 --- a/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java +++ b/protected/gatk-protected/src/test/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/readthreading/ReadThreadingGraphUnitTest.java @@ -212,8 +212,7 @@ public class ReadThreadingGraphUnitTest extends BaseTest { rtgraph.buildGraphIfNecessary(); final SeqGraph graph = rtgraph.convertToSequenceGraph(); - final KBestPaths pathFinder = new KBestPaths<>(false); - Assert.assertEquals(pathFinder.getKBestPaths(graph, length, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1); + Assert.assertEquals(new KBestHaplotypeFinder(graph, graph.getReferenceSourceVertex(), graph.getReferenceSinkVertex()).size(), 1); } // TODO -- update to use determineKmerSizeAndNonUniques directly diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index a587a3984..df5cf91ca 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -34,7 +34,6 @@ import org.broad.tribble.TribbleException; import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; import 
org.broad.tribble.util.LittleEndianOutputStream; -import org.broad.tribble.util.TabixUtils; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -172,8 +171,8 @@ public class RMDTrackBuilder { // extends PluginManager { // we might not know the index type, try loading with the default reader constructor logger.debug("Attempting to load " + inputFile + " as a tabix indexed file without validating it"); try { - final File indexFile = new File(inputFile.getAbsoluteFile() + TabixUtils.STANDARD_INDEX_EXTENSION); - final SAMSequenceDictionary dict = TabixUtils.getSequenceDictionary(indexFile); + final File indexFile = null;//new File(inputFile.getAbsoluteFile() + TabixUtils.STANDARD_INDEX_EXTENSION); + final SAMSequenceDictionary dict = null; //TabixUtils.getSequenceDictionary(indexFile); return new Pair<>(AbstractFeatureReader.getFeatureReader(inputFile.getAbsolutePath(), createCodec(descriptor, name)), dict); } catch (TribbleException e) { throw new UserException(e.getMessage(), e); From 58905e8fe034ac8a3f0c682a12ec1a5c10e62325 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 5 Mar 2014 09:15:26 -0500 Subject: [PATCH 14/18] Disable the intermittently-failing and flawed ProgressMeterDaemonUnitTest -created a Pivotal ticket to eventually redesign this test --- .../sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java index 767646963..2ede67a3c 100644 --- a/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java +++ 
b/public/gatk-framework/src/test/java/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java @@ -92,7 +92,7 @@ public class ProgressMeterDaemonUnitTest extends BaseTest { Assert.assertTrue( meter.getRuntimeInNanosecondsUpdatedPeriodically() > currentTime, "Updating the periodic runtime failed" ); } - @Test(dataProvider = "PollingData", invocationCount = 10, successPercentage = 90) + @Test(dataProvider = "PollingData", invocationCount = 10, successPercentage = 90, enabled = false) public void testProgressMeterDaemon(final long poll, final int ticks) throws InterruptedException { final TestingProgressMeter meter = new TestingProgressMeter(poll); final ProgressMeterDaemon daemon = meter.getProgressMeterDaemon(); From b4dde6a78c6a3527dd7f80392e10b36ef1b2ca3d Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Thu, 27 Feb 2014 10:07:38 -0500 Subject: [PATCH 15/18] Add WARN to the valid log types error message - order if statements and error message in increasing severity --- .../sting/commandline/CommandLineProgram.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineProgram.java index 8c7e11f35..8b1a390f4 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -277,19 +277,19 @@ public abstract class CommandLineProgram { Level par; if (logging_level.toUpperCase().equals("DEBUG")) { par = Level.DEBUG; - } else if (logging_level.toUpperCase().equals("ERROR")) { - par = Level.ERROR; - } else if (logging_level.toUpperCase().equals("FATAL")) { - par = Level.FATAL; } else if (logging_level.toUpperCase().equals("INFO")) { par = Level.INFO; } else if 
(logging_level.toUpperCase().equals("WARN")) { par = Level.WARN; + } else if (logging_level.toUpperCase().equals("ERROR")) { + par = Level.ERROR; + } else if (logging_level.toUpperCase().equals("FATAL")) { + par = Level.FATAL; } else if (logging_level.toUpperCase().equals("OFF")) { par = Level.OFF; } else { // we don't understand the logging level, let's get out of here - throw new ArgumentException("Unable to match: " + logging_level + " to a logging level, make sure it's a valid level (INFO, DEBUG, ERROR, FATAL, OFF)"); + throw new ArgumentException("Unable to match: " + logging_level + " to a logging level, make sure it's a valid level (DEBUG, INFO, WARN, ERROR, FATAL, OFF)"); } Logger.getRootLogger().setLevel(par); From 57747ad35e3154004ad6f27df5337366ac1942ec Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Tue, 4 Mar 2014 19:40:08 -0500 Subject: [PATCH 16/18] Logger output should go to STDERR instead of STDOUT --- .../org/broadinstitute/sting/commandline/CommandLineUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineUtils.java b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineUtils.java index ddedda054..cb9a781c3 100644 --- a/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineUtils.java +++ b/public/gatk-framework/src/main/java/org/broadinstitute/sting/commandline/CommandLineUtils.java @@ -173,7 +173,7 @@ public class CommandLineUtils { } // Extracted from BasicConfigurator.configure(), but only applied to the Sting logger. 
Logger.getRootLogger().addAppender(new ConsoleAppender( - new PatternLayout(PatternLayout.TTCC_CONVERSION_PATTERN))); + new PatternLayout(PatternLayout.TTCC_CONVERSION_PATTERN), ConsoleAppender.SYSTEM_ERR)); } /** From d81116eb1d75a03b9c535a39caed5a3a4d91063c Mon Sep 17 00:00:00 2001 From: Intel Repocontact Date: Wed, 5 Mar 2014 09:30:29 -0800 Subject: [PATCH 18/18] Added vectorized PairHMM implementation by Mohammad and Mustafa into the Maven build of GATK. C++ code has PAPI calls for reading hardware counters Followed Khalid's suggestion for packing libVectorLoglessCaching into the jar file with Maven Native library part of git repo 1. Renamed directory structure from public/c++/VectorPairHMM to public/VectorPairHMM/src/main/c++ as per Khalid's suggestion 2. Use java.home in public/VectorPairHMM/pom.xml to pass environment variable JRE_HOME to the make process. This is needed because the Makefile needs to compile JNI code with the flag -I${JRE_HOME}/../include (among others). Assuming that the Maven build process uses a JDK (and not just a JRE), the variable java.home points to the JRE inside maven. 3. Dropped all pretense at cross-platform compatibility. Removed Mac profile from pom.xml for VectorPairHMM Moved JNI_README 1. Added a catch for the UnsatisfiedLinkError exception in PairHMMLikelihoodCalculationEngine.java to fall back to LOGLESS_CACHING in case the native library could not be loaded. Made VECTOR_LOGLESS_CACHING the default implementation. 2. Updated the README with Mauricio's comments 3. baseline.cc is used within the library - if the machine supports neither AVX nor SSE4.1, the native library falls back to un-vectorized C++ in baseline.cc. 4. pairhmm-1-base.cc: This is not part of the library, but is being heavily used for debugging/profiling. Can I request that we keep it there for now? In the next release, we can delete it from the repository. 5. I agree with Mauricio about the ifdefs.
I am sure you already know, but just to reassure you the debug code is not compiled into the library (because of the ifdefs) and will not affect performance. 1. Changed logger.info to logger.warn in PairHMMLikelihoodCalculationEngine.java 2. Committing the right set of files after rebase Added public license text to all C++ files Added license to Makefile Add package info to Sandbox.java Conflicts: protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/DebugJNILoglessPairHMM.java protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM.java public/VectorPairHMM/src/main/c++/.gitignore public/VectorPairHMM/src/main/c++/LoadTimeInitializer.cc public/VectorPairHMM/src/main/c++/LoadTimeInitializer.h public/VectorPairHMM/src/main/c++/Makefile public/VectorPairHMM/src/main/c++/Sandbox.cc public/VectorPairHMM/src/main/c++/Sandbox.h public/VectorPairHMM/src/main/c++/Sandbox.java public/VectorPairHMM/src/main/c++/Sandbox_JNIHaplotypeDataHolderClass.h public/VectorPairHMM/src/main/c++/Sandbox_JNIReadDataHolderClass.h public/VectorPairHMM/src/main/c++/baseline.cc public/VectorPairHMM/src/main/c++/define-double.h public/VectorPairHMM/src/main/c++/define-float.h public/VectorPairHMM/src/main/c++/define-sse-double.h public/VectorPairHMM/src/main/c++/define-sse-float.h public/VectorPairHMM/src/main/c++/headers.h public/VectorPairHMM/src/main/c++/jnidebug.h public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.cc public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.h 
public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.h public/VectorPairHMM/src/main/c++/pairhmm-template-kernel.cc public/VectorPairHMM/src/main/c++/pairhmm-template-main.cc public/VectorPairHMM/src/main/c++/run.sh public/VectorPairHMM/src/main/c++/shift_template.c public/VectorPairHMM/src/main/c++/utils.cc public/VectorPairHMM/src/main/c++/utils.h public/VectorPairHMM/src/main/c++/vector_function_prototypes.h --- ...GraphBasedLikelihoodCalculationEngine.java | 4 + .../haplotypecaller/HaplotypeCaller.java | 4 +- .../PairHMMLikelihoodCalculationEngine.java | 42 +- .../RandomLikelihoodCalculationEngine.java | 5 + .../utils/pairhmm/DebugJNILoglessPairHMM.java | 517 ++++++++++++++++++ .../utils/pairhmm/JNILoglessPairHMM.java | 63 +++ .../utils/pairhmm/VectorLoglessPairHMM.java | 335 ++++++++++++ public/VectorPairHMM/README.md | 71 +++ public/VectorPairHMM/pom.xml | 119 ++++ public/VectorPairHMM/src/main/c++/.gitignore | 16 + .../src/main/c++/LoadTimeInitializer.cc | 206 +++++++ .../src/main/c++/LoadTimeInitializer.h | 94 ++++ public/VectorPairHMM/src/main/c++/Makefile | 114 ++++ public/VectorPairHMM/src/main/c++/Sandbox.cc | 106 ++++ public/VectorPairHMM/src/main/c++/Sandbox.h | 96 ++++ .../VectorPairHMM/src/main/c++/Sandbox.java | 305 +++++++++++ .../c++/Sandbox_JNIHaplotypeDataHolderClass.h | 13 + .../main/c++/Sandbox_JNIReadDataHolderClass.h | 13 + .../main/c++/avx_function_instantiations.cc | 44 ++ public/VectorPairHMM/src/main/c++/baseline.cc | 167 ++++++ .../src/main/c++/define-double.h | 205 +++++++ .../VectorPairHMM/src/main/c++/define-float.h | 206 +++++++ .../src/main/c++/define-sse-double.h | 173 ++++++ .../src/main/c++/define-sse-float.h | 173 ++++++ public/VectorPairHMM/src/main/c++/headers.h | 71 +++ .../VectorPairHMM/src/main/c++/jni_common.h | 58 ++ public/VectorPairHMM/src/main/c++/jnidebug.h | 191 +++++++ 
...ng_utils_pairhmm_DebugJNILoglessPairHMM.cc | 176 ++++++ ...ing_utils_pairhmm_DebugJNILoglessPairHMM.h | 96 ++++ ...ting_utils_pairhmm_VectorLoglessPairHMM.cc | 382 +++++++++++++ ...sting_utils_pairhmm_VectorLoglessPairHMM.h | 104 ++++ .../src/main/c++/pairhmm-1-base.cc | 70 +++ .../src/main/c++/pairhmm-template-kernel.cc | 380 +++++++++++++ .../src/main/c++/pairhmm-template-main.cc | 114 ++++ public/VectorPairHMM/src/main/c++/run.sh | 32 ++ .../src/main/c++/shift_template.c | 113 ++++ .../main/c++/sse_function_instantiations.cc | 43 ++ public/VectorPairHMM/src/main/c++/template.h | 320 +++++++++++ public/VectorPairHMM/src/main/c++/utils.cc | 493 +++++++++++++++++ public/VectorPairHMM/src/main/c++/utils.h | 75 +++ .../VectorPairHMM/src/main/c++/vector_defs.h | 55 ++ .../src/main/c++/vector_function_prototypes.h | 44 ++ public/sting-root/pom.xml | 6 +- .../utils/pairhmm/libVectorLoglessPairHMM.so | Bin 0 -> 443803 bytes 44 files changed, 5901 insertions(+), 13 deletions(-) create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/DebugJNILoglessPairHMM.java create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java create mode 100644 protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM.java create mode 100644 public/VectorPairHMM/README.md create mode 100644 public/VectorPairHMM/pom.xml create mode 100644 public/VectorPairHMM/src/main/c++/.gitignore create mode 100644 public/VectorPairHMM/src/main/c++/LoadTimeInitializer.cc create mode 100644 public/VectorPairHMM/src/main/c++/LoadTimeInitializer.h create mode 100644 public/VectorPairHMM/src/main/c++/Makefile create mode 100644 public/VectorPairHMM/src/main/c++/Sandbox.cc create mode 100644 public/VectorPairHMM/src/main/c++/Sandbox.h create mode 100644 public/VectorPairHMM/src/main/c++/Sandbox.java create mode 100644 
public/VectorPairHMM/src/main/c++/Sandbox_JNIHaplotypeDataHolderClass.h create mode 100644 public/VectorPairHMM/src/main/c++/Sandbox_JNIReadDataHolderClass.h create mode 100644 public/VectorPairHMM/src/main/c++/avx_function_instantiations.cc create mode 100644 public/VectorPairHMM/src/main/c++/baseline.cc create mode 100644 public/VectorPairHMM/src/main/c++/define-double.h create mode 100644 public/VectorPairHMM/src/main/c++/define-float.h create mode 100644 public/VectorPairHMM/src/main/c++/define-sse-double.h create mode 100644 public/VectorPairHMM/src/main/c++/define-sse-float.h create mode 100644 public/VectorPairHMM/src/main/c++/headers.h create mode 100644 public/VectorPairHMM/src/main/c++/jni_common.h create mode 100644 public/VectorPairHMM/src/main/c++/jnidebug.h create mode 100644 public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.cc create mode 100644 public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.h create mode 100644 public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc create mode 100644 public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.h create mode 100644 public/VectorPairHMM/src/main/c++/pairhmm-1-base.cc create mode 100644 public/VectorPairHMM/src/main/c++/pairhmm-template-kernel.cc create mode 100644 public/VectorPairHMM/src/main/c++/pairhmm-template-main.cc create mode 100755 public/VectorPairHMM/src/main/c++/run.sh create mode 100644 public/VectorPairHMM/src/main/c++/shift_template.c create mode 100644 public/VectorPairHMM/src/main/c++/sse_function_instantiations.cc create mode 100644 public/VectorPairHMM/src/main/c++/template.h create mode 100644 public/VectorPairHMM/src/main/c++/utils.cc create mode 100644 public/VectorPairHMM/src/main/c++/utils.h create mode 100644 public/VectorPairHMM/src/main/c++/vector_defs.h create mode 100644 
public/VectorPairHMM/src/main/c++/vector_function_prototypes.h create mode 100644 public/sting-utils/src/main/resources/org/broadinstitute/sting/utils/pairhmm/libVectorLoglessPairHMM.so diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java index 8a35ccb05..8b37e265d 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GraphBasedLikelihoodCalculationEngine.java @@ -155,4 +155,8 @@ public class GraphBasedLikelihoodCalculationEngine implements LikelihoodCalculat } } } + + @Override + public void close() { + } } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 91e763a0d..33dfa54ce 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -433,7 +433,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Hidden @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false) - public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING; + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.VECTOR_LOGLESS_CACHING; @Hidden @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read 
group when making calls (but use all reads to build the assembly)", required = false) @@ -1051,7 +1051,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In referenceConfidenceModel.close(); //TODO remove the need to call close here for debugging, the likelihood output stream should be managed //TODO (open & close) at the walker, not the engine. - //likelihoodCalculationEngine.close(); + likelihoodCalculationEngine.close(); logger.info("Ran local assembly on " + result + " active regions"); } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java index 55a1c5dba..7a526f32e 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/PairHMMLikelihoodCalculationEngine.java @@ -86,17 +86,29 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation case EXACT: return new Log10PairHMM(true); case ORIGINAL: return new Log10PairHMM(false); case LOGLESS_CACHING: - if (noFpga || !CnyPairHMM.isAvailable()) - return new LoglessPairHMM(); - else - return new CnyPairHMM(); + if (noFpga || !CnyPairHMM.isAvailable()) + return new LoglessPairHMM(); + else + return new CnyPairHMM(); + case VECTOR_LOGLESS_CACHING: + try + { + return new VectorLoglessPairHMM(); + } + catch(UnsatisfiedLinkError ule) + { + logger.warn("Failed to load native library for VectorLoglessPairHMM - using Java implementation of LOGLESS_CACHING"); + return new LoglessPairHMM(); + } + case DEBUG_VECTOR_LOGLESS_CACHING: + return new DebugJNILoglessPairHMM(PairHMM.HMM_IMPLEMENTATION.VECTOR_LOGLESS_CACHING); case ARRAY_LOGLESS: - if (noFpga || !CnyPairHMM.isAvailable()) - 
return new ArrayLoglessPairHMM(); - else - return new CnyPairHMM(); + if (noFpga || !CnyPairHMM.isAvailable()) + return new ArrayLoglessPairHMM(); + else + return new CnyPairHMM(); default: - throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, LOGLESS_CACHING, and ARRAY_LOGLESS."); + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, LOGLESS_CACHING, and ARRAY_LOGLESS."); } } }; @@ -164,8 +176,10 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation public void close() { if ( likelihoodsStream != null ) likelihoodsStream.close(); + pairHMMThreadLocal.get().close(); } + private void writeDebugLikelihoods(final GATKSAMRecord processedRead, final Haplotype haplotype, final double log10l){ if ( WRITE_LIKELIHOODS_TO_FILE ) { likelihoodsStream.printf("%s %s %s %s %s %s %f%n", @@ -327,7 +341,13 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation } // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases - pairHMMThreadLocal.get().initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); + pairHMMThreadLocal.get().initialize(haplotypes, perSampleReadList, X_METRIC_LENGTH, Y_METRIC_LENGTH); + } + + private void finalizePairHMM() + { + pairHMMThreadLocal.get().finalizeRegion(); +>>>>>>> d968ca6... Added vectorized PairHMM implementation by Mohammad and Mustafa into the Maven build of GATK. 
} @@ -347,6 +367,8 @@ public class PairHMMLikelihoodCalculationEngine implements LikelihoodCalculation map.filterPoorlyModelledReads(EXPECTED_ERROR_RATE_PER_BASE); stratifiedReadMap.put(sampleEntry.getKey(), map); } + //Used mostly by the JNI implementation(s) to free arrays + finalizePairHMM(); return stratifiedReadMap; } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java index b8dba7b86..d5d424ca9 100644 --- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/haplotypecaller/RandomLikelihoodCalculationEngine.java @@ -79,4 +79,9 @@ public class RandomLikelihoodCalculationEngine implements LikelihoodCalculationE return result; } + + @Override + public void close() { + } + } diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/DebugJNILoglessPairHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/DebugJNILoglessPairHMM.java new file mode 100644 index 000000000..ea93ebe4a --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/DebugJNILoglessPairHMM.java @@ -0,0 +1,517 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). 
+* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. 
Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. 
INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. 
LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. 
This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.QualityUtils; + +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.sting.utils.exceptions.UserException; +import static org.broadinstitute.sting.utils.pairhmm.PairHMMModel.*; + +import java.util.List; +import java.util.Map; +import java.util.HashMap; +import java.io.File; +import java.io.FileWriter; +import java.io.BufferedWriter; +import java.util.Map; +import java.util.HashMap; +import java.io.IOException; + + +/** + * Created with IntelliJ IDEA. 
+ * User: rpoplin, carneiro + * Date: 10/16/12 + */ +public class DebugJNILoglessPairHMM extends LoglessPairHMM { + + private static final boolean dumpSandboxOnly = false; //simulates ifdef + private static final boolean debug = false; //simulates ifdef + private static final boolean verify = !dumpSandboxOnly && (debug || true); //simulates ifdef + private static final boolean debug0_1 = false; //simulates ifdef + private static final boolean debug1 = false; //simulates ifdef + private static final boolean debug2 = false; + private static final boolean debug3 = false; + + //Debugging stats + private int numCalls = 0; + private int numComputeLikelihoodCalls = 0; + protected HashMap filenameToWriter = new HashMap(); + + private JNILoglessPairHMM jniPairHMM = null; + public DebugJNILoglessPairHMM(final PairHMM.HMM_IMPLEMENTATION hmmType) { + super(); + switch(hmmType) { + case VECTOR_LOGLESS_CACHING: + jniPairHMM = new VectorLoglessPairHMM(); + break; + default: + throw new UserException.BadArgumentValue("pairHMM","Specified JNIPairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. 
Acceptable options are VECTOR_LOGLESS_CACHING"); + } + } + + @Override + public void close() + { + jniPairHMM.close(); + debugClose(); + } + + //Used only when testing parts of the compute kernel + /** + * {@inheritDoc} + */ + @Override + public void initialize( final int readMaxLength, final int haplotypeMaxLength ) { + if(verify) + super.initialize(readMaxLength, haplotypeMaxLength); + if(debug3) + { + System.out.println("Java: alloc initialized readMaxLength : "+readMaxLength+" haplotypeMaxLength : "+haplotypeMaxLength); + debugDump("lengths_java.txt", String.format("%d %d\n",readMaxLength, haplotypeMaxLength), + true); + } + if(debug2) + jniInitialize(readMaxLength, haplotypeMaxLength); + } + + private HashMap haplotypeToHaplotypeListIdxMap = null; + //Used to transfer data to JNI + //Since the haplotypes are the same for all calls to computeLikelihoods within a region, transfer the haplotypes only once to the JNI per region + /** + * {@inheritDoc} + */ + @Override + public void initialize( final List haplotypes, final Map> perSampleReadList, + final int readMaxLength, final int haplotypeMaxLength ) { + if(verify) + { + super.initialize(haplotypes, perSampleReadList, readMaxLength, haplotypeMaxLength); + jniPairHMM.initialize(haplotypes, perSampleReadList, readMaxLength, haplotypeMaxLength); + haplotypeToHaplotypeListIdxMap = jniPairHMM.getHaplotypeToHaplotypeListIdxMap(); + } + } + + /** + * {@inheritDoc} + */ + @Override + public void finalizeRegion() + { + if(!dumpSandboxOnly) + jniPairHMM.finalizeRegion(); + } + + /** + * {@inheritDoc} + */ + @Override + public PerReadAlleleLikelihoodMap computeLikelihoods( final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap ) { + // (re)initialize the pairHMM only if necessary + final int readMaxLength = verify ? findMaxReadLength(reads) : 0; + final int haplotypeMaxLength = verify ? 
findMaxHaplotypeLength(alleleHaplotypeMap) : 0; + if(verify) + { + if (!initialized || readMaxLength > maxReadLength || haplotypeMaxLength > maxHaplotypeLength) + { initialize(readMaxLength, haplotypeMaxLength); } + if ( ! initialized ) + throw new IllegalStateException("Must call initialize before calling jniComputeLikelihoods in debug/verify mode"); + } + int readListSize = reads.size(); + int numHaplotypes = alleleHaplotypeMap.size(); + int numTestcases = readListSize*numHaplotypes; + if(debug0_1) + System.out.println("Java numReads "+readListSize+" numHaplotypes "+numHaplotypes); + int idx = 0; + for(GATKSAMRecord read : reads) + { + byte [] overallGCP = GCPArrayMap.get(read); + if(debug0_1) + System.out.println("Java read length "+read.getReadBases().length); + if(debug3) + { + for(int i=0;i currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always + { + byte[] haplotypeBases = currEntry.getValue().getBases(); + if(debug0_1) + System.out.println("Java haplotype length "+haplotypeBases.length); + if(debug3) + { + for(int i=0;i currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always + { + idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue()); + likelihoodArray[idx] = tmpArray[idxInsideHaplotypeList]; + ++idx; + } + readIdx += numHaplotypes; + } + //for floating point values, no exact equality + //check whether numbers are close in terms of abs_error or relative_error + //For very large values, relative_error is relevant + //For very small values, abs_error is relevant + for(int i=0;i 1e-5 && relative_error > 1e-5) + { + toDump = true; + break; + } + } + } + //if numbers are not close, then dump out the data that produced the inconsistency + if(toDump) + { + idx = 0; + System.out.println("Dump : Java numReads "+readListSize+" numHaplotypes "+numHaplotypes); + boolean firstLine = true; + for(GATKSAMRecord read : reads) + { + byte [] overallGCP = 
GCPArrayMap.get(read); + byte[] tmpByteArray = new byte[read.getReadBases().length]; + for (Map.Entry currEntry : alleleHaplotypeMap.entrySet()) //order is important - access in same order always + { + byte[] haplotypeBases = currEntry.getValue().getBases(); + debugDump("debug_dump.txt",new String(haplotypeBases)+" ",true); + debugDump("debug_dump.txt",new String(read.getReadBases())+" ",true); + for(int k=0;k currEntry : filenameToWriter.entrySet()) { + BufferedWriter currWriter = currEntry.getValue(); + try + { + currWriter.flush(); + currWriter.close(); + } + catch(IOException e) + { + e.printStackTrace(); + + } + } + filenameToWriter.clear(); + } +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java new file mode 100644 index 000000000..f039cc295 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/JNILoglessPairHMM.java @@ -0,0 +1,63 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. 
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. 
LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. 
NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.haplotype.Haplotype; + +import java.util.HashMap; + + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ +public abstract class JNILoglessPairHMM extends LoglessPairHMM { + public abstract HashMap getHaplotypeToHaplotypeListIdxMap(); + protected long setupTime = 0; + +} diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM.java new file mode 100644 index 000000000..a32768c20 --- /dev/null +++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM.java @@ -0,0 +1,335 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. 
DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. 
LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. 
BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. 
Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. 
+*/ + +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.QualityUtils; + +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.sting.utils.haplotype.Haplotype; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.variant.variantcontext.Allele; + +import java.util.List; +import java.util.Map; +import java.util.HashMap; + +//For loading library from jar +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin, carneiro + * Date: 10/16/12 + */ +public class VectorLoglessPairHMM extends JNILoglessPairHMM { + + //For machine capabilities + public static final long sse41Mask = 1; + public static final long sse42Mask = 2; + public static final long avxMask = 4; + public static final long enableAll = 0xFFFFFFFFFFFFFFFFl; + + //Used to copy references to byteArrays to JNI from reads + protected class JNIReadDataHolderClass { + public byte[] readBases = null; + public byte[] readQuals = null; + public byte[] insertionGOP = null; + public byte[] deletionGOP = null; + public byte[] overallGCP = null; + } + + //Used to copy references to byteArrays to JNI from haplotypes + protected class JNIHaplotypeDataHolderClass { + public byte[] haplotypeBases = null; + } + + /** + * Return 64-bit mask representing machine capabilities + * Bit 0 is LSB, bit 63 MSB + * Bit 0 represents sse4.1 availability + * Bit 1 represents sse4.2 availability + * Bit 2 represents AVX availability + */ + public native long jniGetMachineType(); + + /** + * Function to initialize the fields of JNIReadDataHolderClass and JNIHaplotypeDataHolderClass from JVM. 
+ * C++ code gets FieldIDs for these classes once and re-uses these IDs for the remainder of the program. Field IDs do not + * change per JVM session + * @param readDataHolderClass class type of JNIReadDataHolderClass + * @param haplotypeDataHolderClass class type of JNIHaplotypeDataHolderClass + * @param mask mask is a 64 bit integer identical to the one received from jniGetMachineType(). Users can disable usage of some hardware features by zeroing some bits in the mask + * */ + private native void jniInitializeClassFieldsAndMachineMask(Class readDataHolderClass, Class haplotypeDataHolderClass, long mask); + + private static Boolean isVectorLoglessPairHMMLibraryLoaded = false; + //The constructor is called only once inside PairHMMLikelihoodCalculationEngine + public VectorLoglessPairHMM() { + super(); + synchronized(isVectorLoglessPairHMMLibraryLoaded) { + //Load the library and initialize the FieldIDs + if(!isVectorLoglessPairHMMLibraryLoaded) { + try + { + //Try loading from Java's library path first + //Useful if someone builds his/her own library and wants to override the bundled + //implementation without modifying the Java code + System.loadLibrary("VectorLoglessPairHMM"); + logger.info("libVectorLoglessPairHMM found in JVM library path"); + } + catch(UnsatisfiedLinkError ule) + { + //Could not load from Java's library path - try unpacking from jar + try + { + logger.info("libVectorLoglessPairHMM not found in JVM library path - trying to unpack from StingUtils.jar"); + loadLibraryFromJar("/org/broadinstitute/sting/utils/pairhmm/libVectorLoglessPairHMM.so"); + logger.info("libVectorLoglessPairHMM unpacked successfully from StingUtils.jar"); + } + catch(IOException ioe) + { + //Throw the UnsatisfiedLinkError to make it clear to the user what failed + throw ule; + } + } + + isVectorLoglessPairHMMLibraryLoaded = true; + jniInitializeClassFieldsAndMachineMask(JNIReadDataHolderClass.class, JNIHaplotypeDataHolderClass.class, enableAll); //need to do this only once + 
} + } + } + + private native void jniInitializeHaplotypes(final int numHaplotypes, JNIHaplotypeDataHolderClass[] haplotypeDataArray); + //Hold the mapping between haplotype and index in the list of Haplotypes passed to initialize + //Use this mapping in computeLikelihoods to find the likelihood value corresponding to a given Haplotype + private HashMap haplotypeToHaplotypeListIdxMap = new HashMap(); + @Override + public HashMap getHaplotypeToHaplotypeListIdxMap() { return haplotypeToHaplotypeListIdxMap; } + + //Used to transfer data to JNI + //Since the haplotypes are the same for all calls to computeLikelihoods within a region, transfer the haplotypes only once to the JNI per region + /** + * {@inheritDoc} + */ + @Override + public void initialize( final List haplotypes, final Map> perSampleReadList, + final int readMaxLength, final int haplotypeMaxLength ) { + int numHaplotypes = haplotypes.size(); + JNIHaplotypeDataHolderClass[] haplotypeDataArray = new JNIHaplotypeDataHolderClass[numHaplotypes]; + int idx = 0; + haplotypeToHaplotypeListIdxMap.clear(); + for(final Haplotype currHaplotype : haplotypes) + { + haplotypeDataArray[idx] = new JNIHaplotypeDataHolderClass(); + haplotypeDataArray[idx].haplotypeBases = currHaplotype.getBases(); + haplotypeToHaplotypeListIdxMap.put(currHaplotype, idx); + ++idx; + } + jniInitializeHaplotypes(numHaplotypes, haplotypeDataArray); + } + /** + * Tell JNI to release arrays - really important if native code is directly accessing Java memory, if not + * accessing Java memory directly, still important to release memory from C++ + */ + private native void jniFinalizeRegion(); + + /** + * {@inheritDoc} + */ + @Override + public void finalizeRegion() + { + jniFinalizeRegion(); + } + + /** + * Real compute kernel + */ + private native void jniComputeLikelihoods(int numReads, int numHaplotypes, JNIReadDataHolderClass[] readDataArray, + JNIHaplotypeDataHolderClass[] haplotypeDataArray, double[] likelihoodArray, int maxNumThreadsToUse); + 
/** + * {@inheritDoc} + */ + @Override + public PerReadAlleleLikelihoodMap computeLikelihoods( final List reads, final Map alleleHaplotypeMap, final Map GCPArrayMap ) { + if(doProfiling) + startTime = System.nanoTime(); + int readListSize = reads.size(); + int numHaplotypes = alleleHaplotypeMap.size(); + int numTestcases = readListSize*numHaplotypes; + JNIReadDataHolderClass[] readDataArray = new JNIReadDataHolderClass[readListSize]; + int idx = 0; + for(GATKSAMRecord read : reads) + { + readDataArray[idx] = new JNIReadDataHolderClass(); + readDataArray[idx].readBases = read.getReadBases(); + readDataArray[idx].readQuals = read.getBaseQualities(); + readDataArray[idx].insertionGOP = read.getBaseInsertionQualities(); + readDataArray[idx].deletionGOP = read.getBaseDeletionQualities(); + readDataArray[idx].overallGCP = GCPArrayMap.get(read); + ++idx; + } + + mLikelihoodArray = new double[readListSize*numHaplotypes]; //to store results + if(doProfiling) + setupTime += (System.nanoTime() - startTime); + //for(reads) + // for(haplotypes) + // compute_full_prob() + jniComputeLikelihoods(readListSize, numHaplotypes, readDataArray, null, mLikelihoodArray, 12); + + final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap(); + idx = 0; + int idxInsideHaplotypeList = 0; + int readIdx = 0; + for(GATKSAMRecord read : reads) + { + for (Map.Entry currEntry : alleleHaplotypeMap.entrySet())//order is important - access in same order always + { + //Since the order of haplotypes in the List and alleleHaplotypeMap is different, + //get idx of current haplotype in the list and use this idx to get the right likelihoodValue + idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(currEntry.getValue()); + likelihoodMap.add(read, currEntry.getKey(), mLikelihoodArray[readIdx + idxInsideHaplotypeList]); + ++idx; + } + readIdx += numHaplotypes; + } + if(doProfiling) + computeTime += (System.nanoTime() - startTime); + return likelihoodMap; + } + + /** + * Print final 
profiling information from native code + */ + public native void jniClose(); + @Override + public void close() + { + System.out.println("Time spent in setup for JNI call : "+(setupTime*1e-9)); + super.close(); + jniClose(); + } + + //Copied from http://frommyplayground.com/how-to-load-native-jni-library-from-jar + /** + * Loads library from current JAR archive + * + * The file from JAR is copied into system temporary directory and then loaded. The temporary file is deleted after exiting. + * Method uses String as filename because the pathname is "abstract", not system-dependent. + * + * @param path The filename inside JAR as absolute path (beginning with '/'), e.g. /package/File.ext + * @throws IOException If temporary file creation or read/write operation fails + * @throws FileNotFoundException If source file (param path) does not exist + * @throws IllegalArgumentException If the path is not absolute or if the filename is shorter than three characters (restriction of {@link File#createTempFile(java.lang.String, java.lang.String)}). + */ + public static void loadLibraryFromJar(String path) throws IOException { + + if (!path.startsWith("/")) { + throw new IllegalArgumentException("The path to be absolute (start with '/')."); + } + + // Obtain filename from path + String[] parts = path.split("/"); + String filename = (parts.length > 1) ? parts[parts.length - 1] : null; + + // Split filename to prefix and suffix (extension) + String prefix = ""; + String suffix = null; + if (filename != null) { + parts = filename.split("\\.", 2); + prefix = parts[0]; + suffix = (parts.length > 1) ? "."+parts[parts.length - 1] : null; // Thanks, davs! 
:-) + } + + // Check if the filename is okay + if (filename == null || prefix.length() < 3) { + throw new IllegalArgumentException("The filename has to be at least 3 characters long."); + } + + // Prepare temporary file + File temp = File.createTempFile(prefix, suffix); + //System.out.println("Temp lib file "+temp.getAbsolutePath()); + temp.deleteOnExit(); + + if (!temp.exists()) { + throw new FileNotFoundException("File " + temp.getAbsolutePath() + " does not exist."); + } + + // Prepare buffer for data copying + byte[] buffer = new byte[1024]; + int readBytes; + + // Open and check input stream + InputStream is = VectorLoglessPairHMM.class.getResourceAsStream(path); + if (is == null) { + throw new FileNotFoundException("File " + path + " was not found inside JAR."); + } + + // Open output stream and copy data between source file in JAR and the temporary file + OutputStream os = new FileOutputStream(temp); + try { + while ((readBytes = is.read(buffer)) != -1) { + os.write(buffer, 0, readBytes); + } + } finally { + // If read/write fails, close streams safely before throwing an exception + os.close(); + is.close(); + } + + // Finally, load the library + System.load(temp.getAbsolutePath()); + } +} diff --git a/public/VectorPairHMM/README.md b/public/VectorPairHMM/README.md new file mode 100644 index 000000000..85cc0a04a --- /dev/null +++ b/public/VectorPairHMM/README.md @@ -0,0 +1,71 @@ +Implementation overview: +Created a new Java class called VectorLoglessPairHMM which extends LoglessPairHMM and +overrides functions from both LoglessPairHMM and PairHMM. +1. Constructor: Call base class constructors. Then, load the native library located in this +directory and call an init function (with suffix 'jniInitializeClassFieldsAndMachineMask') in the +library to determine field ids for the members of classes JNIReadDataHolderClass and +JNIHaplotypeDataHolderClass. The native code stores the field ids (struct offsets) for the classes and +re-uses them for subsequent computations. 
Optionally, the user can disable the vector +implementation, by using the 'mask' argument (see comments for a more detailed explanation). +2. When the library is loaded, it invokes the constructor of the class LoadTimeInitializer (because +a global variable g_load_time_initializer is declared in the library). This constructor +(LoadTimeInitializer.cc) can be used to perform various initializations. Currently, it initializes +two global function pointers to point to the function implementation that is supported on the +machine (AVX/SSE/un-vectorized) on which the program is being run. The two pointers are for float +and double respectively. The global function pointers are declared in utils.cc and are assigned in +the function initialize_function_pointers() defined in utils.cc and invoked from the constructor of +LoadTimeInitializer. +Other initializations in LoadTimeInitializer: +* ConvertChar::init - sets some masks for the vector implementation +* FTZ for performance +* stat counters = 0 +* debug structs (which are never used in non-debug mode) +This initialization is done only once for the whole program. +3. initialize(): To initialize the region for PairHMM. Pass haplotype bases to native code through +the JNIHaplotypeDataHolder class. Since the haplotype list is common across multiple samples in +computeReadLikelihoods(), we can pass the haplotype bases to the native code once and re-use across +multiple samples. +4. computeLikelihoods(): Copies array references for readBases/quals etc to array of +JNIReadDataHolder objects. Invokes the JNI function to perform the computation and updates the +likelihoodMap. +The JNI function copies the byte array references into an array of testcase structs and invokes the +compute_full_prob function through the function pointers initialized earlier. +The primary native function called is +Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniComputeLikelihoods. 
It uses +standard JNI calls to get and return data from/to the Java class VectorLoglessPairHMM. The last +argument to the function is the maximum number of OpenMP threads to use while computing PairHMM in +C++. This option is set when the native function call is made from JNILoglessPairHMM +computeLikelihoods - currently it is set to 12 (no logical reason). +Note: OpenMP has been disabled for now - insufficient #testcases per call to computeLikelihoods() to +justify multi-threading. +5. finalizeRegion(): Releases the haplotype arrays initialized in step 3 - should be called at the +end of every region (line 351 in PairHMMLikelihoodCalculationEngine). + +Note: Debug code has been moved to a separate class DebugJNILoglessPairHMM.java. + +Compiling: +Make sure you have icc (Intel C compiler) available. Currently, gcc does not seem to support all AVX +intrinsics. +This native library is called libVectorLoglessPairHMM.so +Using Maven: +Type 'mvn install' in this directory - this will build the library (by invoking 'make') and copy the +native library to the directory +${sting-utils.basedir}/src/main/resources/org/broadinstitute/sting/utils/pairhmm +The GATK maven build process (when run) will bundle the library into the StingUtils jar file from +the copied directory. +Simple build: +cd src/main/c++ +make + +Running: +The default implementation of PairHMM is now VECTOR_LOGLESS_CACHING in HaplotypeCaller.java. To use +the Java version, use the command line argument "--pair_hmm_implementation LOGLESS_CACHING". (see +run.sh in src/main/c++). +The native library is bundled with the StingUtils jar file. When HaplotypeCaller is invoked, then +the library is unpacked from the jar file, copied to the /tmp directory (with a unique id) and +loaded by the Java class VectorLoglessPairHMM in the constructor (if it has not been loaded +already). 
+The default library can be overridden by using the -Djava.library.path argument (see +src/main/c++/run.sh for an example) for the JVM to pass the path to the library. If the library +libVectorLoglessPairHMM.so can be found in java.library.path, then it is loaded and the 'packed' +library is not used. diff --git a/public/VectorPairHMM/pom.xml b/public/VectorPairHMM/pom.xml new file mode 100644 index 000000000..41bb73211 --- /dev/null +++ b/public/VectorPairHMM/pom.xml @@ -0,0 +1,119 @@ + + + 4.0.0 + + + org.broadinstitute.sting + sting-root + 2.8-SNAPSHOT + ../../public/sting-root + + + VectorPairHMM + pom + Vectorized PairHMM native libraries + + Builds a GNU/Linux x86_64 library of VectorPairHMM using icc (Intel C++ compiler). During install, copies it into sting-utils. Neither tested nor expected to work on any other platform. + + + UTF-8 + ${sourceEncoding} + ${sourceEncoding} + ${project.basedir}/../.. + ${sting.basedir}/public/sting-utils + + ${sting-utils.basedir}/src/main/resources/org/broadinstitute/sting/utils/pairhmm + + + + + + + + org.apache.maven.plugins + maven-enforcer-plugin + + + + display-info + + validate + + + + + + + org.codehaus.mojo + exec-maven-plugin + + + + exec + + compile + + make + src/main/c++ + + ${java.home} + ${project.build.directory} + + + + + + + + + org.apache.maven.plugins + maven-install-plugin + + true + + + + + + org.apache.maven.plugins + maven-resources-plugin + + + default-install + + copy-resources + + install + + ${pairhmm.resources.directory} + + + ${project.build.directory} + + **/* + + + + + + + + + + + com.google.code.sortpom + maven-sortpom-plugin + + false + custom_1 + \n + ${sourceEncoding} + true + scope + 4 + false + + + + + diff --git a/public/VectorPairHMM/src/main/c++/.gitignore b/public/VectorPairHMM/src/main/c++/.gitignore new file mode 100644 index 000000000..d791ffd80 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/.gitignore @@ -0,0 +1,16 @@ +.svn +*.o +*.so +tests +.deps +hmm_Mohammad 
+pairhmm-template-main +*.swp +*.class +checker +reformat +subdir_checkout.sh +avx/ +sse/ +triplicate.sh + diff --git a/public/VectorPairHMM/src/main/c++/LoadTimeInitializer.cc b/public/VectorPairHMM/src/main/c++/LoadTimeInitializer.cc new file mode 100644 index 000000000..0e3026f65 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/LoadTimeInitializer.cc @@ -0,0 +1,206 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + + +#include "LoadTimeInitializer.h" +#include "utils.h" +using namespace std; +char* LoadTimeInitializerStatsNames[] = +{ + "num_regions", + "num_reads", + "num_haplotypes", + "num_testcases", + "num_double_invocations", + "haplotype_length", + "readlength", + "product_read_length_haplotype_length", + "dummy" +}; + +LoadTimeInitializer g_load_time_initializer; + +LoadTimeInitializer::LoadTimeInitializer() //will be called when library is loaded +{ + ConvertChar::init(); +#ifndef DISABLE_FTZ + //Very important to get good performance on Intel processors + //Function: enabling FTZ converts denormals to 0 in hardware + //Denormals cause microcode to insert uops into the core causing big slowdown + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + cout << "FTZ enabled - may decrease accuracy if denormal numbers encountered\n"; +#else + cout << "FTZ is not set - may slow down performance if denormal numbers encountered\n"; +#endif + //Profiling: times for compute and transfer (either bytes copied or pointers copied) + m_compute_time = 0; + m_data_transfer_time = 0; + m_bytes_copied = 0; + + //Initialize profiling counters + for(unsigned i=0;i::initializeStaticMembers(); + Context::initializeStaticMembers(); + + cout.flush(); +} + +void LoadTimeInitializer::print_profiling() +{ + double mean = 0; + double variance = 0; + uint64_t denominator = 1; + cout << "Time spent in compute_testcases "< C++) "<::iterator mI = m_filename_to_fptr.find(filename); + ofstream* fptr = 0; + if(mI == m_filename_to_fptr.end()) + { + m_filename_to_fptr[filename] = new ofstream(); + fptr = m_filename_to_fptr[filename]; + //File never seen before + if(m_written_files_set.find(filename) == m_written_files_set.end()) + { + to_append = false; + m_written_files_set.insert(filename); + } + fptr->open(filename.c_str(), to_append ? ios::app : ios::out); + assert(fptr->is_open()); + } + else + fptr = (*mI).second; + //ofstream fptr; + //fptr.open(filename.c_str(), to_append ? 
ofstream::app : ofstream::out); + (*fptr) << s; + if(add_newline) + (*fptr) << "\n"; + //fptr.close(); +} +void LoadTimeInitializer::debug_close() +{ + for(map::iterator mB = m_filename_to_fptr.begin(), mE = m_filename_to_fptr.end(); + mB != mE;mB++) + { + (*mB).second->close(); + delete (*mB).second; + } + m_filename_to_fptr.clear(); +} + +void LoadTimeInitializer::dump_sandbox(testcase& tc, unsigned tc_idx, unsigned numReads, unsigned numHaplotypes) +{ + unsigned haplotypeLength = tc.haplen; + unsigned readLength = tc.rslen; + ofstream& dumpFptr = m_sandbox_fptr; + for(unsigned k=0;k +#include "template.h" + +enum LoadTimeInitializerStatsEnum +{ + NUM_REGIONS_IDX=0, + NUM_READS_IDX, + NUM_HAPLOTYPES_IDX, + NUM_TESTCASES_IDX, + NUM_DOUBLE_INVOCATIONS_IDX, + HAPLOTYPE_LENGTH_IDX, + READ_LENGTH_IDX, + PRODUCT_READ_LENGTH_HAPLOTYPE_LENGTH_IDX, + TOTAL_NUMBER_STATS +}; +extern char* LoadTimeInitializerStatsNames[]; + +class LoadTimeInitializer +{ + public: + LoadTimeInitializer(); //will be called when library is loaded + ~LoadTimeInitializer() + { + delete m_buffer; + } + void print_profiling(); + void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true); + void debug_close(); + + void dump_sandbox(testcase& tc, unsigned tc_idx, unsigned numReads, unsigned numHaplotypes); + void open_sandbox() { m_sandbox_fptr.open("sandbox.txt", std::ios::app); } + void close_sandbox() { m_sandbox_fptr.close(); } + + jfieldID m_readBasesFID; + jfieldID m_readQualsFID; + jfieldID m_insertionGOPFID; + jfieldID m_deletionGOPFID; + jfieldID m_overallGCPFID; + jfieldID m_haplotypeBasesFID; + //profiling - update stats + void update_stat(LoadTimeInitializerStatsEnum stat_idx, uint64_t value); + //timing in nanoseconds + uint64_t m_compute_time; + uint64_t m_data_transfer_time; + //bytes copied + uint64_t m_bytes_copied; + unsigned get_buffer_size() { return m_buffer_size; } + char* get_buffer() { return (char*)m_buffer; } + private: + std::map 
m_filename_to_fptr; + std::set m_written_files_set; + std::ofstream m_sandbox_fptr; + //used to compute various stats + uint64_t m_sum_stats[TOTAL_NUMBER_STATS]; + double m_sum_square_stats[TOTAL_NUMBER_STATS]; + uint64_t m_min_stats[TOTAL_NUMBER_STATS]; + uint64_t m_max_stats[TOTAL_NUMBER_STATS]; + unsigned m_buffer_size; + uint64_t* m_buffer; +}; +extern LoadTimeInitializer g_load_time_initializer; + +#define SIZE_PER_TESTCASE 6*10000 +#define SIZE_PER_BUFFER 10000 + +#endif diff --git a/public/VectorPairHMM/src/main/c++/Makefile b/public/VectorPairHMM/src/main/c++/Makefile new file mode 100644 index 000000000..354bca0bb --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/Makefile @@ -0,0 +1,114 @@ +#Copyright (c) 2012 The Broad Institute + +#Permission is hereby granted, free of charge, to any person +#obtaining a copy of this software and associated documentation +#files (the "Software"), to deal in the Software without +#restriction, including without limitation the rights to use, +#copy, modify, merge, publish, distribute, sublicense, and/or sell +#copies of the Software, and to permit persons to whom the +#Software is furnished to do so, subject to the following +#conditions: + +#The above copyright notice and this permission notice shall be +#included in all copies or substantial portions of the Software. + +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +#EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +#OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +#NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +#WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +#FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +#THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+# + + +#OMPCFLAGS=-fopenmp +#OMPLFLAGS=-fopenmp #-openmp-link static + +#CFLAGS=-O2 -std=c++11 -W -Wall -march=corei7-avx -Wa,-q -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas +#CFLAGS=-O2 -W -Wall -march=corei7 -mfpmath=sse -msse4.2 -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas + +JRE_HOME?=/opt/jdk1.7.0_25/jre +JNI_COMPILATION_FLAGS=-D_REENTRANT -fPIC -I${JRE_HOME}/../include -I${JRE_HOME}/../include/linux + +COMMON_COMPILATION_FLAGS=$(JNI_COMPILATION_FLAGS) -O3 -W -Wall -pedantic $(OMPCFLAGS) -Wno-unknown-pragmas +CC=icc +CXX=icc + +LDFLAGS=-lm -lrt $(OMPLDFLAGS) +ifdef DISABLE_FTZ + COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz +endif + +PAPI_DIR=/home/karthikg/softwares/papi-5.3.0 +ifdef USE_PAPI + ifeq ($(USE_PAPI),1) + COMMON_COMPILATION_FLAGS+=-I$(PAPI_DIR)/include -DUSE_PAPI + LDFLAGS+=-L$(PAPI_DIR)/lib -lpapi + endif +endif + +ifdef DISABLE_FTZ + COMMON_COMPILATION_FLAGS+=-DDISABLE_FTZ -no-ftz +endif + +BIN=libVectorLoglessPairHMM.so pairhmm-template-main checker +#BIN=checker + +DEPDIR=.deps +DF=$(DEPDIR)/$(*).d + +#Common across libJNI and sandbox +COMMON_SOURCES=utils.cc avx_function_instantiations.cc baseline.cc sse_function_instantiations.cc LoadTimeInitializer.cc +#Part of libJNI +LIBSOURCES=org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.cc Sandbox.cc $(COMMON_SOURCES) +SOURCES=$(LIBSOURCES) pairhmm-template-main.cc pairhmm-1-base.cc +LIBOBJECTS=$(LIBSOURCES:.cc=.o) +COMMON_OBJECTS=$(COMMON_SOURCES:.cc=.o) + + +#No vectorization for these files +NO_VECTOR_SOURCES=org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.cc pairhmm-template-main.cc pairhmm-1-base.cc utils.cc baseline.cc LoadTimeInitializer.cc Sandbox.cc +#Use -xAVX for these files +AVX_SOURCES=avx_function_instantiations.cc +#Use -xSSE4.2 for these files +SSE_SOURCES=sse_function_instantiations.cc + 
+NO_VECTOR_OBJECTS=$(NO_VECTOR_SOURCES:.cc=.o) +AVX_OBJECTS=$(AVX_SOURCES:.cc=.o) +SSE_OBJECTS=$(SSE_SOURCES:.cc=.o) +$(NO_VECTOR_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) +$(AVX_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xAVX +$(SSE_OBJECTS): CXXFLAGS=$(COMMON_COMPILATION_FLAGS) -xSSE4.2 +OBJECTS=$(NO_VECTOR_OBJECTS) $(AVX_OBJECTS) $(SSE_OBJECTS) + +all: $(BIN) Sandbox.class copied_lib + +-include $(addprefix $(DEPDIR)/,$(SOURCES:.cc=.d)) + +checker: pairhmm-1-base.o $(COMMON_OBJECTS) + $(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS) + +pairhmm-template-main: pairhmm-template-main.o $(COMMON_OBJECTS) + $(CXX) $(OMPLFLAGS) -o $@ $^ $(LDFLAGS) + +libVectorLoglessPairHMM.so: $(LIBOBJECTS) + $(CXX) $(OMPLFLAGS) -shared -static-intel -o $@ $(LIBOBJECTS) ${LDFLAGS} + + +$(OBJECTS): %.o: %.cc + @mkdir -p $(DEPDIR) + $(CXX) -c -MMD -MF $(DF) $(CXXFLAGS) $(OUTPUT_OPTION) $< + +Sandbox.class: Sandbox.java + javac Sandbox.java + +copied_lib: libVectorLoglessPairHMM.so +ifdef OUTPUT_DIR + mkdir -p $(OUTPUT_DIR) + rsync -a libVectorLoglessPairHMM.so $(OUTPUT_DIR)/ +endif + +clean: + rm -rf $(BIN) *.o $(DEPDIR) *.class diff --git a/public/VectorPairHMM/src/main/c++/Sandbox.cc b/public/VectorPairHMM/src/main/c++/Sandbox.cc new file mode 100644 index 000000000..985b19ae9 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/Sandbox.cc @@ -0,0 +1,106 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. 
+ +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +#include "Sandbox.h" +#include "org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.h" +#include "utils.h" +#include "jni_common.h" +/* + * Class: Sandbox + * Method: jniGetMachineType + * Signature: ()J + */ +JNIEXPORT jlong JNICALL Java_Sandbox_jniGetMachineType + (JNIEnv * env, jobject thisObj) +{ + return 0; +} + +/* + * Class: Sandbox + * Method: jniInitializeClassFieldsAndMachineMask + * Signature: (Ljava/lang/Class;Ljava/lang/Class;J)V + */ +JNIEXPORT void JNICALL Java_Sandbox_jniInitializeClassFieldsAndMachineMask + (JNIEnv* env, jobject thisObject, jclass readDataHolderClass, jclass haplotypeDataHolderClass, jlong mask) +{ + Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniInitializeClassFieldsAndMachineMask(env, thisObject, readDataHolderClass, + haplotypeDataHolderClass, mask); +} + +/* + * Class: Sandbox + * Method: jniInitializeHaplotypes + * Signature: (I[LSandbox/JNIHaplotypeDataHolderClass;)V + */ +JNIEXPORT void JNICALL Java_Sandbox_jniInitializeHaplotypes + (JNIEnv * env, jobject thisObject, jint numHaplotypes, jobjectArray haplotypeDataArray) +{ + Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniInitializeHaplotypes(env, thisObject, numHaplotypes, haplotypeDataArray); +} + +/* + * Class: Sandbox + * Method: jniFinalizeRegion + * Signature: ()V + */ +JNIEXPORT void JNICALL Java_Sandbox_jniFinalizeRegion + (JNIEnv * env, jobject thisObject) +{ + 
Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniFinalizeRegion(env, thisObject); +} + + +/* + * Class: Sandbox + * Method: jniComputeLikelihoods + * Signature: (II[LSandbox/JNIReadDataHolderClass;[LSandbox/JNIHaplotypeDataHolderClass;[DI)V + */ +JNIEXPORT void JNICALL Java_Sandbox_jniComputeLikelihoods + (JNIEnv* env, jobject thisObject, jint numReads, jint numHaplotypes, + jobjectArray readDataArray, jobjectArray haplotypeDataArray, jdoubleArray likelihoodArray, jint maxNumThreadsToUse) +{ + Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniComputeLikelihoods(env, thisObject, + numReads, numHaplotypes, readDataArray, haplotypeDataArray, likelihoodArray, maxNumThreadsToUse); +} +/* + * Class: Sandbox + * Method: jniClose + * Signature: ()V + */ +JNIEXPORT void JNICALL Java_Sandbox_jniClose + (JNIEnv* env, jobject thisObject) +{ Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniClose(env, thisObject); } + +JNIEXPORT void JNICALL Java_Sandbox_doEverythingNative + (JNIEnv* env, jobject thisObject, jstring fileNameString) +{ + const char* fileName = env->GetStringUTFChars(fileNameString, 0); + char local_array[800]; + strncpy(local_array, fileName, 200); + env->ReleaseStringUTFChars(fileNameString, fileName); + do_compute(local_array, true, 10000, false); +} + diff --git a/public/VectorPairHMM/src/main/c++/Sandbox.h b/public/VectorPairHMM/src/main/c++/Sandbox.h new file mode 100644 index 000000000..486a1c095 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/Sandbox.h @@ -0,0 +1,96 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is 
furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include +/* Header for class Sandbox */ + +#ifndef _Included_Sandbox +#define _Included_Sandbox +#ifdef __cplusplus +extern "C" { +#endif +#undef Sandbox_enableAll +#define Sandbox_enableAll -1LL +/* + * Class: Sandbox + * Method: jniGetMachineType + * Signature: ()J + */ +JNIEXPORT jlong JNICALL Java_Sandbox_jniGetMachineType + (JNIEnv *, jobject); + +/* + * Class: Sandbox + * Method: jniInitializeClassFieldsAndMachineMask + * Signature: (Ljava/lang/Class;Ljava/lang/Class;J)V + */ +JNIEXPORT void JNICALL Java_Sandbox_jniInitializeClassFieldsAndMachineMask + (JNIEnv *, jobject, jclass, jclass, jlong); + +/* + * Class: Sandbox + * Method: jniInitializeHaplotypes + * Signature: (I[LSandbox/JNIHaplotypeDataHolderClass;)V + */ +JNIEXPORT void JNICALL Java_Sandbox_jniInitializeHaplotypes + (JNIEnv *, jobject, jint, jobjectArray); + +/* + * Class: Sandbox + * Method: jniFinalizeRegion + * Signature: ()V + */ +JNIEXPORT void JNICALL Java_Sandbox_jniFinalizeRegion + (JNIEnv *, jobject); + +/* + * Class: Sandbox + * Method: jniComputeLikelihoods + * Signature: (II[LSandbox/JNIReadDataHolderClass;[LSandbox/JNIHaplotypeDataHolderClass;[DI)V + */ +JNIEXPORT void JNICALL Java_Sandbox_jniComputeLikelihoods + (JNIEnv *, jobject, jint, jint, jobjectArray, 
jobjectArray, jdoubleArray, jint); + +/* + * Class: Sandbox + * Method: jniClose + * Signature: ()V + */ +JNIEXPORT void JNICALL Java_Sandbox_jniClose + (JNIEnv *, jobject); + +/* + * Class: Sandbox + * Method: doEverythingNative + * Signature: ([B)V + */ +JNIEXPORT void JNICALL Java_Sandbox_doEverythingNative + (JNIEnv *, jobject, jstring); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/public/VectorPairHMM/src/main/c++/Sandbox.java b/public/VectorPairHMM/src/main/c++/Sandbox.java new file mode 100644 index 000000000..d6b7c2eae --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/Sandbox.java @@ -0,0 +1,305 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +package org.broadinstitute.sting.utils.vectorpairhmm; + +import java.util.List; +import java.util.LinkedList; +import java.util.Map; +import java.util.HashMap; +import java.io.File; +import java.util.Scanner; +import java.io.IOException; +import java.io.FileNotFoundException; +import java.io.InputStreamReader; + +public class Sandbox { + + private long setupTime = 0; + private long computeTime = 0; + //Used to copy references to byteArrays to JNI from reads + protected class JNIReadDataHolderClass { + public byte[] readBases = null; + public byte[] readQuals = null; + public byte[] insertionGOP = null; + public byte[] deletionGOP = null; + public byte[] overallGCP = null; + } + + //Used to copy references to byteArrays to JNI from haplotypes + protected class JNIHaplotypeDataHolderClass { + public byte[] haplotypeBases = null; + } + + /** + * Return 64-bit mask representing machine capabilities + * Bit 0 is LSB, bit 63 MSB + * Bit 0 represents sse4.2 availability + * Bit 1 represents AVX availability + */ + public native long jniGetMachineType(); + public static final long enableAll = 0xFFFFFFFFFFFFFFFFl; + + + /** + * Function to initialize the fields of JNIReadDataHolderClass and JNIHaplotypeDataHolderClass from JVM. + * C++ codegets FieldIDs for these classes once and re-uses these IDs for the remainder of the program. Field IDs do not + * change per JVM session + * @param readDataHolderClass class type of JNIReadDataHolderClass + * @param haplotypeDataHolderClass class type of JNIHaplotypeDataHolderClass + * @param mask mask is a 64 bit integer identical to the one received from jniGetMachineType(). 
Users can disable usage of some hardware features by zeroing some bits in the mask + * */ + private native void jniInitializeClassFieldsAndMachineMask(Class readDataHolderClass, Class haplotypeDataHolderClass, long mask); + + private static Boolean isVectorLoglessPairHMMLibraryLoaded = false; + //The constructor is called only once inside PairHMMLikelihoodCalculationEngine + public Sandbox() { + synchronized(Sandbox.class) { //class-level lock: the Boolean flag is reassigned inside this block, so it cannot serve as a stable monitor + //Load the library and initialize the FieldIDs + if(!isVectorLoglessPairHMMLibraryLoaded) { + System.loadLibrary("VectorLoglessPairHMM"); + isVectorLoglessPairHMMLibraryLoaded = true; + jniInitializeClassFieldsAndMachineMask(JNIReadDataHolderClass.class, JNIHaplotypeDataHolderClass.class, enableAll); //need to do this only once + } + } + } + + private native void jniInitializeHaplotypes(final int numHaplotypes, JNIHaplotypeDataHolderClass[] haplotypeDataArray); + + //Used to transfer data to JNI + //Since the haplotypes are the same for all calls to computeLikelihoods within a region, transfer the haplotypes only once to the JNI per region + public void initialize(final List haplotypes) { + int numHaplotypes = haplotypes.size(); + JNIHaplotypeDataHolderClass[] haplotypeDataArray = new JNIHaplotypeDataHolderClass[numHaplotypes]; + int idx = 0; + for(final JNIHaplotypeDataHolderClass currHaplotype : haplotypes) + { + haplotypeDataArray[idx] = new JNIHaplotypeDataHolderClass(); + haplotypeDataArray[idx].haplotypeBases = currHaplotype.haplotypeBases; + ++idx; + } + jniInitializeHaplotypes(numHaplotypes, haplotypeDataArray); + } + /** + * Tell JNI to release arrays - really important if native code is directly accessing Java memory, if not + * accessing Java memory directly, still important to release memory from C++ + */ + private native void jniFinalizeRegion(); + + + public void finalizeRegion() + { + jniFinalizeRegion(); + } + + /** + * Real compute kernel + */ + private native void jniComputeLikelihoods(int numReads, int
numHaplotypes, JNIReadDataHolderClass[] readDataArray, + JNIHaplotypeDataHolderClass[] haplotypeDataArray, double[] likelihoodArray, int maxNumThreadsToUse); + + public void computeLikelihoods(final List reads, final List haplotypes) { + //System.out.println("Region : "+reads.size()+" x "+haplotypes.size()); + long startTime = System.nanoTime(); + int readListSize = reads.size(); + int numHaplotypes = haplotypes.size(); + int numTestcases = readListSize*numHaplotypes; + JNIReadDataHolderClass[] readDataArray = new JNIReadDataHolderClass[readListSize]; + int idx = 0; + for(JNIReadDataHolderClass read : reads) + { + readDataArray[idx] = new JNIReadDataHolderClass(); + readDataArray[idx].readBases = read.readBases; + readDataArray[idx].readQuals = read.readQuals; + readDataArray[idx].insertionGOP = read.insertionGOP; + readDataArray[idx].deletionGOP = read.deletionGOP; + readDataArray[idx].overallGCP = read.overallGCP; + ++idx; + } + + double[] mLikelihoodArray = new double[readListSize*numHaplotypes]; //to store results + setupTime += (System.nanoTime() - startTime); + //for(reads) + // for(haplotypes) + // compute_full_prob() + jniComputeLikelihoods(readListSize, numHaplotypes, readDataArray, null, mLikelihoodArray, 12); + + computeTime += (System.nanoTime() - startTime); + } + + /** + * Print final profiling information from native code + */ + public native void jniClose(); + public void close() + { + System.out.println("Time spent in setup for JNI call : "+(setupTime*1e-9)+" compute time : "+(computeTime*1e-9)); + jniClose(); + } + + public void parseSandboxFile(String filename) + { + File file = new File(filename); + Scanner input = null; + try + { + input = new Scanner(file); + } + catch(FileNotFoundException e) + { + System.err.println("File "+filename+" cannot be found/read"); + return; + } + int idx = 0; + int numReads = 0; + int numHaplotypes = 0; + int readIdx = 0, testCaseIdx = 0, haplotypeIdx = 0; + LinkedList haplotypeList = new LinkedList(); + 
LinkedList readList = new LinkedList(); + + byte[][] byteArray = new byte[6][]; + boolean firstLine = true; + String[] currTokens = new String[8]; + while(input.hasNextLine()) + { + String line = input.nextLine(); + Scanner lineScanner = new Scanner(line); + idx = 0; + while(lineScanner.hasNext()) + currTokens[idx++] = lineScanner.next(); + if(idx == 0) + break; + assert(idx >= 6); + //start of new region + if(idx == 8) + { + if(!firstLine) + { + initialize(haplotypeList); + computeLikelihoods(readList, haplotypeList); + finalizeRegion(); + } + try + { + numReads = Integer.parseInt(currTokens[6]); + } + catch(NumberFormatException e) + { + numReads = 1; + } + try + { + numHaplotypes = Integer.parseInt(currTokens[7]); + } + catch(NumberFormatException e) + { + numHaplotypes = 1; + } + haplotypeIdx = readIdx = testCaseIdx = 0; + readList.clear(); + haplotypeList.clear(); + } + if(haplotypeIdx < numHaplotypes) + { + JNIHaplotypeDataHolderClass X = new JNIHaplotypeDataHolderClass(); + X.haplotypeBases = currTokens[0].getBytes(); + haplotypeList.add(X); + } + if(testCaseIdx%numHaplotypes == 0) + { + JNIReadDataHolderClass X = new JNIReadDataHolderClass(); + X.readBases = currTokens[1].getBytes(); + for(int i=2;i<6;++i) + { + byteArray[i] = currTokens[i].getBytes(); + for(int j=0;j 0 && readList.size() > 0) + { + initialize(haplotypeList); + computeLikelihoods(readList, haplotypeList); + finalizeRegion(); + } + + close(); + input.close(); + } + + private native void doEverythingNative(String filename); + + public static void main(String[] args) + { + if(args.length <= 0) + { + System.err.println("Needs 1 argument - "); + System.exit(-1); + } + //// Get runtime + //java.lang.Runtime rt = java.lang.Runtime.getRuntime(); + //// Start a new process: UNIX command ls + //String cmd = "/home/karthikg/broad/gsa-unstable/public/c++/VectorPairHMM/checker "+args[0]; + //try + //{ + //System.out.println(cmd); + //java.lang.Process p = rt.exec(cmd); + //try + //{ + //p.waitFor(); + 
//java.io.InputStream is = p.getInputStream(); + //java.io.BufferedReader reader = new java.io.BufferedReader(new InputStreamReader(is)); + //// And print each line + //String s = null; + //while ((s = reader.readLine()) != null) { + //System.out.println(s); + //} + //is.close(); + //} + //catch(InterruptedException e) + //{ + //System.err.println(e); + //} + //} + //catch(IOException e) + //{ + //System.err.println(e); + //} + Sandbox t = new Sandbox(); + //t.doEverythingNative(args[0]); + t.parseSandboxFile(args[0]); + } +} diff --git a/public/VectorPairHMM/src/main/c++/Sandbox_JNIHaplotypeDataHolderClass.h b/public/VectorPairHMM/src/main/c++/Sandbox_JNIHaplotypeDataHolderClass.h new file mode 100644 index 000000000..7f78f0178 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/Sandbox_JNIHaplotypeDataHolderClass.h @@ -0,0 +1,13 @@ +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include +/* Header for class Sandbox_JNIHaplotypeDataHolderClass */ + +#ifndef _Included_Sandbox_JNIHaplotypeDataHolderClass +#define _Included_Sandbox_JNIHaplotypeDataHolderClass +#ifdef __cplusplus +extern "C" { +#endif +#ifdef __cplusplus +} +#endif +#endif diff --git a/public/VectorPairHMM/src/main/c++/Sandbox_JNIReadDataHolderClass.h b/public/VectorPairHMM/src/main/c++/Sandbox_JNIReadDataHolderClass.h new file mode 100644 index 000000000..a9312ff3b --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/Sandbox_JNIReadDataHolderClass.h @@ -0,0 +1,13 @@ +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include +/* Header for class Sandbox_JNIReadDataHolderClass */ + +#ifndef _Included_Sandbox_JNIReadDataHolderClass +#define _Included_Sandbox_JNIReadDataHolderClass +#ifdef __cplusplus +extern "C" { +#endif +#ifdef __cplusplus +} +#endif +#endif diff --git a/public/VectorPairHMM/src/main/c++/avx_function_instantiations.cc b/public/VectorPairHMM/src/main/c++/avx_function_instantiations.cc new file mode 100644 index 000000000..6d90d5070 --- /dev/null +++ 
b/public/VectorPairHMM/src/main/c++/avx_function_instantiations.cc @@ -0,0 +1,44 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + + +#include "template.h" + +#undef SIMD_ENGINE +#undef SIMD_ENGINE_SSE + +#define SIMD_ENGINE avx +#define SIMD_ENGINE_AVX + +#include "define-float.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +#include "define-double.h" +#include "shift_template.c" +#include "pairhmm-template-kernel.cc" + +template double compute_full_prob_avxd(testcase* tc, double* nextlog); +template float compute_full_prob_avxs(testcase* tc, float* nextlog); + diff --git a/public/VectorPairHMM/src/main/c++/baseline.cc b/public/VectorPairHMM/src/main/c++/baseline.cc new file mode 100644 index 000000000..d6085e661 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/baseline.cc @@ -0,0 +1,167 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + + +#include "headers.h" +#include "template.h" +#include "utils.h" +#include "LoadTimeInitializer.h" +using namespace std; + +template +NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log) +{ + int r, c; + int ROWS = tc->rslen + 1; + int COLS = tc->haplen + 1; + + Context ctx; + //#define USE_STACK_ALLOCATION 1 +#ifdef USE_STACK_ALLOCATION + NUMBER M[ROWS][COLS]; + NUMBER X[ROWS][COLS]; + NUMBER Y[ROWS][COLS]; + NUMBER p[ROWS][6]; +#else + //allocate on heap in way that simulates a 2D array. Having a 2D array instead of + //a straightforward array of pointers ensures that all data lies 'close' in memory, increasing + //the chance of being stored together in the cache. Also, prefetchers can learn memory access + //patterns for 2D arrays, not possible for array of pointers + //bool locally_allocated = false; + //NUMBER* common_buffer = 0; + NUMBER* common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6]; + //unsigned curr_size = sizeof(NUMBER)*(3*ROWS*COLS + ROWS*6); + //if(true) + //{ + //common_buffer = new NUMBER[3*ROWS*COLS + ROWS*6]; + //locally_allocated = true; + //} + //else + //common_buffer = (NUMBER*)(g_load_time_initializer.get_buffer()); + //pointers to within the allocated buffer + NUMBER** common_pointer_buffer = new NUMBER*[4*ROWS]; + NUMBER* ptr = common_buffer; + unsigned i = 0; + for(i=0;i<3*ROWS;++i, ptr+=COLS) + common_pointer_buffer[i] = ptr; + for(;i<4*ROWS;++i, ptr+=6) + common_pointer_buffer[i] = ptr; + + NUMBER** M = common_pointer_buffer; + NUMBER** X = M + ROWS; + NUMBER** Y = X + ROWS; + NUMBER** p = Y + ROWS; +#endif + + + p[0][MM] = ctx._(0.0); + p[0][GapM] = ctx._(0.0); + p[0][MX] = ctx._(0.0); + p[0][XX] = ctx._(0.0); + p[0][MY] = ctx._(0.0); + p[0][YY] = ctx._(0.0); + + for (r = 1; r < ROWS; r++) + { + int _i = tc->i[r-1] & 127; + int _d = tc->d[r-1] & 127; + int _c = tc->c[r-1] & 127; + //p[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; + SET_MATCH_TO_MATCH_PROB(p[r][MM], _i, _d); + p[r][GapM] = ctx._(1.0) - 
ctx.ph2pr[_c]; + p[r][MX] = ctx.ph2pr[_i]; + p[r][XX] = ctx.ph2pr[_c]; + p[r][MY] = ctx.ph2pr[_d]; + p[r][YY] = ctx.ph2pr[_c]; + //p[r][MY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_d]; + //p[r][YY] = (r == ROWS - 1) ? ctx._(1.0) : ctx.ph2pr[_c]; + } + for (c = 0; c < COLS; c++) + { + M[0][c] = ctx._(0.0); + X[0][c] = ctx._(0.0); + Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen); + } + + for (r = 1; r < ROWS; r++) + { + M[r][0] = ctx._(0.0); + X[r][0] = X[r-1][0] * p[r][XX]; + Y[r][0] = ctx._(0.0); + } + + NUMBER result = ctx._(0.0); + + for (r = 1; r < ROWS; r++) + for (c = 1; c < COLS; c++) + { + fexcept_t flagp; + char _rs = tc->rs[r-1]; + char _hap = tc->hap[c-1]; + int _q = tc->q[r-1] & 127; + NUMBER distm = ctx.ph2pr[_q]; + if (_rs == _hap || _rs == 'N' || _hap == 'N') + distm = ctx._(1.0) - distm; + else + distm = distm/3; + + + //feclearexcept(FE_ALL_EXCEPT); + M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]); + //STORE_FP_EXCEPTIONS(flagp, exceptions_array); + + //feclearexcept(FE_ALL_EXCEPT); + X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX]; + //STORE_FP_EXCEPTIONS(flagp, exceptions_array); + + //feclearexcept(FE_ALL_EXCEPT); + Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY]; + //STORE_FP_EXCEPTIONS(flagp, exceptions_array); + + //CONVERT_AND_PRINT(M[r][c]); + //CONVERT_AND_PRINT(X[r][c]); + //CONVERT_AND_PRINT(Y[r][c]); + + } + for (c = 0; c < COLS; c++) + { + result += M[ROWS-1][c] + X[ROWS-1][c]; + } + + if (before_last_log != NULL) + *before_last_log = result; + +#ifndef USE_STACK_ALLOCATION + delete[] common_pointer_buffer; + //both buffers were obtained with new[], so they must be released with delete[] + delete[] common_buffer; +#endif + + return result; + //return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT; +} + +template double compute_full_prob(testcase* tc, double* nextbuf); +template float compute_full_prob(testcase* tc, float* nextbuf); + diff --git a/public/VectorPairHMM/src/main/c++/define-double.h
b/public/VectorPairHMM/src/main/c++/define-double.h new file mode 100644 index 000000000..2067d369c --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/define-double.h @@ -0,0 +1,205 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + + +#include + +#ifdef PRECISION +#undef PRECISION +#undef MAIN_TYPE +#undef MAIN_TYPE_SIZE +#undef UNION_TYPE +#undef IF_128 +#undef IF_MAIN_TYPE +#undef SHIFT_CONST1 +#undef SHIFT_CONST2 +#undef SHIFT_CONST3 +#undef _128_TYPE +#undef SIMD_TYPE +#undef AVX_LENGTH +#undef HAP_TYPE +#undef MASK_TYPE +#undef MASK_ALL_ONES + +#undef SET_VEC_ZERO(__vec) +#undef VEC_OR(__v1, __v2) +#undef VEC_ADD(__v1, __v2) +#undef VEC_SUB(__v1, __v2) +#undef VEC_MUL(__v1, __v2) +#undef VEC_DIV(__v1, __v2) +#undef VEC_BLEND(__v1, __v2, __mask) +#undef VEC_BLENDV(__v1, __v2, __maskV) +#undef VEC_CAST_256_128(__v1) +#undef VEC_EXTRACT_128(__v1, __im) +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_SET1_VAL128(__val) +#undef VEC_MOVE(__v1, __val) +#undef VEC_CAST_128_256(__v1) +#undef VEC_INSERT_VAL(__v1, __val, __pos) +#undef VEC_CVT_128_256(__v1) +#undef VEC_SET1_VAL(__val) +#undef VEC_POPCVT_CHAR(__ch) +#undef VEC_LDPOPCVT_CHAR(__addr) +#undef VEC_CMP_EQ(__v1, __v2) +#undef VEC_SET_LSE(__val) +#undef SHIFT_HAP(__v1, __val) +#undef MASK_VEC +#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) +#undef VEC_SHIFT_LEFT_1BIT(__vs) +#undef MASK_ALL_ONES +#undef COMPARE_VECS(__v1, __v2) +#undef _256_INT_TYPE +#undef BITMASK_VEC +#endif + +#define PRECISION d +#define MAIN_TYPE double +#define MAIN_TYPE_SIZE 64 +#define UNION_TYPE mix_D +#define IF_128 IF_128d +#define IF_MAIN_TYPE IF_64 +#define SHIFT_CONST1 8 +#define SHIFT_CONST2 1 +#define SHIFT_CONST3 8 +#define _128_TYPE __m128d +#define SIMD_TYPE __m256d +#define _256_INT_TYPE __m256i +#define AVX_LENGTH 4 +#define HAP_TYPE __m128i +#define MASK_TYPE uint64_t +#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFF +#define MASK_VEC MaskVec_D + +#define SET_VEC_ZERO(__vec) \ + __vec= _mm256_setzero_pd() + +#define VEC_OR(__v1, __v2) \ + _mm256_or_pd(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm256_add_pd(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm256_sub_pd(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm256_mul_pd(__v1, __v2) + 
+#define VEC_DIV(__v1, __v2) \ + _mm256_div_pd(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm256_blend_pd(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm256_blendv_pd(__v1, __v2, __maskV) + +#define VEC_CAST_256_128(__v1) \ + _mm256_castpd256_pd128 (__v1) + +#define VEC_EXTRACT_128(__v1, __im) \ + _mm256_extractf128_pd (__v1, __im) + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi64(__v1, __im) + +#define VEC_SET1_VAL128(__val) \ + _mm_set1_pd(__val) + +#define VEC_MOVE(__v1, __val) \ + _mm_move_sd(__v1, __val) + +#define VEC_CAST_128_256(__v1) \ + _mm256_castpd128_pd256(__v1) + +#define VEC_INSERT_VAL(__v1, __val, __pos) \ + _mm256_insertf128_pd(__v1, __val, __pos) + +#define VEC_CVT_128_256(__v1) \ + _mm256_cvtepi32_pd(__v1) + +#define VEC_SET1_VAL(__val) \ + _mm256_set1_pd(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm256_cvtepi32_pd(_mm_set1_epi32(__ch)) + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm256_cvtepi32_pd(_mm_load_si128((__m128i const *)__addr)) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm256_cmp_pd(__v1, __v2, _CMP_EQ_OQ) + +#define VEC_SET_LSE(__val) \ + _mm256_set_pd(zero, zero, zero, __val); + +#define SHIFT_HAP(__v1, __val) \ + __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castpd128_pd256(__vsLow) ; \ +__vdst = _mm256_insertf128_pd(__vdst, __vsHigh, 1) ; + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi64(__vs, 1) + + +#define COMPARE_VECS(__v1, __v2, __first, __last) { \ + double* ptr1 = (double*) (&__v1) ; \ + double* ptr2 = (double*) (&__v2) ; \ + for (int ei=__first; ei <= __last; ++ei) { \ + if (ptr1[ei] != ptr2[ei]) { \ + std::cout << "Double Mismatch at " << ei << ": " \ + << ptr1[ei] << " vs. 
" << ptr2[ei] << std::endl ; \ + exit(0) ; \ + } \ + } \ +} + +class BitMaskVec_double { + + MASK_VEC low_, high_ ; + SIMD_TYPE combined_ ; + + public: + inline MASK_TYPE& getLowEntry(int index) { + return low_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return high_.masks[index] ; + } + + inline const SIMD_TYPE& getCombinedMask() { + VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ; + return combined_ ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(low_.vec) ; + VEC_SHIFT_LEFT_1BIT(high_.vec) ; + } + +} ; + +#define BITMASK_VEC BitMaskVec_double diff --git a/public/VectorPairHMM/src/main/c++/define-float.h b/public/VectorPairHMM/src/main/c++/define-float.h new file mode 100644 index 000000000..318f78280 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/define-float.h @@ -0,0 +1,206 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + + +#include + +#ifdef PRECISION +#undef PRECISION +#undef MAIN_TYPE +#undef MAIN_TYPE_SIZE +#undef UNION_TYPE +#undef IF_128 +#undef IF_MAIN_TYPE +#undef SHIFT_CONST1 +#undef SHIFT_CONST2 +#undef SHIFT_CONST3 +#undef _128_TYPE +#undef SIMD_TYPE +#undef AVX_LENGTH +#undef HAP_TYPE +#undef MASK_TYPE +#undef MASK_ALL_ONES + +#undef SET_VEC_ZERO(__vec) +#undef VEC_OR(__v1, __v2) +#undef VEC_ADD(__v1, __v2) +#undef VEC_SUB(__v1, __v2) +#undef VEC_MUL(__v1, __v2) +#undef VEC_DIV(__v1, __v2) +#undef VEC_BLEND(__v1, __v2, __mask) +#undef VEC_BLENDV(__v1, __v2, __maskV) +#undef VEC_CAST_256_128(__v1) +#undef VEC_EXTRACT_128(__v1, __im) +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_SET1_VAL128(__val) +#undef VEC_MOVE(__v1, __val) +#undef VEC_CAST_128_256(__v1) +#undef VEC_INSERT_VAL(__v1, __val, __pos) +#undef VEC_CVT_128_256(__v1) +#undef VEC_SET1_VAL(__val) +#undef VEC_POPCVT_CHAR(__ch) +#undef VEC_LDPOPCVT_CHAR(__addr) +#undef VEC_CMP_EQ(__v1, __v2) +#undef VEC_SET_LSE(__val) +#undef SHIFT_HAP(__v1, __val) +#undef MASK_VEC +#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) +#undef VEC_SHIFT_LEFT_1BIT(__vs) +#undef MASK_ALL_ONES +#undef COMPARE_VECS(__v1, __v2) +#undef _256_INT_TYPE +#undef BITMASK_VEC +#endif + +#define PRECISION s + +#define MAIN_TYPE float +#define MAIN_TYPE_SIZE 32 +#define UNION_TYPE mix_F +#define IF_128 IF_128f +#define IF_MAIN_TYPE IF_32 +#define SHIFT_CONST1 12 +#define SHIFT_CONST2 3 +#define SHIFT_CONST3 4 +#define _128_TYPE __m128 +#define SIMD_TYPE __m256 +#define _256_INT_TYPE __m256i +#define AVX_LENGTH 8 +#define HAP_TYPE UNION_TYPE +#define MASK_TYPE uint32_t +#define MASK_ALL_ONES 0xFFFFFFFF +#define MASK_VEC MaskVec_F + +#define SET_VEC_ZERO(__vec) \ + __vec= _mm256_setzero_ps() + +#define VEC_OR(__v1, __v2) \ + _mm256_or_ps(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm256_add_ps(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm256_sub_ps(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm256_mul_ps(__v1, __v2) + +#define 
VEC_DIV(__v1, __v2) \ + _mm256_div_ps(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm256_blend_ps(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm256_blendv_ps(__v1, __v2, __maskV) + +#define VEC_CAST_256_128(__v1) \ + _mm256_castps256_ps128 (__v1) + +#define VEC_EXTRACT_128(__v1, __im) \ + _mm256_extractf128_ps (__v1, __im) + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi32(__v1, __im) + +#define VEC_SET1_VAL128(__val) \ + _mm_set1_ps(__val) + +#define VEC_MOVE(__v1, __val) \ + _mm_move_ss(__v1, __val) + +#define VEC_CAST_128_256(__v1) \ + _mm256_castps128_ps256(__v1) + +#define VEC_INSERT_VAL(__v1, __val, __pos) \ + _mm256_insertf128_ps(__v1, __val, __pos) + +#define VEC_CVT_128_256(__v1) \ + _mm256_cvtepi32_ps(__v1.i) + +#define VEC_SET1_VAL(__val) \ + _mm256_set1_ps(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm256_cvtepi32_ps(_mm256_set1_epi32(__ch)) + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm256_cvtepi32_ps(_mm256_loadu_si256((__m256i const *)__addr)) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm256_cmp_ps(__v1, __v2, _CMP_EQ_OQ) + +#define VEC_SET_LSE(__val) \ + _mm256_set_ps(zero, zero, zero, zero, zero, zero, zero, __val); + +#define SHIFT_HAP(__v1, __val) \ + _vector_shift_lastavxs(__v1, __val.f); + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm256_castps128_ps256(__vsLow) ; \ +__vdst = _mm256_insertf128_ps(__vdst, __vsHigh, 1) ; + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi32(__vs, 1) + +#define COMPARE_VECS(__v1, __v2, __first, __last) { \ + float* ptr1 = (float*) (&__v1) ; \ + float* ptr2 = (float*) (&__v2) ; \ + for (int ei=__first; ei <= __last; ++ei) { \ + if (ptr1[ei] != ptr2[ei]) { \ + std::cout << "Float Mismatch at " << ei << ": " \ + << ptr1[ei] << " vs. 
" << ptr2[ei] << std::endl ; \ + exit(0) ; \ + } \ + } \ +} + +class BitMaskVec_float { + + MASK_VEC low_, high_ ; + SIMD_TYPE combined_ ; + + public: + + inline MASK_TYPE& getLowEntry(int index) { + return low_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return high_.masks[index] ; + } + + inline const SIMD_TYPE& getCombinedMask() { + VEC_SSE_TO_AVX(low_.vecf, high_.vecf, combined_) ; + return combined_ ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(low_.vec) ; + VEC_SHIFT_LEFT_1BIT(high_.vec) ; + } + +} ; + +#define BITMASK_VEC BitMaskVec_float diff --git a/public/VectorPairHMM/src/main/c++/define-sse-double.h b/public/VectorPairHMM/src/main/c++/define-sse-double.h new file mode 100644 index 000000000..2d271a854 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/define-sse-double.h @@ -0,0 +1,173 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifdef PRECISION +#undef PRECISION +#undef MAIN_TYPE +#undef MAIN_TYPE_SIZE +#undef UNION_TYPE +#undef IF_128 +#undef IF_MAIN_TYPE +#undef SHIFT_CONST1 +#undef SHIFT_CONST2 +#undef SHIFT_CONST3 +#undef _128_TYPE +#undef SIMD_TYPE +#undef AVX_LENGTH +#undef HAP_TYPE +#undef MASK_TYPE +#undef MASK_ALL_ONES + +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_INSERT_UNIT(__v1,__ins,__im) +#undef SET_VEC_ZERO(__vec) +#undef VEC_OR(__v1, __v2) +#undef VEC_ADD(__v1, __v2) +#undef VEC_SUB(__v1, __v2) +#undef VEC_MUL(__v1, __v2) +#undef VEC_DIV(__v1, __v2) +#undef VEC_BLEND(__v1, __v2, __mask) +#undef VEC_BLENDV(__v1, __v2, __maskV) +#undef VEC_CAST_256_128(__v1) +#undef VEC_EXTRACT_128(__v1, __im) +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_SET1_VAL128(__val) +#undef VEC_MOVE(__v1, __val) +#undef VEC_CAST_128_256(__v1) +#undef VEC_INSERT_VAL(__v1, __val, __pos) +#undef VEC_CVT_128_256(__v1) +#undef VEC_SET1_VAL(__val) +#undef VEC_POPCVT_CHAR(__ch) +#undef VEC_LDPOPCVT_CHAR(__addr) +#undef VEC_CMP_EQ(__v1, __v2) +#undef VEC_SET_LSE(__val) +#undef SHIFT_HAP(__v1, __val) +#undef MASK_VEC +#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) +#undef VEC_SHIFT_LEFT_1BIT(__vs) +#undef MASK_ALL_ONES +#undef COMPARE_VECS(__v1, __v2) +#undef _256_INT_TYPE +#undef BITMASK_VEC +#endif + +#define SSE +#define PRECISION d + +#define MAIN_TYPE double +#define MAIN_TYPE_SIZE 64 +#define UNION_TYPE mix_D128 +#define IF_128 IF_128d +#define IF_MAIN_TYPE IF_64 +#define SHIFT_CONST1 1 +#define SHIFT_CONST2 8 +#define SHIFT_CONST3 0 +#define _128_TYPE __m128d +#define SIMD_TYPE __m128d +#define _256_INT_TYPE __m128i +#define AVX_LENGTH 2 +#define HAP_TYPE __m128i +#define MASK_TYPE uint64_t +#define MASK_ALL_ONES 0xFFFFFFFFFFFFFFFFL +#define MASK_VEC MaskVec_D + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi64(__v1, __im) + +#define VEC_INSERT_UNIT(__v1,__ins,__im) \ + _mm_insert_epi64(__v1,__ins,__im) + +#define VEC_OR(__v1, __v2) \ + _mm_or_pd(__v1, __v2) + +#define 
VEC_ADD(__v1, __v2) \ + _mm_add_pd(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm_sub_pd(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm_mul_pd(__v1, __v2) + +#define VEC_DIV(__v1, __v2) \ + _mm_div_pd(__v1, __v2) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm_cmpeq_pd(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm_blend_pd(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm_blendv_pd(__v1, __v2, __maskV) + +#define SHIFT_HAP(__v1, __val) \ + __v1 = _mm_insert_epi32(_mm_slli_si128(__v1, 4), __val.i, 0) + +#define VEC_CVT_128_256(__v1) \ + _mm_cvtepi32_pd(__v1) + +#define VEC_SET1_VAL(__val) \ + _mm_set1_pd(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm_cvtepi32_pd(_mm_set1_epi32(__ch)) + +#define VEC_SET_LSE(__val) \ + _mm_set_pd(zero, __val); + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm_cvtepi32_pd(_mm_loadu_si128((__m128i const *)__addr)) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm_castsi128_pd(_mm_set_epi64(__vsHigh, __vsLow)) + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi64(__vs, 1) + + +class BitMaskVec_sse_double { + + MASK_VEC combined_ ; + public: + inline MASK_TYPE& getLowEntry(int index) { + return combined_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return combined_.masks[AVX_LENGTH/2+index] ; + } + + inline const SIMD_TYPE& getCombinedMask() { + return combined_.vecf ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(combined_.vec) ; + } + +} ; + +#define BITMASK_VEC BitMaskVec_sse_double + diff --git a/public/VectorPairHMM/src/main/c++/define-sse-float.h b/public/VectorPairHMM/src/main/c++/define-sse-float.h new file mode 100644 index 000000000..20af947dd --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/define-sse-float.h @@ -0,0 +1,173 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to 
deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +#ifdef PRECISION +#undef PRECISION +#undef MAIN_TYPE +#undef MAIN_TYPE_SIZE +#undef UNION_TYPE +#undef IF_128 +#undef IF_MAIN_TYPE +#undef SHIFT_CONST1 +#undef SHIFT_CONST2 +#undef SHIFT_CONST3 +#undef _128_TYPE +#undef SIMD_TYPE +#undef AVX_LENGTH +#undef HAP_TYPE +#undef MASK_TYPE +#undef MASK_ALL_ONES + +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_INSERT_UNIT(__v1,__ins,__im) +#undef SET_VEC_ZERO(__vec) +#undef VEC_OR(__v1, __v2) +#undef VEC_ADD(__v1, __v2) +#undef VEC_SUB(__v1, __v2) +#undef VEC_MUL(__v1, __v2) +#undef VEC_DIV(__v1, __v2) +#undef VEC_BLEND(__v1, __v2, __mask) +#undef VEC_BLENDV(__v1, __v2, __maskV) +#undef VEC_CAST_256_128(__v1) +#undef VEC_EXTRACT_128(__v1, __im) +#undef VEC_EXTRACT_UNIT(__v1, __im) +#undef VEC_SET1_VAL128(__val) +#undef VEC_MOVE(__v1, __val) +#undef VEC_CAST_128_256(__v1) +#undef VEC_INSERT_VAL(__v1, __val, __pos) +#undef VEC_CVT_128_256(__v1) +#undef VEC_SET1_VAL(__val) +#undef VEC_POPCVT_CHAR(__ch) +#undef VEC_LDPOPCVT_CHAR(__addr) +#undef VEC_CMP_EQ(__v1, __v2) +#undef VEC_SET_LSE(__val) +#undef SHIFT_HAP(__v1, __val) 
+#undef MASK_VEC +#undef VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) +#undef VEC_SHIFT_LEFT_1BIT(__vs) +#undef MASK_ALL_ONES +#undef COMPARE_VECS(__v1, __v2) +#undef _256_INT_TYPE +#undef BITMASK_VEC +#endif + +#define SSE +#define PRECISION s + +#define MAIN_TYPE float +#define MAIN_TYPE_SIZE 32 +#define UNION_TYPE mix_F128 +#define IF_128 IF_128f +#define IF_MAIN_TYPE IF_32 +#define SHIFT_CONST1 3 +#define SHIFT_CONST2 4 +#define SHIFT_CONST3 0 +#define _128_TYPE __m128 +#define SIMD_TYPE __m128 +#define _256_INT_TYPE __m128i +#define AVX_LENGTH 4 +//#define MAVX_COUNT (MROWS+3)/AVX_LENGTH +#define HAP_TYPE UNION_TYPE +#define MASK_TYPE uint32_t +#define MASK_ALL_ONES 0xFFFFFFFF +#define MASK_VEC MaskVec_F + +#define VEC_EXTRACT_UNIT(__v1, __im) \ + _mm_extract_epi32(__v1, __im) + +#define VEC_INSERT_UNIT(__v1,__ins,__im) \ + _mm_insert_epi32(__v1,__ins,__im) + +#define VEC_OR(__v1, __v2) \ + _mm_or_ps(__v1, __v2) + +#define VEC_ADD(__v1, __v2) \ + _mm_add_ps(__v1, __v2) + +#define VEC_SUB(__v1, __v2) \ + _mm_sub_ps(__v1, __v2) + +#define VEC_MUL(__v1, __v2) \ + _mm_mul_ps(__v1, __v2) + +#define VEC_DIV(__v1, __v2) \ + _mm_div_ps(__v1, __v2) + +#define VEC_CMP_EQ(__v1, __v2) \ + _mm_cmpeq_ps(__v1, __v2) + +#define VEC_BLEND(__v1, __v2, __mask) \ + _mm_blend_ps(__v1, __v2, __mask) + +#define VEC_BLENDV(__v1, __v2, __maskV) \ + _mm_blendv_ps(__v1, __v2, __maskV) + +#define SHIFT_HAP(__v1, __val) \ + _vector_shift_lastsses(__v1, __val.f) + +#define VEC_CVT_128_256(__v1) \ + _mm_cvtepi32_ps(__v1.i) + +#define VEC_SET1_VAL(__val) \ + _mm_set1_ps(__val) + +#define VEC_POPCVT_CHAR(__ch) \ + _mm_cvtepi32_ps(_mm_set1_epi32(__ch)) + +#define VEC_SET_LSE(__val) \ + _mm_set_ps(zero, zero, zero, __val); + +#define VEC_LDPOPCVT_CHAR(__addr) \ + _mm_cvtepi32_ps(_mm_loadu_si128((__m128i const *)__addr)) + +#define VEC_SSE_TO_AVX(__vsLow, __vsHigh, __vdst) \ + __vdst = _mm_cvtpi32x2_ps(__vsLow, __vsHigh) + +#define VEC_SHIFT_LEFT_1BIT(__vs) \ + __vs = _mm_slli_epi32(__vs, 1) + 
+class BitMaskVec_sse_float { + + MASK_VEC combined_ ; + + public: + inline MASK_TYPE& getLowEntry(int index) { + return combined_.masks[index] ; + } + inline MASK_TYPE& getHighEntry(int index) { + return combined_.masks[AVX_LENGTH/2+index] ; + } + + inline const SIMD_TYPE& getCombinedMask() { + return combined_.vecf ; + } + + inline void shift_left_1bit() { + VEC_SHIFT_LEFT_1BIT(combined_.vec) ; + } + +} ; + +#define BITMASK_VEC BitMaskVec_sse_float diff --git a/public/VectorPairHMM/src/main/c++/headers.h b/public/VectorPairHMM/src/main/c++/headers.h new file mode 100644 index 000000000..4a0d89b57 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/headers.h @@ -0,0 +1,71 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + + +#ifndef COMMON_HEADERS_H +#define COMMON_HEADERS_H + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern uint64_t exceptions_array[128]; +extern FILE* g_debug_fptr; +#define STORE_FP_EXCEPTIONS(flagp, exceptions_array) \ + fegetexceptflag(&flagp, FE_ALL_EXCEPT | __FE_DENORM); \ + exceptions_array[FE_INVALID] += ((flagp & FE_INVALID)); \ + exceptions_array[__FE_DENORM] += ((flagp & __FE_DENORM) >> 1); \ + exceptions_array[FE_DIVBYZERO] += ((flagp & FE_DIVBYZERO) >> 2); \ + exceptions_array[FE_OVERFLOW] += ((flagp & FE_OVERFLOW) >> 3); \ + exceptions_array[FE_UNDERFLOW] += ((flagp & FE_UNDERFLOW) >> 4); \ + feclearexcept(FE_ALL_EXCEPT | __FE_DENORM); + +#define CONVERT_AND_PRINT(X) \ + g_converter.f = (X); \ + fwrite(&(g_converter.i),4,1,g_debug_fptr); \ + +#endif diff --git a/public/VectorPairHMM/src/main/c++/jni_common.h b/public/VectorPairHMM/src/main/c++/jni_common.h new file mode 100644 index 000000000..23c323246 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/jni_common.h @@ -0,0 +1,58 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. 
+ +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef JNI_COMMON_H +#define JNI_COMMON_H + +#include +/*#define ENABLE_ASSERTIONS 1*/ +#define DO_PROFILING 1 +/*#define DEBUG 1*/ +/*#define DEBUG0_1 1*/ +/*#define DEBUG3 1*/ +/*#define DUMP_TO_SANDBOX 1*/ + + +#define DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY 1 + +#ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY +//Gets direct access to Java arrays +#define GET_BYTE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical +#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical +#define JNI_RO_RELEASE_MODE JNI_ABORT +#define GET_DOUBLE_ARRAY_ELEMENTS env->GetPrimitiveArrayCritical +#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleasePrimitiveArrayCritical + +#else +//Likely makes copy of Java arrays to JNI C++ space +#define GET_BYTE_ARRAY_ELEMENTS env->GetByteArrayElements +#define RELEASE_BYTE_ARRAY_ELEMENTS env->ReleaseByteArrayElements +#define JNI_RO_RELEASE_MODE JNI_ABORT +#define GET_DOUBLE_ARRAY_ELEMENTS env->GetDoubleArrayElements +#define RELEASE_DOUBLE_ARRAY_ELEMENTS env->ReleaseDoubleArrayElements + +#endif //ifdef DIRECT_ACCESS_TO_JAVA_HEAP_MEMORY + +#endif //ifndef JNI_COMMON_H diff --git a/public/VectorPairHMM/src/main/c++/jnidebug.h b/public/VectorPairHMM/src/main/c++/jnidebug.h new file mode 100644 index 000000000..7fcab2a51 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/jnidebug.h @@ -0,0 +1,191 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation 
+*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef JNI_DEBUG_H +#define JNI_DEBUG_H + +template +class DataHolder +{ +#define INIT_MATRIX(X) \ + X = new NUMBER*[m_paddedMaxReadLength]; \ + for(int i=0;i ctx; + for (int r = 1; r <= length;r++) //in original code, r < ROWS (where ROWS = paddedReadLength) + { + int _i = insertionGOP[r-1]; //insertionGOP + int _d = deletionGOP[r-1]; //deletionGOP + int _c = overallGCP[r-1]; //overallGCP + m_transition[r][MM] = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; //lines 161-162 + m_transition[r][GapM] = ctx._(1.0) - ctx.ph2pr[_c]; //line 163 + m_transition[r][MX] = ctx.ph2pr[_i]; //164 + m_transition[r][XX] = ctx.ph2pr[_c]; //165 + m_transition[r][MY] = ctx.ph2pr[_d];//last row seems different, compared to line 166 + m_transition[r][YY] = ctx.ph2pr[_c];//same as above for line 167 + //m_transition[r][MY] = (r == length) ? ctx._(1.0) : ctx.ph2pr[_d];//last row seems different, compared to line 166 + //m_transition[r][YY] = (r == length) ? 
ctx._(1.0) : ctx.ph2pr[_c];//same as above for line 167 +#ifdef DEBUG3 + for(int j=0;j<6;++j) + debug_dump("transitions_jni.txt", to_string(m_transition[r][j]),true); +#endif + } + ++g_num_prob_init; + } + bool m_is_initialized; + int m_readMaxLength; + int m_haplotypeMaxLength; + int m_paddedMaxReadLength; + int m_paddedMaxHaplotypeLength; + NUMBER** m_matchMatrix; + NUMBER** m_insertionMatrix; + NUMBER** m_deletionMatrix; + NUMBER** m_prior; + NUMBER (*m_transition)[6]; +}; +extern DataHolder g_double_dataholder; + +template +NUMBER compute_full_prob(testcase *tc, NUMBER** M, NUMBER** X, NUMBER** Y, NUMBER (*p)[6], + bool do_initialization, jint hapStartIndex, NUMBER *before_last_log = NULL) +{ + int r, c; + int ROWS = tc->rslen + 1; //ROWS = paddedReadLength + int COLS = tc->haplen + 1; //COLS = paddedHaplotypeLength + + Context ctx; + //////NOTES + ////ctx.ph2pr[quality]; //This quantity is QualityUtils.qualToErrorProb(quality) + ////1-ctx.ph2pr[quality]; //This corresponds to QualityUtils.qualToProb(quality); + + //Initialization + if(do_initialization) + { + for (c = 0; c < COLS; c++) + { + M[0][c] = ctx._(0.0); + X[0][c] = ctx._(0.0); + Y[0][c] = ctx.INITIAL_CONSTANT / (tc->haplen); //code from 87-90 in LoglessPairHMM + } + + for (r = 1; r < ROWS; r++) + { + M[r][0] = ctx._(0.0); + //deletionMatrix row 0 in above nest is initialized in the Java code + //However, insertionMatrix column 0 is not initialized in Java code, could it be that + //values are re-used from a previous iteration? 
+ //Why even do this, X[0][0] = 0 from above loop nest, X[idx][0] = 0 from this computation + X[r][0] = X[r-1][0] * p[r][XX]; + Y[r][0] = ctx._(0.0); + } + } + + for (r = 1; r < ROWS; r++) + for (c = hapStartIndex+1; c < COLS; c++) + { + //The following lines correspond to initializePriors() + char _rs = tc->rs[r-1]; //line 137 + char _hap = tc->hap[c-1]; //line 140 + //int _q = tc->q[r-1] & 127; //line 138 - q is the quality (qual), should be byte hence int ANDed with 0xFF + int _q = tc->q[r-1]; //line 138 - q is the quality (qual), should be byte hence int ANDed with 0xFF + NUMBER distm = ctx.ph2pr[_q]; //This quantity is QualityUtils.qualToErrorProb(_q) + //The assumption here is that doNotUseTristateCorrection is true + //TOASK + if (_rs == _hap || _rs == 'N' || _hap == 'N') + distm = ctx._(1.0) - distm; //This is the quantity QualityUtils.qualToProb(qual) + else + distm = distm/3; +#ifdef DEBUG3 + debug_dump("priors_jni.txt",to_string(distm),true); +#endif + + //Computation inside updateCell + M[r][c] = distm * (M[r-1][c-1] * p[r][MM] + X[r-1][c-1] * p[r][GapM] + Y[r-1][c-1] * p[r][GapM]); + X[r][c] = M[r-1][c] * p[r][MX] + X[r-1][c] * p[r][XX]; + Y[r][c] = M[r][c-1] * p[r][MY] + Y[r][c-1] * p[r][YY]; +#ifdef DEBUG3 + debug_dump("matrices_jni.txt",to_string(M[r][c]),true); + debug_dump("matrices_jni.txt",to_string(X[r][c]),true); + debug_dump("matrices_jni.txt",to_string(Y[r][c]),true); +#endif + } + + NUMBER result = ctx._(0.0); + for (c = 0; c < COLS; c++) + result += M[ROWS-1][c] + X[ROWS-1][c]; + + if (before_last_log != NULL) + *before_last_log = result; + +#ifdef DEBUG + debug_dump("return_values_jni.txt",to_string(ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT),true); +#endif + return ctx.LOG10(result) - ctx.LOG10_INITIAL_CONSTANT; +} + +#endif diff --git a/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.cc b/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.cc 
new file mode 100644 index 000000000..8a3f8b5bc --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.cc @@ -0,0 +1,176 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +#include "headers.h" +#include "jni_common.h" +#include "org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM.h" +#include "template.h" +#include "utils.h" +#include "LoadTimeInitializer.h" +#include "jnidebug.h" +DataHolder g_double_dataholder; + +using namespace std; + +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_jniInitialize +(JNIEnv* env, jobject thisObject, + jint readMaxLength, jint haplotypeMaxLength) +{ + static int g_num_init_calls = 0; +#ifdef DEBUG3 + cout << "Entered alloc initialized .. readMaxLength "<GetArrayLength(insertionGOP); +#ifdef DEBUG3 + cout << "Entered initializeProbabilities .. 
length "<GetByteArrayElements(insertionGOP, &is_copy); + jbyte* deletionGOPArray = (env)->GetByteArrayElements(deletionGOP, &is_copy); + jbyte* overallGCPArray = (env)->GetByteArrayElements(overallGCP, &is_copy); +#ifdef DEBUG + if(insertionGOPArray == 0) + cerr << "insertionGOP array not initialized in JNI\n"; + ////assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); + if(deletionGOPArray == 0) + cerr << "deletionGOP array not initialized in JNI\n"; + ////assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); + assert(overallGCPArray && "OverallGCP array not initialized in JNI"); +#endif + + g_double_dataholder.initializeProbabilities(length, insertionGOPArray, deletionGOPArray, overallGCPArray); + + env->ReleaseByteArrayElements(overallGCP, overallGCPArray, JNI_ABORT); + env->ReleaseByteArrayElements(deletionGOP, deletionGOPArray, JNI_ABORT); + env->ReleaseByteArrayElements(insertionGOP, insertionGOPArray, JNI_ABORT); +} + +JNIEXPORT jdouble JNICALL +Java_org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_jniInitializePriorsAndUpdateCells( + JNIEnv* env, jobject thisObject, + jboolean doInitialization, jint paddedReadLength, jint paddedHaplotypeLength, + jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals, + jint hapStartIndex + ) +{ +#ifdef DEBUG3 + cout << "Entered mainCompute .. 
doInitialization "<<(doInitialization == JNI_TRUE)<<" hapStartIndex "<GetByteArrayElements(readBases, &is_copy); + jbyte* haplotypeBasesArray = (env)->GetByteArrayElements(haplotypeBases, &is_copy); + jbyte* readQualsArray = (env)->GetByteArrayElements(readQuals, &is_copy); +#ifdef DEBUG + assert(readBasesArray && "readBasesArray not initialized in JNI"); + assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); + assert(readQualsArray && "readQualsArray not initialized in JNI"); +#endif + testcase tc; + + tc.rslen = paddedReadLength-1; + tc.haplen = paddedHaplotypeLength-1; + + tc.rs = (char*)readBasesArray; + tc.hap = (char*)haplotypeBasesArray; + tc.q = (char*)readQualsArray; //TOASK - q is now char* + + compute_full_prob(&tc, g_double_dataholder.m_matchMatrix, g_double_dataholder.m_insertionMatrix, + g_double_dataholder.m_deletionMatrix, g_double_dataholder.m_transition, + doInitialization == JNI_TRUE, hapStartIndex, NULL); + + env->ReleaseByteArrayElements(readBases, readBasesArray, JNI_ABORT); + env->ReleaseByteArrayElements(haplotypeBases, haplotypeBasesArray, JNI_ABORT); + env->ReleaseByteArrayElements(readQuals, readQualsArray, JNI_ABORT); + return 0.0; +} + +JNIEXPORT jdouble JNICALL +Java_org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10( + JNIEnv* env, jobject thisObject, + jint readLength, jint haplotypeLength, + jbyteArray readBases, jbyteArray haplotypeBases, jbyteArray readQuals, + jbyteArray insertionGOP, jbyteArray deletionGOP, jbyteArray overallGCP, + jint hapStartIndex + ) +{ + jboolean is_copy = JNI_FALSE; + jbyte* readBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readBases, &is_copy); + jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBases, &is_copy); + jbyte* readQualsArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readQuals, &is_copy); + jbyte* insertionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(insertionGOP, &is_copy); + jbyte* deletionGOPArray = 
(jbyte*)GET_BYTE_ARRAY_ELEMENTS(deletionGOP, &is_copy); + jbyte* overallGCPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(overallGCP, &is_copy); +#ifdef DEBUG + assert(readBasesArray && "readBasesArray not initialized in JNI"); + assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); + assert(readQualsArray && "readQualsArray not initialized in JNI"); + assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); + assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); + assert(overallGCPArray && "OverallGCP array not initialized in JNI"); + //assert(readLength < MROWS); +#endif + testcase tc; + tc.rslen = readLength; + tc.haplen = haplotypeLength; + tc.rs = (char*)readBasesArray; + tc.hap = (char*)haplotypeBasesArray; + for(unsigned i=0;i +/* Header for class org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM */ + +#ifndef _Included_org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM +#define _Included_org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM +#ifdef __cplusplus +extern "C" { +#endif +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_TRISTATE_CORRECTION +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_TRISTATE_CORRECTION 3.0 +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_matchToMatch +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_matchToMatch 0L +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_indelToMatch +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_indelToMatch 1L +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_matchToInsertion +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_matchToInsertion 2L +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_insertionToInsertion +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_insertionToInsertion 3L +#undef 
org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_matchToDeletion +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_matchToDeletion 4L +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_deletionToDeletion +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_deletionToDeletion 5L +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_debug +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_debug 0L +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_verify +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_verify 0L +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_debug0_1 +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_debug0_1 0L +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_debug1 +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_debug1 0L +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_debug2 +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_debug2 0L +#undef org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_debug3 +#define org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_debug3 0L +/* + * Class: org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM + * Method: jniInitialize + * Signature: (II)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_jniInitialize + (JNIEnv *, jobject, jint, jint); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM + * Method: jniInitializeProbabilities + * Signature: ([[D[B[B[B)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_jniInitializeProbabilities + (JNIEnv *, jclass, jobjectArray, jbyteArray, jbyteArray, jbyteArray); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM + * 
Method: jniInitializePriorsAndUpdateCells + * Signature: (ZII[B[B[BI)D + */ +JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_jniInitializePriorsAndUpdateCells + (JNIEnv *, jobject, jboolean, jint, jint, jbyteArray, jbyteArray, jbyteArray, jint); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM + * Method: jniSubComputeReadLikelihoodGivenHaplotypeLog10 + * Signature: (II[B[B[B[B[B[BI)D + */ +JNIEXPORT jdouble JNICALL Java_org_broadinstitute_sting_utils_pairhmm_DebugJNILoglessPairHMM_jniSubComputeReadLikelihoodGivenHaplotypeLog10 + (JNIEnv *, jobject, jint, jint, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jbyteArray, jint); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc b/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc new file mode 100644 index 000000000..0b54c8a81 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.cc @@ -0,0 +1,382 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +#include "headers.h" +#include "jni_common.h" +#include "org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.h" +#include "template.h" +#include "utils.h" +#include "LoadTimeInitializer.h" + +using namespace std; + +JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniGetMachineType + (JNIEnv* env, jobject thisObject) +{ + return (jlong)get_machine_capabilities(); +} + +//Should be called only once for the whole Java process - initializes field ids for the classes JNIReadDataHolderClass +//and JNIHaplotypeDataHolderClass +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniInitializeClassFieldsAndMachineMask + (JNIEnv* env, jobject thisObject, jclass readDataHolderClass, jclass haplotypeDataHolderClass, jlong mask) +{ + assert(readDataHolderClass); + assert(haplotypeDataHolderClass); + jfieldID fid; + fid = env->GetFieldID(readDataHolderClass, "readBases", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for readBases"); + g_load_time_initializer.m_readBasesFID = fid; + fid = env->GetFieldID(readDataHolderClass, "readQuals", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for readQuals"); + g_load_time_initializer.m_readQualsFID = fid; + fid = env->GetFieldID(readDataHolderClass, "insertionGOP", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for insertionGOP"); + g_load_time_initializer.m_insertionGOPFID = fid; + fid = env->GetFieldID(readDataHolderClass, "deletionGOP", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for deletionGOP"); + g_load_time_initializer.m_deletionGOPFID = fid; + fid = env->GetFieldID(readDataHolderClass, "overallGCP", "[B"); + assert(fid && "JNI pairHMM: 
Could not get FID for overallGCP"); + g_load_time_initializer.m_overallGCPFID = fid; + + fid = env->GetFieldID(haplotypeDataHolderClass, "haplotypeBases", "[B"); + assert(fid && "JNI pairHMM: Could not get FID for haplotypeBases"); + g_load_time_initializer.m_haplotypeBasesFID = fid; + if(mask != ENABLE_ALL_HARDWARE_FEATURES) + { + cout << "Using user supplied hardware mask to re-initialize function pointers for PairHMM\n"; + initialize_function_pointers((uint64_t)mask); + cout.flush(); + } +} + +//Since the list of haplotypes against which the reads are evaluated in PairHMM is the same for a region, +//transfer the list only once +vector > g_haplotypeBasesArrayVector; +vector g_haplotypeBasesLengths; +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniInitializeHaplotypes + (JNIEnv * env, jobject thisObject, jint numHaplotypes, jobjectArray haplotypeDataArray) +{ + jboolean is_copy = JNI_FALSE; + //To ensure, GET_BYTE_ARRAY_ELEMENTS is invoked only once for each haplotype, store bytearrays in a vector + vector >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector; + haplotypeBasesArrayVector.clear(); + g_haplotypeBasesLengths.clear(); + haplotypeBasesArrayVector.resize(numHaplotypes); + g_haplotypeBasesLengths.resize(numHaplotypes); + jsize haplotypeBasesLength = 0; + for(unsigned j=0;jGetObjectArrayElement(haplotypeDataArray, j); + jbyteArray haplotypeBases = (jbyteArray)env->GetObjectField(haplotypeObject, g_load_time_initializer.m_haplotypeBasesFID); +#ifdef ENABLE_ASSERTIONS + assert(haplotypeBases && ("haplotypeBases is NULL at index : "+to_string(j)+"\n").c_str()); +#endif + //Need a global reference as this will be accessed across multiple JNI calls to JNIComputeLikelihoods() + jbyteArray haplotypeBasesGlobalRef = (jbyteArray)env->NewGlobalRef(haplotypeBases); +#ifdef ENABLE_ASSERTIONS + assert(haplotypeBasesGlobalRef && ("Could not get global ref to haplotypeBases at index : "+to_string(j)+"\n").c_str()); 
+#endif + env->DeleteLocalRef(haplotypeBases); //free the local reference + jbyte* haplotypeBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(haplotypeBasesGlobalRef, &is_copy); + haplotypeBasesLength = env->GetArrayLength(haplotypeBasesGlobalRef); +#ifdef ENABLE_ASSERTIONS + assert(haplotypeBasesArray && "haplotypeBasesArray not initialized in JNI"); + //assert(haplotypeBasesLength < MCOLS); +#endif +#ifdef DEBUG0_1 + cout << "JNI haplotype length "< > >& readBasesArrayVector, vector& tc_array) +{ + jboolean is_copy = JNI_FALSE; + //haplotype vector from earlier store - note the reference to vector, not copying + vector >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector; + unsigned tc_idx = 0; + for(unsigned i=0;iGetObjectArrayElement(readDataArray, i); + jbyteArray readBases = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_readBasesFID); + jbyteArray insertionGOP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_insertionGOPFID); + jbyteArray deletionGOP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_deletionGOPFID); + jbyteArray overallGCP = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_overallGCPFID); + jbyteArray readQuals = (jbyteArray)env->GetObjectField(readObject, g_load_time_initializer.m_readQualsFID); + +#ifdef ENABLE_ASSERTIONS + assert(readBases && ("readBases is NULL at index : "+to_string(i)+"\n").c_str()); + assert(insertionGOP && ("insertionGOP is NULL at index : "+to_string(i)+"\n").c_str()); + assert(deletionGOP && ("deletionGOP is NULL at index : "+to_string(i)+"\n").c_str()); + assert(overallGCP && ("overallGCP is NULL at index : "+to_string(i)+"\n").c_str()); + assert(readQuals && ("readQuals is NULL at index : "+to_string(i)+"\n").c_str()); +#endif + jsize readLength = env->GetArrayLength(readBases); + + jbyte* readBasesArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(readBases, &is_copy); //order of GET-RELEASE is important + jbyte* readQualsArray = 
(jbyte*)GET_BYTE_ARRAY_ELEMENTS(readQuals, &is_copy); + jbyte* insertionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(insertionGOP, &is_copy); + jbyte* deletionGOPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(deletionGOP, &is_copy); + jbyte* overallGCPArray = (jbyte*)GET_BYTE_ARRAY_ELEMENTS(overallGCP, &is_copy); +#ifdef DO_PROFILING + g_load_time_initializer.m_bytes_copied += (is_copy ? readLength*5 : 0); + g_load_time_initializer.update_stat(READ_LENGTH_IDX, readLength); +#endif +#ifdef ENABLE_ASSERTIONS + assert(readBasesArray && "readBasesArray not initialized in JNI"); + assert(readQualsArray && "readQualsArray not initialized in JNI"); + assert(insertionGOPArray && "insertionGOP array not initialized in JNI"); + assert(deletionGOPArray && "deletionGOP array not initialized in JNI"); + assert(overallGCPArray && "overallGCP array not initialized in JNI"); + //assert(readLength < MROWS); + assert(readLength == env->GetArrayLength(readQuals)); + assert(readLength == env->GetArrayLength(insertionGOP)); + assert(readLength == env->GetArrayLength(deletionGOP)); + assert(readLength == env->GetArrayLength(overallGCP)); +#endif +#ifdef DEBUG0_1 + cout << "JNI read length "<& tc_array, unsigned numTestCases, double* likelihoodDoubleArray, + unsigned maxNumThreadsToUse) +{ +#ifdef DO_REPEAT_PROFILING + for(unsigned i=0;i<10;++i) +#endif + { +#pragma omp parallel for schedule (dynamic,10000) num_threads(maxNumThreadsToUse) + for(unsigned tc_idx=0;tc_idx > >& readBasesArrayVector) +{ + //Release read arrays first + for(int i=readBasesArrayVector.size()-1;i>=0;--i)//note the order - reverse of GET + { + for(int j=readBasesArrayVector[i].size()-1;j>=0;--j) + RELEASE_BYTE_ARRAY_ELEMENTS(readBasesArrayVector[i][j].first, readBasesArrayVector[i][j].second, JNI_RO_RELEASE_MODE); + readBasesArrayVector[i].clear(); + } + readBasesArrayVector.clear(); +} + + +#ifdef DO_WARMUP +uint64_t g_sum = 0; +#endif +//JNI function to invoke compute_full_prob_avx +//readDataArray - array of 
JNIReadDataHolderClass objects which contain the readBases, readQuals etc +//haplotypeDataArray - array of JNIHaplotypeDataHolderClass objects which contain the haplotypeBases +//likelihoodArray - array of doubles to return results back to Java. Memory allocated by Java prior to JNI call +//maxNumThreadsToUse - Max number of threads that OpenMP can use for the HMM computation +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniComputeLikelihoods + (JNIEnv* env, jobject thisObject, jint numReads, jint numHaplotypes, + jobjectArray readDataArray, jobjectArray haplotypeDataArray, jdoubleArray likelihoodArray, jint maxNumThreadsToUse) +{ +#ifdef DEBUG0_1 + cout << "JNI numReads "< tc_array; + tc_array.clear(); + tc_array.resize(numTestCases); + //Store read arrays for release later + vector > > readBasesArrayVector; + readBasesArrayVector.clear(); + readBasesArrayVector.resize(numReads); +#ifdef DUMP_TO_SANDBOX + g_load_time_initializer.open_sandbox(); +#endif +#ifdef DO_PROFILING + get_time(&start_time); +#endif + //Copy byte array references from Java memory into vector of testcase structs + Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniInitializeTestcasesVector(env, + numReads, numHaplotypes, readDataArray, readBasesArrayVector, tc_array); +#ifdef DO_PROFILING + g_load_time_initializer.m_data_transfer_time += diff_time(start_time); +#endif + + //Get double array where results are stored (to pass back to java) + jdouble* likelihoodDoubleArray = (jdouble*)GET_DOUBLE_ARRAY_ELEMENTS(likelihoodArray, &is_copy); +#ifdef ENABLE_ASSERTIONS + assert(likelihoodDoubleArray && "likelihoodArray is NULL"); + assert(env->GetArrayLength(likelihoodArray) == numTestCases); +#endif +#ifdef DO_WARMUP //ignore - only for crazy profiling + vector >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector; + for(unsigned i=0;iGetArrayLength(haplotypeBasesArrayVector[i].first); + for(unsigned 
j=0;jGetArrayLength(readBasesArrayVector[i][j].first); + for(unsigned k=0;k >& haplotypeBasesArrayVector = g_haplotypeBasesArrayVector; + //Now release haplotype arrays + for(int j=haplotypeBasesArrayVector.size()-1;j>=0;--j) //note the order - reverse of GET + { + RELEASE_BYTE_ARRAY_ELEMENTS(haplotypeBasesArrayVector[j].first, haplotypeBasesArrayVector[j].second, JNI_RO_RELEASE_MODE); + env->DeleteGlobalRef(haplotypeBasesArrayVector[j].first); //free the global reference + } + haplotypeBasesArrayVector.clear(); + g_haplotypeBasesLengths.clear(); +} + + +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniClose + (JNIEnv* env, jobject thisObject) +{ +#ifdef DO_PROFILING + g_load_time_initializer.print_profiling(); +#endif +#ifdef DEBUG + g_load_time_initializer.debug_close(); +#endif +} + diff --git a/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.h b/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.h new file mode 100644 index 000000000..d820b4b26 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM.h @@ -0,0 +1,104 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. 
+ +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include +/* Header for class org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM */ + +#ifndef _Included_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM +#define _Included_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM +#ifdef __cplusplus +extern "C" { +#endif +#undef org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_TRISTATE_CORRECTION +#define org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_TRISTATE_CORRECTION 3.0 +#undef org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_matchToMatch +#define org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_matchToMatch 0L +#undef org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_indelToMatch +#define org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_indelToMatch 1L +#undef org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_matchToInsertion +#define org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_matchToInsertion 2L +#undef org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_insertionToInsertion +#define org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_insertionToInsertion 3L +#undef org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_matchToDeletion +#define org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_matchToDeletion 4L +#undef org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_deletionToDeletion +#define 
org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_deletionToDeletion 5L +#undef org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_sse42Mask +#define org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_sse42Mask 1LL +#undef org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_avxMask +#define org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_avxMask 2LL +/* + * Class: org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM + * Method: jniGetMachineType + * Signature: ()J + */ +JNIEXPORT jlong JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniGetMachineType + (JNIEnv *, jobject); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM + * Method: jniClose + * Signature: ()V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniClose + (JNIEnv *, jobject); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM + * Method: jniInitializeClassFieldsAndMachineMask + * Signature: (Ljava/lang/Class;Ljava/lang/Class;J)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniInitializeClassFieldsAndMachineMask + (JNIEnv *, jobject, jclass, jclass, jlong); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM + * Method: jniInitializeHaplotypes + * Signature: (I[Lorg/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM/JNIHaplotypeDataHolderClass;)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniInitializeHaplotypes + (JNIEnv *, jobject, jint, jobjectArray); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM + * Method: jniFinalizeRegion + * Signature: ()V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniFinalizeRegion + (JNIEnv *, jobject); + +/* + * Class: org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM + * Method: 
jniComputeLikelihoods + * Signature: (II[Lorg/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM/JNIReadDataHolderClass;[Lorg/broadinstitute/sting/utils/pairhmm/VectorLoglessPairHMM/JNIHaplotypeDataHolderClass;[DI)V + */ +JNIEXPORT void JNICALL Java_org_broadinstitute_sting_utils_pairhmm_VectorLoglessPairHMM_jniComputeLikelihoods + (JNIEnv *, jobject, jint, jint, jobjectArray, jobjectArray, jdoubleArray, jint); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/public/VectorPairHMM/src/main/c++/pairhmm-1-base.cc b/public/VectorPairHMM/src/main/c++/pairhmm-1-base.cc new file mode 100644 index 000000000..7ff219b88 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/pairhmm-1-base.cc @@ -0,0 +1,70 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + + +//#define DEBUG 1 +//#define DEBUG0_1 1 +//#define DEBUG3 1 +#include "headers.h" +#include "utils.h" +#include "LoadTimeInitializer.h" +using namespace std; + +int main(int argc, char** argv) +{ +#define BATCH_SIZE 10000 + if(argc < 2) + { + cerr << "Needs path to input file as argument\n"; + exit(0); + } + bool use_old_read_testcase = false; + if(argc >= 3 && string(argv[2]) == "1") + use_old_read_testcase = true; + unsigned chunk_size = 10000; + bool do_check = true; + uint64_t mask = ~(0ull); + for(int i=3;i +#include +#include + + +void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]) { + + const int maskBitCnt = MAIN_TYPE_SIZE ; + + for (int vi=0; vi < numMaskVecs; ++vi) { + for (int rs=0; rs < NUM_DISTINCT_CHARS; ++rs) { + maskArr[vi][rs] = 0 ; + } + maskArr[vi][AMBIG_CHAR] = MASK_ALL_ONES ; + } + + for (int col=1; col < COLS; ++col) { + int mIndex = (col-1) / maskBitCnt ; + int mOffset = (col-1) % maskBitCnt ; + MASK_TYPE bitMask = ((MASK_TYPE)0x1) << (maskBitCnt-1-mOffset) ; + + char hapChar = ConvertChar::get(tc.hap[col-1]); + + if (hapChar == AMBIG_CHAR) { + for (int ci=0; ci < NUM_DISTINCT_CHARS; ++ci) + maskArr[mIndex][ci] |= bitMask ; + } + + maskArr[mIndex][hapChar] |= bitMask ; + // bit corresponding to col 1 will be the MSB of the mask 0 + // bit corresponding to col 2 will be the MSB-1 of the mask 0 + // ... + // bit corresponding to col 32 will be the LSB of the mask 0 + // bit corresponding to col 33 will be the MSB of the mask 1 + // ... 
+ } + +} + +void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess) { + + for (int ri=0; ri < numRowsToProcess; ++ri) { + rsArr[ri] = ConvertChar::get(tc.rs[ri+beginRowIndex-1]) ; + } + + for (int ei=0; ei < AVX_LENGTH; ++ei) { + lastMaskShiftOut[ei] = 0 ; + } +} + +#define SET_MASK_WORD(__dstMask, __srcMask, __lastShiftOut, __shiftBy, __maskBitCnt){ \ + MASK_TYPE __bitMask = (((MASK_TYPE)0x1) << __shiftBy) - 1 ; \ + MASK_TYPE __nextShiftOut = (__srcMask & __bitMask) << (__maskBitCnt - __shiftBy) ; \ + __dstMask = (__srcMask >> __shiftBy) | __lastShiftOut ; \ + __lastShiftOut = __nextShiftOut ; \ +} + + +void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, BITMASK_VEC& bitMaskVec, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, int maskBitCnt) { + + for (int ei=0; ei < AVX_LENGTH/2; ++ei) { + SET_MASK_WORD(bitMaskVec.getLowEntry(ei), maskArr[maskIndex][rsArr[ei]], + lastMaskShiftOut[ei], ei, maskBitCnt) ; + + int ei2 = ei + AVX_LENGTH/2 ; // the second entry index + SET_MASK_WORD(bitMaskVec.getHighEntry(ei), maskArr[maskIndex][rsArr[ei2]], + lastMaskShiftOut[ei2], ei2, maskBitCnt) ; + } + +} + + +inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (BITMASK_VEC& bitMaskVec, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen) { + + distmChosen = VEC_BLENDV(distm, _1_distm, bitMaskVec.getCombinedMask()) ; + + bitMaskVec.shift_left_1bit() ; +} + +/* + * This function: + * 1- Intializes probability values p_MM, p_XX, P_YY, p_MX, p_GAPM and pack them into vectors (SSE or AVX) + * 2- Precompute parts of "distm" which only depeneds on a row number and pack it into vector + */ + +template void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, SIMD_TYPE *p_MM, 
SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D) +{ + NUMBER zero = ctx._(0.0); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + for (int s=0;si[r-1] & 127; + int _d = tc->d[r-1] & 127; + int _c = tc->c[r-1] & 127; + + //*(ptr_p_MM+r-1) = ctx._(1.0) - ctx.ph2pr[(_i + _d) & 127]; + SET_MATCH_TO_MATCH_PROB(*(ptr_p_MM+r-1), _i, _d); + *(ptr_p_GAPM+r-1) = ctx._(1.0) - ctx.ph2pr[_c]; + *(ptr_p_MX+r-1) = ctx.ph2pr[_i]; + *(ptr_p_XX+r-1) = ctx.ph2pr[_c]; + *(ptr_p_MY+r-1) = ctx.ph2pr[_d]; + *(ptr_p_YY+r-1) = ctx.ph2pr[_c]; + } + + NUMBER *ptr_distm1D = (NUMBER *)distm1D; + for (int r = 1; r < ROWS; r++) + { + int _q = tc->q[r-1] & 127; + ptr_distm1D[r-1] = ctx.ph2pr[_q]; + } +} + +/* + * This function handles pre-stripe computation: + * 1- Retrieve probaility vectors from memory + * 2- Initialize M, X, Y vectors with all 0's (for the first stripe) and shifting the last row from previous stripe for the rest + */ + +template inline void CONCAT(CONCAT(stripeINITIALIZATION,SIMD_ENGINE), PRECISION)( + int stripeIdx, Context ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY, + SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm, SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM , + SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, + UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM) +{ + int i = stripeIdx; + pGAPM = p_GAPM[i]; + pMM = p_MM[i]; + pMX = p_MX[i]; + pXX = p_XX[i]; + pMY = p_MY[i]; + pYY = p_YY[i]; + + NUMBER zero = ctx._(0.0); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); + UNION_TYPE packed3; packed3.d = VEC_SET1_VAL(3.0); + + distm = distm1D[i]; + _1_distm = VEC_SUB(packed1.d, distm); + + 
distm = VEC_DIV(distm, packed3.d); + + /* initialize M_t_2, M_t_1, X_t_2, X_t_1, Y_t_2, Y_t_1 */ + M_t_2.d = VEC_SET1_VAL(zero); + X_t_2.d = VEC_SET1_VAL(zero); + + if (i==0) { + M_t_1.d = VEC_SET1_VAL(zero); + X_t_1.d = VEC_SET1_VAL(zero); + Y_t_2.d = VEC_SET_LSE(init_Y); + Y_t_1.d = VEC_SET1_VAL(zero); + } + else { + X_t_1.d = VEC_SET_LSE(shiftOutX[AVX_LENGTH]); + M_t_1.d = VEC_SET_LSE(shiftOutM[AVX_LENGTH]); + Y_t_2.d = VEC_SET1_VAL(zero); + Y_t_1.d = VEC_SET1_VAL(zero); + } + M_t_1_y = M_t_1; +} + +/* + * This function is the main compute kernel to compute M, X and Y + */ + +inline void CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, + UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, + SIMD_TYPE pMM, SIMD_TYPE pGAPM, SIMD_TYPE pMX, SIMD_TYPE pXX, SIMD_TYPE pMY, SIMD_TYPE pYY, SIMD_TYPE distmSel) +{ + /* Compute M_t <= distm * (p_MM*M_t_2 + p_GAPM*X_t_2 + p_GAPM*Y_t_2) */ + M_t.d = VEC_MUL(VEC_ADD(VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(X_t_2.d, pGAPM)), VEC_MUL(Y_t_2.d, pGAPM)), distmSel); + //M_t.d = VEC_MUL( VEC_ADD(VEC_MUL(M_t_2.d, pMM), VEC_MUL(VEC_ADD(X_t_2.d, Y_t_2.d), pGAPM)), distmSel); + + M_t_y = M_t; + + /* Compute X_t */ + X_t.d = VEC_ADD(VEC_MUL(M_t_1.d, pMX) , VEC_MUL(X_t_1.d, pXX)); + + /* Compute Y_t */ + Y_t.d = VEC_ADD(VEC_MUL(M_t_1_y.d, pMY) , VEC_MUL(Y_t_1.d, pYY)); +} + +/* + * This is the main compute function. It operates on the matrix in s stripe manner. + * The stripe height is determined by the SIMD engine type. + * Stripe height: "AVX float": 8, "AVX double": 4, "SSE float": 4, "SSE double": 2 + * For each stripe the operations are anti-diagonal based. + * Each anti-diagonal (M_t, Y_t, X_t) depends on the two previous anti-diagonals (M_t_2, X_t_2, Y_t_2, M_t_1, X_t_1, Y_t_1). + * Each stripe (except the fist one) depends on the last row of the previous stripe. 
+ * The last stripe computation handles the addition of the last row of M and X, that's the reason for loop spliting. + */ + +template NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL) +{ + int ROWS = tc->rslen + 1; + int COLS = tc->haplen + 1; + int MAVX_COUNT = (ROWS+AVX_LENGTH-1)/AVX_LENGTH; + + /* Probaility arrays */ + SIMD_TYPE p_MM [MAVX_COUNT], p_GAPM [MAVX_COUNT], p_MX [MAVX_COUNT]; + SIMD_TYPE p_XX [MAVX_COUNT], p_MY [MAVX_COUNT], p_YY [MAVX_COUNT]; + + /* For distm precomputation */ + SIMD_TYPE distm1D[MAVX_COUNT]; + + /* Carries the values from each stripe to the next stripe */ + NUMBER shiftOutM[ROWS+COLS+AVX_LENGTH], shiftOutX[ROWS+COLS+AVX_LENGTH], shiftOutY[ROWS+COLS+AVX_LENGTH]; + + /* The vector to keep the anti-diagonals of M, X, Y*/ + /* Current: M_t, X_t, Y_t */ + /* Previous: M_t_1, X_t_1, Y_t_1 */ + /* Previous to previous: M_t_2, X_t_2, Y_t_2 */ + UNION_TYPE M_t, M_t_1, M_t_2, X_t, X_t_1, X_t_2, Y_t, Y_t_1, Y_t_2, M_t_y, M_t_1_y; + + /* Probality vectors */ + SIMD_TYPE pGAPM, pMM, pMX, pXX, pMY, pYY; + + struct timeval start, end; + NUMBER result_avx2; + Context ctx; + UNION_TYPE rs , rsN; + HAP_TYPE hap; + SIMD_TYPE distmSel, distmChosen ; + SIMD_TYPE distm, _1_distm; + + int r, c; + NUMBER zero = ctx._(0.0); + UNION_TYPE packed1; packed1.d = VEC_SET1_VAL(1.0); + SIMD_TYPE N_packed256 = VEC_POPCVT_CHAR('N'); + NUMBER init_Y = ctx.INITIAL_CONSTANT / (tc->haplen); + int remainingRows = (ROWS-1) % AVX_LENGTH; + int stripe_cnt = ((ROWS-1) / AVX_LENGTH) + (remainingRows!=0); + + const int maskBitCnt = MAIN_TYPE_SIZE ; + const int numMaskVecs = (COLS+ROWS+maskBitCnt-1)/maskBitCnt ; // ceil function + + /* Mask precomputation for distm*/ + MASK_TYPE maskArr[numMaskVecs][NUM_DISTINCT_CHARS] ; + CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(*tc, COLS, numMaskVecs, maskArr) ; + + char rsArr[AVX_LENGTH] ; + MASK_TYPE lastMaskShiftOut[AVX_LENGTH] ; + + /* Precompute initialization 
for probabilities and shift vector*/ + CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(ROWS, COLS, shiftOutM, shiftOutX, shiftOutY, + ctx, tc, p_MM, p_GAPM, p_MX, p_XX, p_MY, p_YY, distm1D); + + for (int i=0;i(&tc[b]); + +#ifdef RUN_HYBRID +#define MIN_ACCEPTED 1e-28f + if (result_avxf < MIN_ACCEPTED) { + count++; + result_avxd = CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), d)(&tc[b]); + result[b] = log10(result_avxd) - log10(ldexp(1.0, 1020.f)); + } + else + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); +#endif + +#ifndef RUN_HYBRID + result[b] = log10f(result_avxf) - log10f(ldexpf(1.f, 120.f)); +#endif + } + aggregateTimeCompute += (getCurrClk() - lastClk) ; + lastClk = getCurrClk() ; + for (int b=0;b(testcase* tc, double* nextlog); +template float compute_full_prob_sses(testcase* tc, float* nextlog); diff --git a/public/VectorPairHMM/src/main/c++/template.h b/public/VectorPairHMM/src/main/c++/template.h new file mode 100644 index 000000000..ce4dbfc86 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/template.h @@ -0,0 +1,320 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef TEMPLATES_H_ +#define TEMPLATES_H_ + +#include "headers.h" + +#define MM 0 +#define GapM 1 +#define MX 2 +#define XX 3 +#define MY 4 +#define YY 5 + +//#define MROWS 500 +//#define MCOLS 1000 + +#define CAT(X,Y) X####Y +#define CONCAT(X,Y) CAT(X,Y) + +#define ALIGNED __attribute__((aligned(32))) + +typedef union __attribute__((aligned(32))) { + ALIGNED __m256 ALIGNED d; + ALIGNED __m128i ALIGNED s[2]; + ALIGNED float ALIGNED f[8]; + ALIGNED __m256i ALIGNED i; +} ALIGNED mix_F ALIGNED; + +typedef union __attribute__((aligned(32))) { + ALIGNED __m128 ALIGNED d; + ALIGNED __m64 ALIGNED s[2]; + ALIGNED float ALIGNED f[4]; + ALIGNED __m128i ALIGNED i; +} ALIGNED mix_F128 ALIGNED; + +typedef union ALIGNED { + __m128i vec ; + __m128 vecf ; + uint32_t masks[4] ; +} MaskVec_F ; + +typedef union ALIGNED { + __m64 vec ; + __m64 vecf ; + uint32_t masks[2] ; +} MaskVec_F128 ; + +typedef union ALIGNED +{ + ALIGNED __m128i ALIGNED i; + ALIGNED __m128 ALIGNED f; +} ALIGNED IF_128f ALIGNED; + +typedef union ALIGNED +{ + ALIGNED int ALIGNED i; + ALIGNED float ALIGNED f; +} ALIGNED IF_32 ALIGNED; + +typedef union __attribute__((aligned(32))) { + ALIGNED __m256d ALIGNED d; + ALIGNED __m128i ALIGNED s[2]; + ALIGNED double ALIGNED f[4]; + ALIGNED __m256i ALIGNED i; +} ALIGNED mix_D ALIGNED; + +typedef union __attribute__((aligned(32))) { + ALIGNED __m128d ALIGNED d; + ALIGNED __m64 ALIGNED s[2]; + ALIGNED double ALIGNED f[2]; + ALIGNED __m128i ALIGNED i; +} ALIGNED mix_D128 ALIGNED; + +typedef union ALIGNED { + __m128i vec ; + __m128d vecf ; + uint64_t masks[2] ; +} MaskVec_D ; + +typedef union ALIGNED { + __m64 vec ; + __m64 vecf ; + uint64_t masks[1] ; +} MaskVec_D128 ; + +typedef union ALIGNED +{ + 
ALIGNED __m128i ALIGNED i; + ALIGNED __m128d ALIGNED f; +} ALIGNED IF_128d ALIGNED; + +typedef union ALIGNED +{ + ALIGNED int64_t ALIGNED i; + ALIGNED double ALIGNED f; +} ALIGNED IF_64 ALIGNED; + + +#define MAX_QUAL 254 +#define MAX_JACOBIAN_TOLERANCE 8.0 +#define JACOBIAN_LOG_TABLE_STEP 0.0001 +#define JACOBIAN_LOG_TABLE_INV_STEP (1.0 / JACOBIAN_LOG_TABLE_STEP) +#define MAXN 70000 +#define LOG10_CACHE_SIZE (4*MAXN) // we need to be able to go up to 2*(2N) when calculating some of the coefficients +#define JACOBIAN_LOG_TABLE_SIZE ((int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1) + +template +struct ContextBase +{ + public: + NUMBER ph2pr[128]; + NUMBER INITIAL_CONSTANT; + NUMBER LOG10_INITIAL_CONSTANT; + NUMBER RESULT_THRESHOLD; + + static bool staticMembersInitializedFlag; + static NUMBER jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE]; + static NUMBER matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1]; + + static void initializeStaticMembers() + { + if(!staticMembersInitializedFlag) + { + //Order of calls important - Jacobian first, then MatchToMatch + initializeJacobianLogTable(); + initializeMatchToMatchProb(); + staticMembersInitializedFlag = true; + } + } + + static void deleteStaticMembers() + { + if(staticMembersInitializedFlag) + { + staticMembersInitializedFlag = false; + } + } + + //Called only once during library load - don't bother to optimize with single precision fp + static void initializeJacobianLogTable() + { + for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { + jacobianLogTable[k] = (NUMBER)(log10(1.0 + pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP))); + } + } + + //Called only once per library load - don't bother optimizing with single fp + static void initializeMatchToMatchProb() + { + double LN10 = log(10); + double INV_LN10 = 1.0/LN10; + for (int i = 0, offset = 0; i <= MAX_QUAL; offset += ++i) + for (int j = 0; j <= i; j++) { + double log10Sum = approximateLog10SumLog10(-0.1*i, -0.1*j); + double matchToMatchLog10 = + 
log1p(-std::min(1.0,pow(10,log10Sum))) * INV_LN10; + matchToMatchProb[offset + j] = (NUMBER)(pow(10,matchToMatchLog10)); + } + } + //Called during computation - use single precision where possible + static int fastRound(NUMBER d) { + return (d > ((NUMBER)0.0)) ? (int) (d + ((NUMBER)0.5)) : (int) (d - ((NUMBER)0.5)); + } + //Called during computation - use single precision where possible + static NUMBER approximateLog10SumLog10(NUMBER small, NUMBER big) { + // make sure small is really the smaller value + if (small > big) { + NUMBER t = big; + big = small; + small = t; + } + + if (isinf(small) == -1 || isinf(big) == -1) + return big; + + NUMBER diff = big - small; + if (diff >= ((NUMBER)MAX_JACOBIAN_TOLERANCE)) + return big; + + // OK, so |y-x| < tol: we use the following identity then: + // we need to compute log10(10^x + 10^y) + // By Jacobian logarithm identity, this is equal to + // max(x,y) + log10(1+10^-abs(x-y)) + // we compute the second term as a table lookup with integer quantization + // we have pre-stored correction for 0,0.1,0.2,... 
10.0 + int ind = fastRound((NUMBER)(diff * ((NUMBER)JACOBIAN_LOG_TABLE_INV_STEP))); // hard rounding + return big + jacobianLogTable[ind]; + } +}; + +template +struct Context : public ContextBase +{}; + +template<> +struct Context : public ContextBase +{ + Context():ContextBase() + { + for (int x = 0; x < 128; x++) + ph2pr[x] = pow(10.0, -((double)x) / 10.0); + + INITIAL_CONSTANT = ldexp(1.0, 1020.0); + LOG10_INITIAL_CONSTANT = log10(INITIAL_CONSTANT); + RESULT_THRESHOLD = 0.0; + } + + double LOG10(double v){ return log10(v); } + inline double POW(double b, double e) { return pow(b,e); } + + static double _(double n){ return n; } + static double _(float n){ return ((double) n); } +}; + +template<> +struct Context : public ContextBase +{ + Context() : ContextBase() + { + for (int x = 0; x < 128; x++) + { + ph2pr[x] = powf(10.f, -((float)x) / 10.f); + } + + INITIAL_CONSTANT = ldexpf(1.f, 120.f); + LOG10_INITIAL_CONSTANT = log10f(INITIAL_CONSTANT); + RESULT_THRESHOLD = ldexpf(1.f, -110.f); + } + + float LOG10(float v){ return log10f(v); } + inline float POW(float b, float e) { return powf(b,e); } + + static float _(double n){ return ((float) n); } + static float _(float n){ return n; } +}; + +#define SET_MATCH_TO_MATCH_PROB(output, insQual, delQual) \ +{ \ + int minQual = delQual; \ + int maxQual = insQual; \ + if (insQual <= delQual) \ + { \ + minQual = insQual; \ + maxQual = delQual; \ + } \ + (output) = (MAX_QUAL < maxQual) ? 
\ + ((NUMBER)1.0) - ctx.POW(((NUMBER)10), ctx.approximateLog10SumLog10(((NUMBER)-0.1)*minQual, ((NUMBER)-0.1)*maxQual)) \ + : ctx.matchToMatchProb[((maxQual * (maxQual + 1)) >> 1) + minQual]; \ +} + +typedef struct +{ + int rslen, haplen; + /*int *q, *i, *d, *c;*/ + /*int q[MROWS], i[MROWS], d[MROWS], c[MROWS];*/ + char *q, *i, *d, *c; + char *hap, *rs; + int *ihap; + int *irs; +} testcase; + +int normalize(char c); +int read_testcase(testcase *tc, FILE* ifp=0); + + +#define MIN_ACCEPTED 1e-28f +#define NUM_DISTINCT_CHARS 5 +#define AMBIG_CHAR 4 + +class ConvertChar { + + static uint8_t conversionTable[255] ; + +public: + + static void init() { + assert (NUM_DISTINCT_CHARS == 5) ; + assert (AMBIG_CHAR == 4) ; + + conversionTable['A'] = 0 ; + conversionTable['C'] = 1 ; + conversionTable['T'] = 2 ; + conversionTable['G'] = 3 ; + conversionTable['N'] = 4 ; + } + + static inline uint8_t get(uint8_t input) { + return conversionTable[input] ; + } + +}; + + +#endif + + diff --git a/public/VectorPairHMM/src/main/c++/utils.cc b/public/VectorPairHMM/src/main/c++/utils.cc new file mode 100644 index 000000000..9f83cffa2 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/utils.cc @@ -0,0 +1,493 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. 
+ +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +#include "headers.h" +#include "template.h" +#include "utils.h" +#include "vector_defs.h" +#include "LoadTimeInitializer.h" +using namespace std; + +//static members from ConvertChar +uint8_t ConvertChar::conversionTable[255]; +//Global function pointers in utils.h +float (*g_compute_full_prob_float)(testcase *tc, float* before_last_log) = 0; +double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log) = 0; +//Static members in ContextBase +bool ContextBase::staticMembersInitializedFlag = false; +double ContextBase::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE]; +double ContextBase::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1]; +bool ContextBase::staticMembersInitializedFlag = false; +float ContextBase::jacobianLogTable[JACOBIAN_LOG_TABLE_SIZE]; +float ContextBase::matchToMatchProb[((MAX_QUAL + 1) * (MAX_QUAL + 2)) >> 1]; + + +bool is_avx_supported() +{ + int ecx = 0, edx = 0, ebx = 0; + __asm__("cpuid" + : "=b" (ebx), + "=c" (ecx), + "=d" (edx) + : "a" (1) + ); + return ((ecx >> 28)&1) == 1; +} + +bool is_sse41_supported() +{ + int ecx = 0, edx = 0, ebx = 0; + __asm__("cpuid" + : "=b" (ebx), + "=c" (ecx), + "=d" (edx) + : "a" (1) + ); + return ((ecx >> 19)&1) == 1; +} + +bool is_sse42_supported() +{ + int ecx = 0, edx = 0, ebx = 0; + __asm__("cpuid" + : "=b" (ebx), + "=c" (ecx), + "=d" (edx) + : "a" (1) + ); + return ((ecx >> 20)&1) == 1; +} + +uint64_t get_machine_capabilities() +{ + uint64_t machine_mask = 0ull; + if(is_avx_supported()) + machine_mask |= (1 << 
AVX_CUSTOM_IDX); + if(is_sse42_supported()) + machine_mask |= (1 << SSE42_CUSTOM_IDX); + if(is_sse41_supported()) + machine_mask |= (1 << SSE41_CUSTOM_IDX); + return machine_mask; +} + +void initialize_function_pointers(uint64_t mask) +{ + //mask = 0ull; + //mask = (1 << SSE41_CUSTOM_IDX); + if(is_avx_supported() && (mask & (1<< AVX_CUSTOM_IDX))) + { + cout << "Using AVX accelerated implementation of PairHMM\n"; + g_compute_full_prob_float = compute_full_prob_avxs; + g_compute_full_prob_double = compute_full_prob_avxd; + } + else + if(is_sse41_supported() && (mask & ((1<< SSE41_CUSTOM_IDX) | (1<; + g_compute_full_prob_double = compute_full_prob_ssed; + } + else + { + cout << "Using un-vectorized C++ implementation of PairHMM\n"; + g_compute_full_prob_float = compute_full_prob; + g_compute_full_prob_double = compute_full_prob; + } +} + +int normalize(char c) +{ + return ((int) (c - 33)); +} + +int read_testcase(testcase *tc, FILE* ifp) +{ + char *q, *i, *d, *c, *line = NULL; + int _q, _i, _d, _c; + int x, size = 0; + ssize_t read; + + + read = getline(&line, (size_t *) &size, ifp == 0 ? 
stdin : ifp); + if (read == -1) + { + free(line); + return -1; + } + + + tc->hap = (char *) malloc(size); + tc->rs = (char *) malloc(size); + q = (char *) malloc(size); + i = (char *) malloc(size); + d = (char *) malloc(size); + c = (char *) malloc(size); + + if (sscanf(line, "%s %s %s %s %s %s\n", tc->hap, tc->rs, q, i, d, c) != 6) + return -1; + + + tc->haplen = strlen(tc->hap); + tc->rslen = strlen(tc->rs); + assert(strlen(q) == tc->rslen); + assert(strlen(i) == tc->rslen); + assert(strlen(d) == tc->rslen); + assert(strlen(c) == tc->rslen); + //assert(tc->rslen < MROWS); + //tc->ihap = (int *) malloc(tc->haplen*sizeof(int)); + //tc->irs = (int *) malloc(tc->rslen*sizeof(int)); + + tc->q = (char *) malloc(sizeof(char) * tc->rslen); + tc->i = (char *) malloc(sizeof(char) * tc->rslen); + tc->d = (char *) malloc(sizeof(char) * tc->rslen); + tc->c = (char *) malloc(sizeof(char) * tc->rslen); + + for (x = 0; x < tc->rslen; x++) + { + _q = normalize(q[x]); + _i = normalize(i[x]); + _d = normalize(d[x]); + _c = normalize(c[x]); + tc->q[x] = (_q < 6) ? 
6 : _q; + //tc->q[x] = _q; + tc->i[x] = _i; + tc->d[x] = _d; + tc->c[x] = _c; + //tc->irs[x] = tc->rs[x]; + } + //for (x = 0; x < tc->haplen; x++) + //tc->ihap[x] = tc->hap[x]; + + free(q); + free(i); + free(d); + free(c); + free(line); + + + + return 0; +} + +unsigned MAX_LINE_LENGTH = 65536; +int convToInt(std::string s) +{ + int i; + std::istringstream strin(s); + strin >> i; + return i; +} + +void tokenize(std::ifstream& fptr, std::vector& tokens) +{ + int i = 0; + std::string tmp; + std::vector myVec; + vector line; + line.clear(); + line.resize(MAX_LINE_LENGTH); + vector tmpline; + tmpline.clear(); + tmpline.resize(MAX_LINE_LENGTH); + myVec.clear(); + + while(!fptr.eof()) + { + i = 0; + bool still_read_line = true; + unsigned line_position = 0; + while(still_read_line) + { + fptr.getline(&(tmpline[0]), MAX_LINE_LENGTH); + if(line_position + MAX_LINE_LENGTH > line.size()) + line.resize(2*line.size()); + for(unsigned i=0;i> std::skipws >> tmp; + if(tmp != "") + { + myVec.push_back(tmp); + ++i; + //std::cout < 0) + break; + } + tokens.clear(); + //std::cout << "Why "< tokens; + tokens.clear(); + tokenize(fptr, tokens); + if(tokens.size() == 0) + return -1; + tc->hap = new char[tokens[0].size()+2]; + tc->haplen = tokens[0].size(); + memcpy(tc->hap, tokens[0].c_str(), tokens[0].size()); + tc->rs = new char[tokens[1].size()+2]; + tc->rslen = tokens[1].size(); + tc->q = new char[tc->rslen]; + tc->i = new char[tc->rslen]; + tc->d = new char[tc->rslen]; + tc->c = new char[tc->rslen]; + //cout << "Lengths "<haplen <<" "<rslen<<"\n"; + memcpy(tc->rs, tokens[1].c_str(),tokens[1].size()); + assert(tokens.size() == 2 + 4*(tc->rslen)); + //assert(tc->rslen < MROWS); + for(unsigned j=0;jrslen;++j) + tc->q[j] = (char)convToInt(tokens[2+0*tc->rslen+j]); + for(unsigned j=0;jrslen;++j) + tc->i[j] = (char)convToInt(tokens[2+1*tc->rslen+j]); + for(unsigned j=0;jrslen;++j) + tc->d[j] = (char)convToInt(tokens[2+2*tc->rslen+j]); + for(unsigned j=0;jrslen;++j) + tc->c[j] = 
(char)convToInt(tokens[2+3*tc->rslen+j]); + + if(reformat) + { + ofstream ofptr; + ofptr.open("reformat/debug_dump.txt",first_call ? ios::out : ios::app); + assert(ofptr.is_open()); + ofptr << tokens[0] << " "; + ofptr << tokens[1] << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->q[j]+33)); + ofptr << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->i[j]+33)); + ofptr << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->d[j]+33)); + ofptr << " "; + for(unsigned j=0;jrslen;++j) + ofptr << ((char)(tc->c[j]+33)); + ofptr << " 0 false\n"; + + ofptr.close(); + first_call = false; + } + + + return tokens.size(); +} + +double getCurrClk() { + struct timeval tv ; + gettimeofday(&tv, NULL); + return (double)tv.tv_sec + (double)tv.tv_usec / 1000000.0; +} + +inline unsigned long long rdtsc(void) +{ + unsigned hi, lo; + __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); + return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); +} + +void get_time(struct timespec* store_struct) +{ + clock_gettime(CLOCK_REALTIME, store_struct); +} + +uint64_t diff_time(struct timespec& prev_time) +{ + struct timespec curr_time; + clock_gettime(CLOCK_REALTIME, &curr_time); + return (uint64_t)((curr_time.tv_sec-prev_time.tv_sec)*1000000000+(curr_time.tv_nsec-prev_time.tv_nsec)); +} + + +#ifdef USE_PAPI +#include "papi.h" +#define NUM_PAPI_COUNTERS 4 +#endif + +void do_compute(char* filename, bool use_old_read_testcase, unsigned chunk_size, bool do_check) +{ + FILE* fptr = 0; + ifstream ifptr; + if(use_old_read_testcase) + { + fptr = fopen(filename,"r"); + assert(fptr); + } + else + { + ifptr.open(filename); + assert(ifptr.is_open()); + } + vector tc_vector; + tc_vector.clear(); + testcase tc; + uint64_t vector_compute_time = 0; + uint64_t baseline_compute_time = 0; + uint64_t num_double_calls = 0; + unsigned num_testcases = 0; + bool all_ok = do_check ? 
true : false; +#ifdef USE_PAPI + uint32_t all_mask = (0); + uint32_t no_usr_mask = (1 << 16); //bit 16 user mode, bit 17 kernel mode + uint32_t no_kernel_mask = (1 << 17); //bit 16 user mode, bit 17 kernel mode + PAPI_num_counters(); + int events[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 }; + char* eventnames[NUM_PAPI_COUNTERS]= { "cycles", "itlb_walk_cycles", "dtlb_load_walk_cycles", "dtlb_store_walk_cycles" }; + assert(PAPI_event_name_to_code("UNHALTED_REFERENCE_CYCLES:u=1:k=1",&(events[0])) == PAPI_OK); + assert(PAPI_event_name_to_code("ITLB_MISSES:WALK_DURATION", &(events[1])) == PAPI_OK); + assert(PAPI_event_name_to_code("DTLB_LOAD_MISSES:WALK_DURATION", &(events[2])) == PAPI_OK); + assert(PAPI_event_name_to_code("DTLB_STORE_MISSES:WALK_DURATION", &(events[3])) == PAPI_OK); + long long values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 }; + long long accum_values[NUM_PAPI_COUNTERS] = { 0, 0, 0, 0 }; +#endif + while(1) + { + int break_value = use_old_read_testcase ? read_testcase(&tc, fptr) : read_mod_testcase(ifptr,&tc,true); + if(break_value >= 0) + tc_vector.push_back(tc); + if(tc_vector.size() == BATCH_SIZE || (break_value < 0 && tc_vector.size() > 0)) + { + vector results_vec; + vector baseline_results_vec; + results_vec.clear(); + baseline_results_vec.clear(); + results_vec.resize(tc_vector.size()); + baseline_results_vec.resize(tc_vector.size()); + struct timespec start_time; +#ifdef USE_PAPI + assert(PAPI_start_counters(events, NUM_PAPI_COUNTERS) == PAPI_OK); +#endif + get_time(&start_time); +#pragma omp parallel for schedule(dynamic,chunk_size) num_threads(12) +#ifdef DO_REPEAT_PROFILING + for(unsigned z=0;z<10;++z) +#endif + { + for(unsigned i=0;i(&tc); + baseline_result = log10(baseline_result) - log10(ldexp(1.0, 1020.0)); + baseline_results_vec[i] = baseline_result; + } + baseline_compute_time += diff_time(start_time); + for(unsigned i=0;i 1e-5 && rel_error > 1e-5) + { + cout << std::scientific << baseline_result << " "< +std::string to_string(T obj) +{ + 
std::stringstream ss; + std::string ret_string; + ss.clear(); + ss << std::scientific << obj; + ss >> ret_string; + ss.clear(); + return ret_string; +} +void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline=true); + +int read_mod_testcase(std::ifstream& fptr, testcase* tc, bool reformat=false); + +bool is_avx_supported(); +bool is_sse42_supported(); +extern float (*g_compute_full_prob_float)(testcase *tc, float *before_last_log); +extern double (*g_compute_full_prob_double)(testcase *tc, double* before_last_log); +void debug_dump(std::string filename, std::string s, bool to_append, bool add_newline); +template +NUMBER compute_full_prob(testcase *tc, NUMBER *before_last_log=0); +double getCurrClk(); +void get_time(struct timespec* x); +uint64_t diff_time(struct timespec& prev_time); + +//bit 0 is sse4.2, bit 1 is AVX +enum ProcessorCapabilitiesEnum +{ + SSE41_CUSTOM_IDX=0, + SSE42_CUSTOM_IDX, + AVX_CUSTOM_IDX +}; +#define ENABLE_ALL_HARDWARE_FEATURES 0xFFFFFFFFFFFFFFFFull +uint64_t get_machine_capabilities(); +void initialize_function_pointers(uint64_t mask=ENABLE_ALL_HARDWARE_FEATURES); +void do_compute(char* filename, bool use_old_read_testcase=true, unsigned chunk_size=10000, bool do_check=true); + +//#define DO_WARMUP +//#define DO_REPEAT_PROFILING +//#define DUMP_COMPUTE_VALUES 1 +#define BATCH_SIZE 10000 +#define RUN_HYBRID + +#endif diff --git a/public/VectorPairHMM/src/main/c++/vector_defs.h b/public/VectorPairHMM/src/main/c++/vector_defs.h new file mode 100644 index 000000000..2aca9565f --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/vector_defs.h @@ -0,0 +1,55 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell 
+*copies of the Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +#undef SIMD_ENGINE +#undef SIMD_ENGINE_AVX +#undef SIMD_ENGINE_SSE + +#define SIMD_ENGINE avx +#define SIMD_ENGINE_AVX + +#include "define-float.h" +#include "vector_function_prototypes.h" + +#include "define-double.h" +#include "vector_function_prototypes.h" + +#undef SIMD_ENGINE +#undef SIMD_ENGINE_AVX + +#define SIMD_ENGINE sse +#define SIMD_ENGINE_SSE + + +#include "define-sse-float.h" +#include "vector_function_prototypes.h" + +#include "define-sse-double.h" +#include "vector_function_prototypes.h" + +#undef SIMD_ENGINE +#undef SIMD_ENGINE_AVX +#undef SIMD_ENGINE_SSE + diff --git a/public/VectorPairHMM/src/main/c++/vector_function_prototypes.h b/public/VectorPairHMM/src/main/c++/vector_function_prototypes.h new file mode 100644 index 000000000..c0fddc394 --- /dev/null +++ b/public/VectorPairHMM/src/main/c++/vector_function_prototypes.h @@ -0,0 +1,44 @@ +/*Copyright (c) 2012 The Broad Institute + +*Permission is hereby granted, free of charge, to any person +*obtaining a copy of this software and associated documentation +*files (the "Software"), to deal in the Software without +*restriction, including without limitation the rights to use, +*copy, modify, merge, publish, distribute, sublicense, and/or sell +*copies of the 
Software, and to permit persons to whom the +*Software is furnished to do so, subject to the following +*conditions: + +*The above copyright notice and this permission notice shall be +*included in all copies or substantial portions of the Software. + +*THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +*EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +*OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +*NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +*HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +*WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +*FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +*THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +inline void CONCAT(CONCAT(_vector_shift,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn, MAIN_TYPE &shiftOut); +inline void CONCAT(CONCAT(_vector_shift_last,SIMD_ENGINE), PRECISION) (UNION_TYPE &x, MAIN_TYPE shiftIn); +inline void CONCAT(CONCAT(precompute_masks_,SIMD_ENGINE), PRECISION)(const testcase& tc, int COLS, int numMaskVecs, MASK_TYPE (*maskArr)[NUM_DISTINCT_CHARS]); +inline void CONCAT(CONCAT(init_masks_for_row_,SIMD_ENGINE), PRECISION)(const testcase& tc, char* rsArr, MASK_TYPE* lastMaskShiftOut, int beginRowIndex, int numRowsToProcess); +inline void CONCAT(CONCAT(update_masks_for_cols_,SIMD_ENGINE), PRECISION)(int maskIndex, MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, MASK_TYPE (*maskArr) [NUM_DISTINCT_CHARS], char* rsArr, MASK_TYPE* lastMaskShiftOut, MASK_TYPE maskBitCnt); +inline void CONCAT(CONCAT(computeDistVec,SIMD_ENGINE), PRECISION) (MASK_VEC& currMaskVecLow, MASK_VEC& currMaskVecHigh, SIMD_TYPE& distm, SIMD_TYPE& _1_distm, SIMD_TYPE& distmChosen); +template inline void CONCAT(CONCAT(initializeVectors,SIMD_ENGINE), PRECISION)(int ROWS, int COLS, NUMBER* shiftOutM, NUMBER *shiftOutX, NUMBER *shiftOutY, Context ctx, testcase *tc, SIMD_TYPE *p_MM, SIMD_TYPE *p_GAPM, SIMD_TYPE *p_MX, SIMD_TYPE *p_XX, 
SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, SIMD_TYPE *distm1D); +template inline void CONCAT(CONCAT(stripINITIALIZATION,SIMD_ENGINE), PRECISION)( + int stripIdx, Context ctx, testcase *tc, SIMD_TYPE &pGAPM, SIMD_TYPE &pMM, SIMD_TYPE &pMX, SIMD_TYPE &pXX, SIMD_TYPE &pMY, SIMD_TYPE &pYY, + SIMD_TYPE &rs, UNION_TYPE &rsN, SIMD_TYPE &distm, SIMD_TYPE &_1_distm, SIMD_TYPE *distm1D, SIMD_TYPE N_packed256, SIMD_TYPE *p_MM , SIMD_TYPE *p_GAPM , + SIMD_TYPE *p_MX, SIMD_TYPE *p_XX , SIMD_TYPE *p_MY, SIMD_TYPE *p_YY, UNION_TYPE &M_t_2, UNION_TYPE &X_t_2, UNION_TYPE &M_t_1, UNION_TYPE &X_t_1, + UNION_TYPE &Y_t_2, UNION_TYPE &Y_t_1, UNION_TYPE &M_t_1_y, NUMBER* shiftOutX, NUMBER* shiftOutM); +inline SIMD_TYPE CONCAT(CONCAT(computeDISTM,SIMD_ENGINE), PRECISION)(int d, int COLS, testcase * tc, HAP_TYPE &hap, SIMD_TYPE rs, UNION_TYPE rsN, SIMD_TYPE N_packed256, + SIMD_TYPE distm, SIMD_TYPE _1_distm); +inline void CONCAT(CONCAT(computeMXY,SIMD_ENGINE), PRECISION)(UNION_TYPE &M_t, UNION_TYPE &X_t, UNION_TYPE &Y_t, UNION_TYPE &M_t_y, + UNION_TYPE M_t_2, UNION_TYPE X_t_2, UNION_TYPE Y_t_2, UNION_TYPE M_t_1, UNION_TYPE X_t_1, UNION_TYPE M_t_1_y, UNION_TYPE Y_t_1, + SIMD_TYPE pMM, SIMD_TYPE pGAPM, SIMD_TYPE pMX, SIMD_TYPE pXX, SIMD_TYPE pMY, SIMD_TYPE pYY, SIMD_TYPE distmSel); +template NUMBER CONCAT(CONCAT(compute_full_prob_,SIMD_ENGINE), PRECISION) (testcase *tc, NUMBER *before_last_log = NULL); + diff --git a/public/sting-root/pom.xml b/public/sting-root/pom.xml index 171eb7620..549a99ae6 100644 --- a/public/sting-root/pom.xml +++ b/public/sting-root/pom.xml @@ -335,7 +335,11 @@ maven-assembly-plugin 2.4 - + + org.apache.maven.plugins + maven-enforcer-plugin + 1.3.1 +