From d7954372020086206fb226eb620031c7a5c71b9c Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Tue, 4 Sep 2012 16:41:44 -0400 Subject: [PATCH 01/45] - New UserExceptions added for when ReadFilters or Walkers specified on the command line are not found. When -rf xxxx cannot find the class corresponding to xxxx, all read filters are printed in a better formatted way, with links to their gatk docs. - VariantAnnotatorEngine changed to call genotype annotations even if pilups and allele -> likelihood mappings are not present. Current genotype annotations altered to check for null pilupes and null mappings. --- .../sting/gatk/filters/FilterManager.java | 30 ++++++++++---- .../annotator/AlleleBalanceBySample.java | 3 ++ .../annotator/DepthPerAlleleBySample.java | 2 +- .../annotator/MappingQualityZeroBySample.java | 2 +- .../annotator/VariantAnnotatorEngine.java | 14 +++---- .../utils/classloader/PluginManager.java | 14 ++++++- .../sting/utils/exceptions/UserException.java | 12 ++++++ .../InvalidArgumentIntegrationTest.java | 41 +++++++++++++++++++ 8 files changed, 97 insertions(+), 21 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java index bddfa6a0d..5ca8a1779 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java @@ -29,6 +29,7 @@ import com.google.common.base.Function; import com.google.common.collect.Collections2; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; +import org.broadinstitute.sting.utils.help.GATKDocUtils; import java.util.Collection; import java.util.List; @@ -68,16 +69,29 @@ public class FilterManager extends PluginManager { @Override protected String formatErrorMessage(String pluginCategory, String pluginName) { List> availableFilters = this.getPluginsImplementing(ReadFilter.class); - Collection availableFilterNames = Collections2.transform(availableFilters, new Function,String>(){ - @Override - public String apply(final Class input) { - return getName(input); - } - }); - return String.format("Read filter %s not found. Available read filters:%n%s.%n%n%s",pluginName, - Utils.join(String.format(", "),availableFilterNames), + return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName, + userFriendlyListofReadFilters(availableFilters), "Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information."); } + + private String userFriendlyListofReadFilters(List> filters) { + final String headName = "FilterName", headDoc = "Documentation"; + int longestNameLength = -1; + for ( Class < ? extends ReadFilter> filter : filters ) { + longestNameLength = Math.max(longestNameLength,this.getName(filter).length()); + } + String format = " %"+longestNameLength+"s %s%n"; + + StringBuilder listBuilder = new StringBuilder(); + listBuilder.append(String.format(format,headName,headDoc)); + for ( Class filter : filters ) { + String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter); + String filterName = this.getName(filter); + listBuilder.append(String.format(format,filterName,helpLink)); + } + + return listBuilder.toString(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index 0104f24d9..1e1f65333 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -33,6 +33,9 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ + if ( stratifiedContext == null ) + return; + Double ratio = annotateSNP(stratifiedContext, vc, g); if (ratio == null) return; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 85387f7cf..ee9b51b56 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -54,7 +54,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { - if ( g == null || !g.isCalled() ) + if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) return; if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java index 354b798bb..44657a7e7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java @@ -55,7 +55,7 @@ public class MappingQualityZeroBySample extends GenotypeAnnotation { final Genotype g, final GenotypeBuilder gb, final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ - if ( g == null || !g.isCalled() ) + if ( g == null || !g.isCalled() || stratifiedContext == null ) return; int mq0 = 0; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 22ec5468f..eae13e1b5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -300,16 +300,12 @@ public class VariantAnnotatorEngine { if (stratifiedPerReadAlleleLikelihoodMap != null) perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); - if ( context == null && perReadAlleleLikelihoodMap == null) { - // no likelihoods nor pileup available: just move on to next sample - genotypes.add(genotype); - } else { - final GenotypeBuilder gb = new GenotypeBuilder(genotype); - for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { - annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap); - } - genotypes.add(gb.make()); + + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { + annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap); } + genotypes.add(gb.make()); } return genotypes; diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index 9f1b6db93..82fb6b8d6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -27,6 +27,8 @@ package org.broadinstitute.sting.utils.classloader; import ch.qos.logback.classic.Level; import ch.qos.logback.classic.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -276,8 +278,16 @@ public class PluginManager { */ public PluginType createByName(String pluginName) { Class plugin = pluginsByName.get(pluginName); - if( plugin == null ) - throw new UserException(formatErrorMessage(pluginCategory,pluginName)); + if( plugin == null ) { + String errorMessage = formatErrorMessage(pluginCategory,pluginName); + if ( this.getClass().isAssignableFrom(FilterManager.class) ) { + throw new UserException.MalformedReadFilterException(errorMessage); + } else if ( this.getClass().isAssignableFrom(WalkerManager.class) ) { + throw new UserException.MalformedWalkerArgumentsException(errorMessage); + } else { + throw new UserException.CommandLineException(errorMessage); + } + } try { return plugin.newInstance(); } catch (Exception e) { diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 3130469e5..47a2f2f1d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -63,6 +63,18 @@ public class UserException extends ReviewedStingException { } } + public static class MalformedReadFilterException extends CommandLineException { + public MalformedReadFilterException(String message) { + super(String.format("Malformed read filter: %s",message)); + } + } + + public static class MalformedWalkerArgumentsException extends CommandLineException { + public MalformedWalkerArgumentsException(String message) { + super(String.format("Malformed walker argument: %s",message)); + } + } + public static class MalformedGenomeLoc extends UserException { public MalformedGenomeLoc(String message, GenomeLoc loc) { super(String.format("Badly formed genome loc: %s: %s", message, loc)); diff --git a/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java new file mode 100644 index 000000000..924c6ec5a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java @@ -0,0 +1,41 @@ +package org.broadinstitute.sting.commandline; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/31/12 + * Time: 11:03 AM + * To change this template use File | Settings | File Templates. + */ +public class InvalidArgumentIntegrationTest extends WalkerTest { + private static final String callsB36 = BaseTest.validationDataLocation + "lowpass.N3.chr1.raw.vcf"; + + private WalkerTest.WalkerTestSpec baseTest(String flag, String arg, Class exeption) { + return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s " + flag + " " + arg, + 1, exeption); + + } + + @Test + public void testUnknownReadFilter() { + executeTest("UnknownReadFilter",baseTest("-rf","TestUnknownReadFilter", UserException.MalformedReadFilterException.class)); + } + + @Test + public void testMalformedWalkerArgs() { + executeTest("MalformedWalkerArgs", + new WalkerTest.WalkerTestSpec("-T UnknownWalkerName -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s ", + 1, UserException.MalformedWalkerArgumentsException.class)); + } +} From fc06f39411563691b405887cbb030fb8791ee4e9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 5 Sep 2012 09:55:34 -0400 Subject: [PATCH 02/45] Fixed docs for Pileup walker --- .../broadinstitute/sting/gatk/walkers/Pileup.java | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java index 2a6ecdb8c..52c6e1560 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java @@ -45,19 +45,8 @@ import java.util.Collections; import java.util.List; /** - * Prints the alignment in the pileup format. In the pileup format, each line represents a genomic position, - * consisting of chromosome name, coordinate, reference base, read bases, read qualities and alignment mapping - * qualities. Information on match, mismatch, indel, strand, mapping quality and start and end of a read are all - * encoded at the read base column. At this column, a dot stands for a match to the reference base on the forward strand, - * a comma for a match on the reverse strand, 'ACGTN' for a mismatch on the forward strand and 'acgtn' for a mismatch on the - * reverse strand. - * - * A pattern '\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this reference position and the next - * reference position. The length of the insertion is given by the integer in the pattern, followed by the inserted sequence. - * Similarly, a pattern '-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. - * Also at the read base column, a symbol '^' marks the start of a read segment which is a contiguous subsequence on the read - * separated by 'N/S/H' CIGAR operations. The ASCII of the character following '^' minus 33 gives the mapping quality. - * A symbol '$' marks the end of a read segment. + * Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, + * consisting of chromosome name, coordinate, reference base, read bases, and read qualities. * * Associated command: * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] From 6e517df5d94141d3badc45f0ec0b7e65828fc158 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Wed, 5 Sep 2012 14:33:08 -0400 Subject: [PATCH 03/45] fixed a typo in StringText.properties --- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 845fc68a6..5009698e1 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -380,7 +380,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem allelesToGenotype.removeAll( activeAllelesToGenotype ); } - if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do! + if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do TODO : YOSSI Write something smart!! if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! From ad5fa449e7e19c53875cbaaa2a21c78b360cecf8 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Wed, 5 Sep 2012 14:46:10 -0400 Subject: [PATCH 04/45] fixed a typo in the string comment --- .../sting/gatk/walkers/indels/RealignerTargetCreator.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index a52d57031..b14dc9cc9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -57,7 +57,7 @@ import java.util.TreeSet; * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, - * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are + * it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an @@ -69,7 +69,7 @@ import java.util.TreeSet; *
  • Running the realigner over those intervals (see the IndelRealigner tool)
  • * *

    - * An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. + * An important note: the input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. *

    * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. From d6884e705a06d734aed634f05a9e35026ab418b1 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Wed, 5 Sep 2012 15:21:00 -0400 Subject: [PATCH 05/45] Revert "fixed a typo in StringText.properties" This reverts commit b74c1c17e748f75e59d23545084b983e2a8d2fa6. --- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 0537ca189..f4d8a88e0 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -380,7 +380,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem allelesToGenotype.removeAll( activeAllelesToGenotype ); } - if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do TODO : YOSSI Write something smart!! + if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do! if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! From e3b4cc02aa3d18a6f436093774356ceaffba6a46 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 07:26:01 -0400 Subject: [PATCH 06/45] Done GSA-282: Unindexed traversals crash if a read goes off the end of a contig -- Already fixed in the codebase. Added unindexed bam and integration tests to ensure this is fine going forward. --- .../walkers/PileupWalkerIntegrationTest.java | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index 667b325ed..e16ef3125 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -16,13 +16,27 @@ public class PileupWalkerIntegrationTest extends WalkerTest { executeTest("Testing the standard (no-indel) pileup on three merged FHS pools with 27 deletions in 969 bases", spec); } + + + private final static String SingleReadAligningOffChromosome1MD5 = "4a45fe1f85aaa8c4158782f2b6dee2bd"; @Test public void testSingleReadAligningOffChromosome1() { String gatk_args = "-T Pileup " + " -I " + privateTestDir + "readOffb37contig1.bam" + " -R " + b37KGReference + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList("4a45fe1f85aaa8c4158782f2b6dee2bd")); + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(SingleReadAligningOffChromosome1MD5)); executeTest("Testing single read spanning off chromosome 1", spec); } + + @Test + public void testSingleReadAligningOffChromosome1NoIndex() { + String gatk_args = "-T Pileup " + + " -I " + privateTestDir + "readOffb37contig1.noIndex.bam" + + " -R " + b37KGReference + + " -U ALLOW_UNINDEXED_BAM" + + " -o %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(SingleReadAligningOffChromosome1MD5)); + executeTest("Testing single read spanning off chromosome 1 unindexed", spec); + } } From 397a5551ef73e87971ba255c68a9f82b73d21490 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 19:54:33 -0400 Subject: [PATCH 07/45] More memory for gatkdocs and extracthelp targets --- build.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build.xml b/build.xml index f681ddafa..0d1deba29 100644 --- a/build.xml +++ b/build.xml @@ -577,6 +577,7 @@ docletpathref="doclet.classpath" classpathref="external.dependencies" classpath="${java.classes}" + maxmemory="2g" additionalparam="-build-timestamp "${build.timestamp}" -absolute-version ${build.version} -out ${basedir}/${resource.path} -quiet"> @@ -780,6 +781,7 @@ docletpathref="doclet.classpath" classpathref="external.dependencies" classpath="${java.classes}" + maxmemory="2g" additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp "${build.timestamp}" -absolute-version ${build.version} -quiet"> From 6055101df8965a3391a19fe686edb8ba85f10487 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 31 Aug 2012 20:10:26 -0400 Subject: [PATCH 08/45] NanoScheduler no longer groups inputs, each map() call is interlaced now -- Maximizes the efficiency of the threads -- Simplifies interface (yea!) -- Reduces number of combinatorial tests that need to be performed --- .../gatk/traversals/TraverseReadsNano.java | 4 +- .../utils/nanoScheduler/NanoScheduler.java | 76 ++++++------------- .../nanoScheduler/NanoSchedulerUnitTest.java | 41 +++++----- 3 files changed, 43 insertions(+), 78 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index b397cb8c0..dbddeb092 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -55,13 +55,11 @@ public class TraverseReadsNano extends TraversalEngine, /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); private static final boolean DEBUG = false; - private static final int MIN_GROUP_SIZE = 100; final NanoScheduler nanoScheduler; public TraverseReadsNano(int nThreads) { final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max - final int mapGroupSize = (int)Math.max(Math.ceil(bufferSize / 50.0 + 1), MIN_GROUP_SIZE); - nanoScheduler = new NanoScheduler(bufferSize, mapGroupSize, nThreads); + nanoScheduler = new NanoScheduler(bufferSize, nThreads); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 668c82524..5c6aa6a35 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -3,7 +3,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Iterator; @@ -47,7 +46,6 @@ public class NanoScheduler { private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; final int bufferSize; - final int mapGroupSize; final int nThreads; final ExecutorService executor; boolean shutdown = false; @@ -57,29 +55,15 @@ public class NanoScheduler { * Create a new nanoschedule with the desire characteristics requested by the argument * * @param bufferSize the number of input elements to read in each scheduling cycle. - * @param mapGroupSize How many inputs should be grouped together per map? If -1 we make a reasonable guess * @param nThreads the number of threads to use to get work done, in addition to the thread calling execute */ public NanoScheduler(final int bufferSize, - final int mapGroupSize, final int nThreads) { if ( bufferSize < 1 ) throw new IllegalArgumentException("bufferSize must be >= 1, got " + bufferSize); if ( nThreads < 1 ) throw new IllegalArgumentException("nThreads must be >= 1, got " + nThreads); - if ( mapGroupSize > bufferSize ) throw new IllegalArgumentException("mapGroupSize " + mapGroupSize + " must be <= bufferSize " + bufferSize); - if ( mapGroupSize == 0 || mapGroupSize < -1 ) throw new IllegalArgumentException("mapGroupSize cannot be <= 0" + mapGroupSize); - this.bufferSize = bufferSize; this.nThreads = nThreads; - - if ( mapGroupSize == -1 ) { - this.mapGroupSize = (int)Math.ceil(this.bufferSize / (10.0*this.nThreads)); - logger.info(String.format("Dynamically setting grouping size to %d based on buffer size %d and n threads %d", - this.mapGroupSize, this.bufferSize, this.nThreads)); - } else { - this.mapGroupSize = mapGroupSize; - } - this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads); } @@ -101,15 +85,6 @@ public class NanoScheduler { return bufferSize; } - /** - * The grouping size used by this NanoScheduler - * @return - */ - @Ensures("result > 0") - public int getMapGroupSize() { - return mapGroupSize; - } - /** * Tells this nanoScheduler to shutdown immediately, releasing all its resources. * @@ -214,10 +189,10 @@ public class NanoScheduler { final List inputs = readInputs(inputReader); // send jobs for map - final Queue>> mapQueue = submitMapJobs(map, executor, inputs); + final Queue> mapQueue = submitMapJobs(map, executor, inputs); // send off the reduce job, and block until we get at least one reduce result - sum = reduceParallel(reduce, mapQueue, sum); + sum = reduceSerial(reduce, mapQueue, sum); } catch (InterruptedException ex) { throw new ReviewedStingException("got execution exception", ex); } catch (ExecutionException ex) { @@ -229,16 +204,16 @@ public class NanoScheduler { } @Requires({"reduce != null", "! mapQueue.isEmpty()"}) - private ReduceType reduceParallel(final ReduceFunction reduce, - final Queue>> mapQueue, - final ReduceType initSum) + private ReduceType reduceSerial(final ReduceFunction reduce, + final Queue> mapQueue, + final ReduceType initSum) throws InterruptedException, ExecutionException { ReduceType sum = initSum; // while mapQueue has something in it to reduce - for ( final Future> future : mapQueue ) { - for ( final MapType value : future.get() ) // block until we get the values for this task - sum = reduce.apply(value, sum); + for ( final Future future : mapQueue ) { + final MapType value = future.get(); // block until we get the values for this task + sum = reduce.apply(value, sum); } return sum; @@ -247,7 +222,7 @@ public class NanoScheduler { /** * Read up to inputBufferSize elements from inputReader * - * @return a queue of inputs read in, containing one or more values of InputType read in + * @return a queue of input read in, containing one or more values of InputType read in */ @Requires("inputReader.hasNext()") @Ensures("!result.isEmpty()") @@ -263,14 +238,14 @@ public class NanoScheduler { } @Requires({"map != null", "! inputs.isEmpty()"}) - private Queue>> submitMapJobs(final MapFunction map, - final ExecutorService executor, - final List inputs) { - final Queue>> mapQueue = new LinkedList>>(); + private Queue> submitMapJobs(final MapFunction map, + final ExecutorService executor, + final List inputs) { + final Queue> mapQueue = new LinkedList>(); - for ( final List subinputs : Utils.groupList(inputs, getMapGroupSize()) ) { - final CallableMap doMap = new CallableMap(map, subinputs); - final Future> future = executor.submit(doMap); + for ( final InputType input : inputs ) { + final CallableMap doMap = new CallableMap(map, input); + final Future future = executor.submit(doMap); mapQueue.add(future); } @@ -280,23 +255,18 @@ public class NanoScheduler { /** * A simple callable version of the map function for use with the executor pool */ - private class CallableMap implements Callable> { - final List inputs; + private class CallableMap implements Callable { + final InputType input; final MapFunction map; - @Requires({"map != null", "inputs.size() <= getMapGroupSize()"}) - private CallableMap(final MapFunction map, final List inputs) { - this.inputs = inputs; + @Requires({"map != null"}) + private CallableMap(final MapFunction map, final InputType inputs) { + this.input = inputs; this.map = map; } - @Ensures("result.size() == inputs.size()") - @Override public List call() throws Exception { - final List outputs = new LinkedList(); - for ( final InputType input : inputs ) - outputs.add(map.apply(input)); - debugPrint(" Processed %d elements with map", outputs.size()); - return outputs; + @Override public MapType call() throws Exception { + return map.apply(input); } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 89506dcb1..1dcc243f2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -5,7 +5,10 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; /** * UnitTests for the NanoScheduler @@ -39,18 +42,17 @@ public class NanoSchedulerUnitTest extends BaseTest { } private static class NanoSchedulerBasicTest extends TestDataProvider { - final int bufferSize, mapGroupSize, nThreads, start, end, expectedResult; + final int bufferSize, nThreads, start, end, expectedResult; - public NanoSchedulerBasicTest(final int bufferSize, final int mapGroupSize, final int nThreads, final int start, final int end) { + public NanoSchedulerBasicTest(final int bufferSize, final int nThreads, final int start, final int end) { super(NanoSchedulerBasicTest.class); this.bufferSize = bufferSize; - this.mapGroupSize = mapGroupSize; this.nThreads = nThreads; this.start = start; this.end = end; this.expectedResult = sum2x(start, end); - setName(String.format("%s nt=%d buf=%d mapGroupSize=%d start=%d end=%d sum=%d", - getClass().getSimpleName(), nThreads, bufferSize, mapGroupSize, start, end, expectedResult)); + setName(String.format("%s nt=%d buf=%d start=%d end=%d sum=%d", + getClass().getSimpleName(), nThreads, bufferSize, start, end, expectedResult)); } public Iterator makeReader() { @@ -69,14 +71,10 @@ public class NanoSchedulerUnitTest extends BaseTest { @DataProvider(name = "NanoSchedulerBasicTest") public Object[][] createNanoSchedulerBasicTest() { for ( final int bufferSize : Arrays.asList(1, 10, 1000, 1000000) ) { - for ( final int mapGroupSize : Arrays.asList(-1, 1, 10, 100, 1000) ) { - if ( mapGroupSize <= bufferSize ) { - for ( final int nt : Arrays.asList(1, 2, 4) ) { - for ( final int start : Arrays.asList(0) ) { - for ( final int end : Arrays.asList(1, 2, 11, 10000, 100000) ) { - exampleTest = new NanoSchedulerBasicTest(bufferSize, mapGroupSize, nt, start, end); - } - } + for ( final int nt : Arrays.asList(1, 2, 4) ) { + for ( final int start : Arrays.asList(0) ) { + for ( final int end : Arrays.asList(1, 2, 11, 10000, 100000) ) { + exampleTest = new NanoSchedulerBasicTest(bufferSize, nt, start, end); } } } @@ -101,10 +99,9 @@ public class NanoSchedulerUnitTest extends BaseTest { private void testNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + new NanoScheduler(test.bufferSize, test.nThreads); Assert.assertEquals(nanoScheduler.getBufferSize(), test.bufferSize, "bufferSize argument"); - Assert.assertTrue(nanoScheduler.getMapGroupSize() >= test.mapGroupSize, "mapGroupSize argument"); Assert.assertEquals(nanoScheduler.getnThreads(), test.nThreads, "nThreads argument"); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); @@ -115,11 +112,11 @@ public class NanoSchedulerUnitTest extends BaseTest { @Test(enabled = true, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { - if ( test.bufferSize > 1 && (test.mapGroupSize > 1 || test.mapGroupSize == -1)) { + if ( test.bufferSize > 1) { logger.warn("Running " + test); final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + new NanoScheduler(test.bufferSize, test.nThreads); // test reusing the scheduler for ( int i = 0; i < 10; i++ ) { @@ -134,7 +131,7 @@ public class NanoSchedulerUnitTest extends BaseTest { @Test(timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdown() throws InterruptedException { - final NanoScheduler nanoScheduler = new NanoScheduler(1, 1, 2); + final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); Assert.assertFalse(nanoScheduler.isShutdown(), "scheduler should be alive"); nanoScheduler.shutdown(); Assert.assertTrue(nanoScheduler.isShutdown(), "scheduler should be dead"); @@ -142,15 +139,15 @@ public class NanoSchedulerUnitTest extends BaseTest { @Test(expectedExceptions = IllegalStateException.class, timeOut = NANO_SCHEDULE_MAX_RUNTIME) public void testShutdownExecuteFailure() throws InterruptedException { - final NanoScheduler nanoScheduler = new NanoScheduler(1, 1, 2); + final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); nanoScheduler.shutdown(); nanoScheduler.execute(exampleTest.makeReader(), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); } public static void main(String [ ] args) { - final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, 100, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); + final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); final NanoScheduler nanoScheduler = - new NanoScheduler(test.bufferSize, test.mapGroupSize, test.nThreads); + new NanoScheduler(test.bufferSize, test.nThreads); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); System.out.printf("Sum = %d, expected =%d%n", sum, test.expectedResult); From e01258b2615609e925f2deb3dd886bae6b08402a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 1 Sep 2012 11:51:31 -0400 Subject: [PATCH 09/45] NanoScheduler now supports printProgress. Bugfixes to printProgress -- TraverseReadsNano prints progress at the end of each traversal unit -- Fix bugs in TraversalEngine printProgress -- Synchronize the method so we don't get multiple logged outputs when two or more HMSs call printProgress before initialization at the start! -- Fix the logic for mustPrint, which actually had the logic of mustNotPrint. Now we see the done log line that was always supposed to be there -- Fix output formatting, as the done() line was incorrectly shifting over the % complete by 1 char as 100.0% didn't fit in %4.1f -- Add clearer doc on -PF argument so that people know that the performance log can be generated to standard out if one wants --- .../arguments/GATKArgumentCollection.java | 10 +++++- .../gatk/traversals/TraversalEngine.java | 33 +++++++++++++++---- .../gatk/traversals/TraverseReadsNano.java | 18 +++++++--- 3 files changed, 48 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 72cb5e02f..6be66b204 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -41,7 +41,9 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule; import java.io.File; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; /** * @author aaron @@ -197,6 +199,12 @@ public class GATKArgumentCollection { // performance log arguments // // -------------------------------------------------------------------------------------------------------------- + + /** + * The file name for the GATK performance log output, or null if you don't want to generate the + * detailed performance logging table. This table is suitable for importing into R or any + * other analysis software that can read tsv files + */ @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false) public File performanceLog = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index abc71e549..198f9342e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -189,12 +189,26 @@ public abstract class TraversalEngine,Provide /** * Forward request to printProgress * + * Assumes that one cycle has been completed + * * @param shard the given shard currently being processed. * @param loc the location */ public void printProgress(Shard shard, GenomeLoc loc) { // A bypass is inserted here for unit testing. - printProgress(loc,shard.getReadMetrics(),false); + printProgress(loc,shard.getReadMetrics(),false, 1); + } + + /** + * Forward request to printProgress + * + * @param shard the given shard currently being processed. + * @param loc the location + * @param nElapsedCycles the number of cycles (turns of map) that have occurred since the last call + */ + public void printProgress(Shard shard, GenomeLoc loc, int nElapsedCycles) { + // A bypass is inserted here for unit testing. + printProgress(loc,shard.getReadMetrics(),false, nElapsedCycles); } /** @@ -205,12 +219,16 @@ public abstract class TraversalEngine,Provide * @param metrics Data processed since the last cumulative * @param mustPrint If true, will print out info, regardless of nRecords or time interval */ - private void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint) { - if ( mustPrint || printProgressCheckCounter++ % PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES != 0 ) + private synchronized void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint, int nElapsedCycles) { + final int previousPrintCycle = printProgressCheckCounter / PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES; + final int newPrintCycle = (printProgressCheckCounter+nElapsedCycles) / PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES; + + printProgressCheckCounter += nElapsedCycles; // keep track of our number of cycles through printProgress + if ( newPrintCycle == previousPrintCycle && ! mustPrint ) // don't do any work more often than PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES return; - if(!progressMeterInitialized && mustPrint == false ) { + if( ! progressMeterInitialized ) { logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]"); logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", "Location", getTraversalType(), getTraversalType())); @@ -250,8 +268,9 @@ public abstract class TraversalEngine,Provide else PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds - logger.info(String.format("%15s %5.2e %s %s %4.1f%% %s %s", - loc == null ? "done with mapped reads" : loc, nRecords*1.0, elapsed, unitRate, + final String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : Integer.toString(loc.getStart()); + logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", + posName, nRecords*1.0, elapsed, unitRate, 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); } @@ -309,7 +328,7 @@ public abstract class TraversalEngine,Provide * Called after a traversal to print out information about the traversal process */ public void printOnTraversalDone() { - printProgress(null, null, true); + printProgress(null, null, true, 1); final double elapsed = timer == null ? 0 : timer.getElapsedTime(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index dbddeb092..2ada8bbfa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadView; import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; @@ -87,9 +88,15 @@ public class TraverseReadsNano extends TraversalEngine, final TraverseReadsMap myMap = new TraverseReadsMap(walker); final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); - T result = nanoScheduler.execute(aggregateMapData(dataProvider).iterator(), myMap, sum, myReduce); - // TODO -- how do we print progress? - //printProgress(dataProvider.getShard(), ???); + final List aggregatedInputs = aggregateMapData(dataProvider); + final T result = nanoScheduler.execute(aggregatedInputs.iterator(), myMap, sum, myReduce); + + final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; + final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); + printProgress(dataProvider.getShard(), locus, aggregatedInputs.size()); + + // TODO -- how can I get done value? + // done = walker.isDone(); return result; } @@ -165,8 +172,9 @@ public class TraverseReadsNano extends TraversalEngine, return walker.map(data.refContext, data.read, data.tracker); } } - - return null; // TODO -- what should we return in the case where the walker is done or the read is filtered? + // TODO -- how can we cleanly support done and filtered. Need to return + // TODO -- a MapResult object that says the status + return null; } } } From 7087b22ea397c96a78a9dbc2bc98558d80343cea Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 1 Sep 2012 12:28:54 -0400 Subject: [PATCH 10/45] No debugging output (even conditional) for ReadTransformers in PrintReads --- .../src/org/broadinstitute/sting/gatk/walkers/PrintReads.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java index a5d4b45b6..4118617fc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java @@ -228,7 +228,6 @@ public class PrintReads extends ReadWalker impleme GATKSAMRecord workingRead = readIn; for ( final ReadTransformer transformer : readTransformers ) { - if ( logger.isDebugEnabled() ) logger.debug("Applying transformer " + transformer + " to read " + readIn.getReadName()); workingRead = transformer.apply(workingRead); } From 800a27c3a701bef87bd8210b0dddf080c1555068 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 1 Sep 2012 12:29:59 -0400 Subject: [PATCH 11/45] NanoScheduler tracks time within input, map, and reduce -- Helpful for understanding where the time goes to each bit of the code. -- Controlled by a local static boolean, to avoid the potential overhead in general --- .../utils/nanoScheduler/NanoScheduler.java | 40 ++++++++++++++++--- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 5c6aa6a35..39b541944 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -3,6 +3,8 @@ package org.broadinstitute.sting.utils.nanoScheduler; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Iterator; @@ -44,6 +46,7 @@ import java.util.concurrent.*; public class NanoScheduler { private final static Logger logger = Logger.getLogger(NanoScheduler.class); private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; + private final static boolean TIME_CALLS = true; final int bufferSize; final int nThreads; @@ -51,6 +54,10 @@ public class NanoScheduler { boolean shutdown = false; boolean debug = false; + final SimpleTimer inputTimer = new SimpleTimer(); + final SimpleTimer mapTimer = new SimpleTimer(); + final SimpleTimer reduceTimer = new SimpleTimer(); + /** * Create a new nanoschedule with the desire characteristics requested by the argument * @@ -97,6 +104,19 @@ public class NanoScheduler { throw new IllegalStateException("Remaining tasks found in the executor, unexpected behavior!"); } shutdown = true; + + if (TIME_CALLS) { + printTimerInfo("Input time", inputTimer); + printTimerInfo("Map time", mapTimer); + printTimerInfo("Reduce time", reduceTimer); + } + } + + private void printTimerInfo(final String label, final SimpleTimer timer) { + final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() + reduceTimer.getElapsedTime(); + final double myTimeInSec = timer.getElapsedTime(); + final double myTimePercent = myTimeInSec / total * 100; + logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); } /** @@ -134,10 +154,10 @@ public class NanoScheduler { * It is safe to call this function repeatedly on a single nanoScheduler, at least until the * shutdown method is called. * - * @param inputReader - * @param map - * @param reduce - * @return + * @param inputReader an iterator providing us with the input data to nanoSchedule map/reduce over + * @param map the map function from input type -> map type, will be applied in parallel to each input + * @param reduce the reduce function from map type + reduce type -> reduce type to be applied in order to map results + * @return the last reduce value */ public ReduceType execute(final Iterator inputReader, final MapFunction map, @@ -213,7 +233,10 @@ public class NanoScheduler { // while mapQueue has something in it to reduce for ( final Future future : mapQueue ) { final MapType value = future.get(); // block until we get the values for this task + + if ( TIME_CALLS) reduceTimer.restart(); sum = reduce.apply(value, sum); + if ( TIME_CALLS) reduceTimer.stop(); } return sum; @@ -229,11 +252,15 @@ public class NanoScheduler { private List readInputs(final Iterator inputReader) { int n = 0; final List inputs = new LinkedList(); + + if ( TIME_CALLS) inputTimer.restart(); while ( inputReader.hasNext() && n < getBufferSize() ) { final InputType input = inputReader.next(); inputs.add(input); n++; } + if ( TIME_CALLS) inputTimer.stop(); + return inputs; } @@ -266,7 +293,10 @@ public class NanoScheduler { } @Override public MapType call() throws Exception { - return map.apply(input); + if ( TIME_CALLS) mapTimer.restart(); + final MapType result = map.apply(input); + if ( TIME_CALLS) mapTimer.stop(); + return result; } } } From 59109d5eeb8798bb2c6eabf4b987837fc693b951 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 2 Sep 2012 10:54:18 -0400 Subject: [PATCH 12/45] NanoScheduler tracks time outside of its execute call --- .../utils/nanoScheduler/NanoScheduler.java | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 39b541944..a6be6ad6d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -54,6 +54,7 @@ public class NanoScheduler { boolean shutdown = false; boolean debug = false; + final SimpleTimer outsideSchedulerTimer = new SimpleTimer(); final SimpleTimer inputTimer = new SimpleTimer(); final SimpleTimer mapTimer = new SimpleTimer(); final SimpleTimer reduceTimer = new SimpleTimer(); @@ -72,6 +73,9 @@ public class NanoScheduler { this.bufferSize = bufferSize; this.nThreads = nThreads; this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads); + + // start timing the time spent outside of the nanoScheduler + outsideSchedulerTimer.start(); } /** @@ -98,6 +102,8 @@ public class NanoScheduler { * After this call, execute cannot be invoked without throwing an error */ public void shutdown() { + outsideSchedulerTimer.stop(); + if ( executor != null ) { final List remaining = executor.shutdownNow(); if ( ! remaining.isEmpty() ) @@ -106,14 +112,16 @@ public class NanoScheduler { shutdown = true; if (TIME_CALLS) { - printTimerInfo("Input time", inputTimer); - printTimerInfo("Map time", mapTimer); - printTimerInfo("Reduce time", reduceTimer); + printTimerInfo("Input time", inputTimer); + printTimerInfo("Map time", mapTimer); + printTimerInfo("Reduce time", reduceTimer); + printTimerInfo("Outside time", outsideSchedulerTimer); } } private void printTimerInfo(final String label, final SimpleTimer timer) { - final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() + reduceTimer.getElapsedTime(); + final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() + + reduceTimer.getElapsedTime() + outsideSchedulerTimer.getElapsedTime(); final double myTimeInSec = timer.getElapsedTime(); final double myTimePercent = myTimeInSec / total * 100; logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); @@ -168,11 +176,16 @@ public class NanoScheduler { if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); + outsideSchedulerTimer.stop(); + ReduceType result; if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { - return executeSingleThreaded(inputReader, map, initialValue, reduce); + result = executeSingleThreaded(inputReader, map, initialValue, reduce); } else { - return executeMultiThreaded(inputReader, map, initialValue, reduce); + result = executeMultiThreaded(inputReader, map, initialValue, reduce); } + + outsideSchedulerTimer.restart(); + return result; } /** From 6a5a70cdf1a80751d1fe54594c0d0d2ee6a3fa87 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 2 Sep 2012 11:37:30 -0400 Subject: [PATCH 13/45] Done GSA-539: SimpleTimer should use System.nanoTime for nanoSecond resolution --- .../sting/utils/SimpleTimer.java | 89 ++++++++++++++----- .../sting/utils/SimpleTimerUnitTest.java | 63 ++++++++++++- 2 files changed, 128 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java index 15d34a348..b3a9986c5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java +++ b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java @@ -1,18 +1,42 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.concurrent.TimeUnit; + /** - * A useful simple system for timing code. This code is not thread safe! + * A useful simple system for timing code with nano second resolution + * + * Note that this code is not thread-safe. If you have a single timer + * being started and stopped by multiple threads you will need to protect the + * calls to avoid meaningless results of having multiple starts and stops + * called sequentially. * * User: depristo * Date: Dec 10, 2010 * Time: 9:07:44 AM */ public class SimpleTimer { - final private String name; - private long elapsed = 0l; - private long startTime = 0l; - boolean running = false; + protected static final double NANO_TO_SECOND_DOUBLE = 1.0 / TimeUnit.SECONDS.toNanos(1); + private final String name; + + /** + * The elapsedTimeNano time in nanoSeconds of this timer. The elapsedTimeNano time is the + * sum of times between starts/restrats and stops. + */ + private long elapsedTimeNano = 0l; + + /** + * The start time of the last start/restart in nanoSeconds + */ + private long startTimeNano = 0l; + + /** + * Is this timer currently running (i.e., the last call was start/restart) + */ + private boolean running = false; /** * Creates an anonymous simple timer @@ -25,7 +49,8 @@ public class SimpleTimer { * Creates a simple timer named name * @param name of the timer, must not be null */ - public SimpleTimer(String name) { + public SimpleTimer(final String name) { + if ( name == null ) throw new IllegalArgumentException("SimpleTimer name cannot be null"); this.name = name; } @@ -37,27 +62,27 @@ public class SimpleTimer { } /** - * Starts the timer running, and sets the elapsed time to 0. This is equivalent to + * Starts the timer running, and sets the elapsedTimeNano time to 0. This is equivalent to * resetting the time to have no history at all. * * @return this object, for programming convenience */ + @Ensures("elapsedTimeNano == 0l") public synchronized SimpleTimer start() { - elapsed = 0l; - restart(); - return this; + elapsedTimeNano = 0l; + return restart(); } /** - * Starts the timer running, without reseting the elapsed time. This function may be + * Starts the timer running, without resetting the elapsedTimeNano time. This function may be * called without first calling start(). The only difference between start and restart - * is that start resets the elapsed time, while restart does not. + * is that start resets the elapsedTimeNano time, while restart does not. * * @return this object, for programming convenience */ public synchronized SimpleTimer restart() { running = true; - startTime = currentTime(); + startTimeNano = currentTimeNano(); return this; } @@ -71,29 +96,53 @@ public class SimpleTimer { /** * @return A convenience function to obtain the current time in milliseconds from this timer */ - public synchronized long currentTime() { + public long currentTime() { return System.currentTimeMillis(); } /** - * Stops the timer. Increases the elapsed time by difference between start and now. The - * timer must be running in order to call stop + * @return A convenience function to obtain the current time in nanoSeconds from this timer + */ + public long currentTimeNano() { + return System.nanoTime(); + } + + /** + * Stops the timer. Increases the elapsedTimeNano time by difference between start and now. + * + * It's ok to call stop on a timer that's not running. It has no effect on the timer. * * @return this object, for programming convenience */ + @Requires("startTimeNano != 0l") public synchronized SimpleTimer stop() { - running = false; - elapsed += currentTime() - startTime; + if ( running ) { + running = false; + elapsedTimeNano += currentTimeNano() - startTimeNano; + } return this; } /** - * Returns the total elapsed time of all start/stops of this timer. If the timer is currently + * Returns the total elapsedTimeNano time of all start/stops of this timer. If the timer is currently * running, includes the difference from currentTime() and the start as well * * @return this time, in seconds */ public synchronized double getElapsedTime() { - return (running ? (currentTime() - startTime + elapsed) : elapsed) / 1000.0; + return nanoToSecondsAsDouble(getElapsedTimeNano()); + } + + protected static double nanoToSecondsAsDouble(final long nano) { + return nano * NANO_TO_SECOND_DOUBLE; + } + + /** + * @see #getElapsedTime() but returns the result in nanoseconds + * + * @return the elapsed time in nanoseconds + */ + public synchronized long getElapsedTimeNano() { + return running ? (currentTimeNano() - startTimeNano + elapsedTimeNano) : elapsedTimeNano; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index 7a2696b7b..7285c00ac 100755 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -1,12 +1,12 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.io.File; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; public class SimpleTimerUnitTest extends BaseTest { private final static String NAME = "unit.test.timer"; @@ -17,33 +17,88 @@ public class SimpleTimerUnitTest extends BaseTest { Assert.assertEquals(t.getName(), NAME, "Name is not the provided one"); Assert.assertFalse(t.isRunning(), "Initial state of the timer is running"); Assert.assertEquals(t.getElapsedTime(), 0.0, "New timer elapsed time should be 0"); + Assert.assertEquals(t.getElapsedTimeNano(), 0l, "New timer elapsed time nano should be 0"); t.start(); Assert.assertTrue(t.isRunning(), "Started timer isn't running"); Assert.assertTrue(t.getElapsedTime() >= 0.0, "Elapsed time should be >= 0"); + Assert.assertTrue(t.getElapsedTimeNano() >= 0.0, "Elapsed time nano should be >= 0"); + long n1 = t.getElapsedTimeNano(); double t1 = t.getElapsedTime(); idleLoop(); // idle loop to wait a tiny bit of time + long n2 = t.getElapsedTimeNano(); double t2 = t.getElapsedTime(); Assert.assertTrue(t2 >= t1, "T2 >= T1 for a running time"); + Assert.assertTrue(n2 >= n1, "T2 >= T1 nano for a running time"); t.stop(); Assert.assertFalse(t.isRunning(), "Stopped timer still running"); + long n3 = t.getElapsedTimeNano(); double t3 = t.getElapsedTime(); idleLoop(); // idle loop to wait a tiny bit of time double t4 = t.getElapsedTime(); + long n4 = t.getElapsedTimeNano(); Assert.assertTrue(t4 == t3, "Elapsed times for two calls of stop timer not the same"); + Assert.assertTrue(n4 == n3, "Elapsed times for two calls of stop timer not the same"); t.restart(); idleLoop(); // idle loop to wait a tiny bit of time double t5 = t.getElapsedTime(); + long n5 = t.getElapsedTimeNano(); Assert.assertTrue(t.isRunning(), "Restarted timer should be running"); idleLoop(); // idle loop to wait a tiny bit of time double t6 = t.getElapsedTime(); + long n6 = t.getElapsedTimeNano(); Assert.assertTrue(t5 >= t4, "Restarted timer elapsed time should be after elapsed time preceding the restart"); Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); + Assert.assertTrue(n5 >= n4, "Restarted timer elapsed time nano should be after elapsed time preceding the restart"); + Assert.assertTrue(n6 >= n5, "Second elapsed time nano not after the first in restarted timer"); + + final List secondTimes = Arrays.asList(t1, t2, t3, t4, t5, t6); + final List nanoTimes = Arrays.asList(n1, n2, n3, n4, n5, n6); + for ( int i = 0; i < nanoTimes.size(); i++ ) + Assert.assertEquals( + SimpleTimer.nanoToSecondsAsDouble(nanoTimes.get(i)), + secondTimes.get(i), 1e-1, "Nanosecond and second timer disagree"); } - private final static void idleLoop() { + @Test + public void testNanoResolution() { + SimpleTimer t = new SimpleTimer(NAME); + + // test the nanosecond resolution + long n7 = t.currentTimeNano(); + int sum = 0; + for ( int i = 0; i < 100; i++) sum += i; + long n8 = t.currentTimeNano(); + final long delta = n8 - n7; + final long oneMilliInNano = TimeUnit.MILLISECONDS.toNanos(1); + logger.warn("nanoTime before nano operation " + n7); + logger.warn("nanoTime after nano operation of summing 100 ints " + n8 + ", sum = " + sum + " time delta " + delta + " vs. 1 millsecond in nano " + oneMilliInNano); + Assert.assertTrue(n8 > n7, "SimpleTimer doesn't appear to have nanoSecond resolution: n8 " + n8 + " <= n7 " + n7); + Assert.assertTrue(delta < oneMilliInNano, + "SimpleTimer doesn't appear to have nanoSecond resolution: time delta is " + delta + " vs 1 millisecond in nano " + oneMilliInNano); + } + + @Test + public void testMeaningfulTimes() { + SimpleTimer t = new SimpleTimer(NAME); + + t.start(); + for ( int i = 0; i < 100; i++ ) ; + long nano = t.getElapsedTimeNano(); + double secs = t.getElapsedTime(); + + Assert.assertTrue(secs > 0, "Seconds timer doesn't appear to count properly: elapsed time is " + secs); + Assert.assertTrue(secs < 0.01, "Fast operation said to take longer than 10 milliseconds: elapsed time in seconds " + secs); + + Assert.assertTrue(nano > 0, "Nanosecond timer doesn't appear to count properly: elapsed time is " + nano); + final long maxTimeInMicro = 100; + final long maxTimeInNano = TimeUnit.MICROSECONDS.toNanos(100); + Assert.assertTrue(nano < maxTimeInNano, "Fast operation said to take longer than " + maxTimeInMicro + " microseconds: elapsed time in nano " + nano + " micro " + TimeUnit.NANOSECONDS.toMicros(nano)); + } + + private static void idleLoop() { for ( int i = 0; i < 100000; i++ ) ; // idle loop to wait a tiny bit of time } } \ No newline at end of file From 1a8f5fc374994b06f16d2a6cc987a2720d42b144 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 2 Sep 2012 11:53:59 -0400 Subject: [PATCH 14/45] Trivial cleanup of NanoScheduler --- .../sting/utils/nanoScheduler/NanoScheduler.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index a6be6ad6d..1ef4d3950 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -54,10 +54,10 @@ public class NanoScheduler { boolean shutdown = false; boolean debug = false; - final SimpleTimer outsideSchedulerTimer = new SimpleTimer(); - final SimpleTimer inputTimer = new SimpleTimer(); - final SimpleTimer mapTimer = new SimpleTimer(); - final SimpleTimer reduceTimer = new SimpleTimer(); + final SimpleTimer outsideSchedulerTimer = new SimpleTimer("outside"); + final SimpleTimer inputTimer = new SimpleTimer("input"); + final SimpleTimer mapTimer = new SimpleTimer("map"); + final SimpleTimer reduceTimer = new SimpleTimer("reduce"); /** * Create a new nanoschedule with the desire characteristics requested by the argument From 9823102c0cceea72beb0db689631a1ebeade9978 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 2 Sep 2012 12:16:56 -0400 Subject: [PATCH 15/45] TraverseReadsNano supports walker.filter and walker.done -- Instead of returning directly the result of map(), returns a MapResult object with the value and a reduceMe flag. -- Reduce function respects the reduceMe flag -- Code cleanup and more documentation --- .../gatk/traversals/TraverseReadsNano.java | 120 +++++++++++++----- 1 file changed, 86 insertions(+), 34 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 2ada8bbfa..4bb700c37 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -40,27 +40,28 @@ import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.ArrayList; +import java.util.LinkedList; import java.util.List; /** - * @author aaron + * A nano-scheduling version of TraverseReads. + * + * Implements the traversal of a walker that accepts individual reads, the reference, and + * RODs per map call. Directly supports shared memory parallelism via NanoScheduler + * + * @author depristo * @version 1.0 - * @date Apr 24, 2009 - *

    - * Class TraverseReads - *

    - * This class handles traversing by reads in the new shardable style + * @date 9/2/2012 */ public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); private static final boolean DEBUG = false; - final NanoScheduler nanoScheduler; + final NanoScheduler nanoScheduler; public TraverseReadsNano(int nThreads) { final int bufferSize = ReadShard.getReadBufferSize() + 1; // actually has 1 more than max - nanoScheduler = new NanoScheduler(bufferSize, nThreads); + nanoScheduler = new NanoScheduler(bufferSize, nThreads); } @Override @@ -95,18 +96,23 @@ public class TraverseReadsNano extends TraversalEngine, final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); printProgress(dataProvider.getShard(), locus, aggregatedInputs.size()); - // TODO -- how can I get done value? - // done = walker.isDone(); - return result; } + /** + * Aggregate all of the inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + * + * @param dataProvider the source of our data + * @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce + * should execute + */ private List aggregateMapData(final ReadShardDataProvider dataProvider) { final ReadView reads = new ReadView(dataProvider); final ReadReferenceView reference = new ReadReferenceView(dataProvider); final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - final List mapData = new ArrayList(); // TODO -- need size of reads + final List mapData = new LinkedList(); for ( final SAMRecord read : reads ) { final ReferenceContext refContext = ! read.getReadUnmappedFlag() ? reference.getReferenceContext(read) @@ -132,19 +138,9 @@ public class TraverseReadsNano extends TraversalEngine, super.printOnTraversalDone(); } - private class TraverseReadsReduce implements ReduceFunction { - final ReadWalker walker; - - private TraverseReadsReduce(ReadWalker walker) { - this.walker = walker; - } - - @Override - public T apply(M one, T sum) { - return walker.reduce(one, sum); - } - } - + /** + * The input data needed for each map call. The read, the reference, and the RODs + */ private class MapData { final GATKSAMRecord read; final ReferenceContext refContext; @@ -157,7 +153,43 @@ public class TraverseReadsNano extends TraversalEngine, } } - private class TraverseReadsMap implements MapFunction { + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseReadsMap implements MapFunction { final ReadWalker walker; private TraverseReadsMap(ReadWalker walker) { @@ -165,16 +197,36 @@ public class TraverseReadsNano extends TraversalEngine, } @Override - public M apply(final MapData data) { + public MapResult apply(final MapData data) { if ( ! walker.isDone() ) { final boolean keepMeP = walker.filter(data.refContext, data.read); - if (keepMeP) { - return walker.map(data.refContext, data.read, data.tracker); - } + if (keepMeP) + return new MapResult(walker.map(data.refContext, data.read, data.tracker)); } - // TODO -- how can we cleanly support done and filtered. Need to return - // TODO -- a MapResult object that says the status - return null; + + return SKIP_REDUCE; + } + } + + /** + * ReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseReadsReduce implements ReduceFunction { + final ReadWalker walker; + + private TraverseReadsReduce(ReadWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; } } } From d7105223fe7d8bb4848dbc2cfe7ccfbb9709b4b6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 13:44:39 -0400 Subject: [PATCH 16/45] More debugging output for NanoScheduler when debugging is enabled --- .../sting/utils/nanoScheduler/NanoScheduler.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 1ef4d3950..f0e77354f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -220,12 +220,14 @@ public class NanoScheduler { try { // read in our input values final List inputs = readInputs(inputReader); + debugPrint("Enqueuing " + inputs.size() + " elements to map"); // send jobs for map final Queue> mapQueue = submitMapJobs(map, executor, inputs); // send off the reduce job, and block until we get at least one reduce result sum = reduceSerial(reduce, mapQueue, sum); + debugPrint(" Done with cycle of map/reduce"); } catch (InterruptedException ex) { throw new ReviewedStingException("got execution exception", ex); } catch (ExecutionException ex) { @@ -307,6 +309,7 @@ public class NanoScheduler { @Override public MapType call() throws Exception { if ( TIME_CALLS) mapTimer.restart(); + if ( debug ) debugPrint("\t\tmap " + input); final MapType result = map.apply(input); if ( TIME_CALLS) mapTimer.stop(); return result; From 757e6a016081205e3c78c71ed184c982d63910f6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 13:45:57 -0400 Subject: [PATCH 17/45] Making Pileup thread-safe -- Old version relied on out printstream magically sorting output, new version puts the print in reduce --- .../sting/gatk/walkers/Pileup.java | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java index 52c6e1560..607c83966 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java @@ -52,7 +52,7 @@ import java.util.List; * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class Pileup extends LocusWalker implements TreeReducible { +public class Pileup extends LocusWalker implements TreeReducible { private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names @@ -70,27 +70,32 @@ public class Pileup extends LocusWalker implements TreeReducib @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) public List> rods = Collections.emptyList(); - public void initialize() { - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - String rods = getReferenceOrderedData( tracker ); + @Override + public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final String rods = getReferenceOrderedData( tracker ); ReadBackedPileup basePileup = context.getBasePileup(); - out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods); - if ( SHOW_VERBOSE ) - out.printf(" %s", createVerboseOutput(basePileup)); - out.println(); - return 1; + final StringBuilder s = new StringBuilder(); + s.append(String.format("%s %s", basePileup.getPileupString((char)ref.getBase()), rods)); + if ( SHOW_VERBOSE ) + s.append(" ").append(createVerboseOutput(basePileup)); + s.append("\n"); + + return s.toString(); } // Given result of map function + @Override public Integer reduceInit() { return 0; } - public Integer reduce(Integer value, Integer sum) { - return treeReduce(sum,value); + + @Override + public Integer reduce(String value, Integer sum) { + out.print(value); + return sum + 1; } + + @Override public Integer treeReduce(Integer lhs, Integer rhs) { return lhs + rhs; } From d503ed97abd7a4990e3412aa8a934ff0761e847b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 13:47:40 -0400 Subject: [PATCH 18/45] Mark I NanoScheduling TraverseLoci -- Refactored TraverseLoci into old linear version and nano scheduling version -- Temp. GATK argument to say how many nano threads to use -- Can efficiently scale to 3 threads before blocking on input --- .../sting/gatk/ReadMetrics.java | 10 +- .../arguments/GATKArgumentCollection.java | 4 + .../sting/gatk/executive/MicroScheduler.java | 3 +- ...raverseLoci.java => TraverseLociBase.java} | 70 +++--- .../gatk/traversals/TraverseLociLinear.java | 48 +++++ .../gatk/traversals/TraverseLociNano.java | 200 ++++++++++++++++++ 6 files changed, 293 insertions(+), 42 deletions(-) rename public/java/src/org/broadinstitute/sting/gatk/traversals/{TraverseLoci.java => TraverseLociBase.java} (57%) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java index ceaa30f01..bfea0b1e1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk; import net.sf.picard.filter.SamRecordFilter; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; @@ -119,11 +118,18 @@ public class ReadMetrics implements Cloneable { return nRecords; } + /** + * Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed. + */ + public void incrementNumIterations(final long by) { + nRecords += by; + } + /** * Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed. */ public void incrementNumIterations() { - nRecords++; + incrementNumIterations(1); } public long getNumReadsSeen() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 6be66b204..33400bd9e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -313,6 +313,10 @@ public class GATKArgumentCollection { @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false) public Integer numberOfBAMFileHandles = null; + @Argument(fullName="nanoThreads", shortName = "nanoThreads", doc="NanoThreading", required = false) + @Hidden + public int nanoThreads = 1; + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) public List readGroupBlackList = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 417a0982f..073a46ee3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -146,7 +146,8 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { if (walker instanceof ReadWalker) { traversalEngine = numThreads > 1 ? new TraverseReadsNano(numThreads) : new TraverseReads(); } else if (walker instanceof LocusWalker) { - traversalEngine = new TraverseLoci(); + // TODO -- refactor to use better interface + traversalEngine = engine.getArguments().nanoThreads > 1 ? new TraverseLociNano(engine.getArguments().nanoThreads) : new TraverseLociLinear(); } else if (walker instanceof DuplicateWalker) { traversalEngine = new TraverseDuplicates(); } else if (walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java similarity index 57% rename from public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java rename to public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java index a5a6919a2..19d95381e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociBase.java @@ -3,9 +3,7 @@ package org.broadinstitute.sting.gatk.traversals; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -15,28 +13,42 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; /** * A simple solution to iterating over all reference positions over a series of genomic locations. */ -public class TraverseLoci extends TraversalEngine,LocusShardDataProvider> { +public abstract class TraverseLociBase extends TraversalEngine,LocusShardDataProvider> { /** * our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraversalEngine.class); @Override - protected String getTraversalType() { + protected final String getTraversalType() { return "sites"; } + protected static class TraverseResults { + final int numIterations; + final T reduceResult; + + public TraverseResults(int numIterations, T reduceResult) { + this.numIterations = numIterations; + this.reduceResult = reduceResult; + } + } + + protected abstract TraverseResults traverse( final LocusWalker walker, + final LocusView locusView, + final LocusReferenceView referenceView, + final ReferenceOrderedView referenceOrderedDataView, + final T sum); + @Override public T traverse( LocusWalker walker, LocusShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); + logger.debug(String.format("TraverseLociBase.traverse: Shard is %s", dataProvider)); - LocusView locusView = getLocusView( walker, dataProvider ); - boolean done = false; + final LocusView locusView = getLocusView( walker, dataProvider ); if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); ReferenceOrderedView referenceOrderedDataView = null; if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) @@ -44,43 +56,23 @@ public class TraverseLoci extends TraversalEngine,Locu else referenceOrderedDataView = (RodLocusView)locusView; - LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - // We keep processing while the next reference location is within the interval - while( locusView.hasNext() && ! done ) { - AlignmentContext locus = locusView.next(); - GenomeLoc location = locus.getLocation(); - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - final boolean keepMeP = walker.filter(tracker, refContext, locus); - if (keepMeP) { - M x = walker.map(tracker, refContext, locus); - sum = walker.reduce(x, sum); - done = walker.isDone(); - } - - printProgress(dataProvider.getShard(),locus.getLocation()); - } + final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); + sum = result.reduceResult; + dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); } // We have a final map call to execute here to clean up the skipped based from the // last position in the ROD to that in the interval if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { // only do this if the walker isn't done! - RodLocusView rodLocusView = (RodLocusView)locusView; - long nSkipped = rodLocusView.getLastSkippedBases(); + final RodLocusView rodLocusView = (RodLocusView)locusView; + final long nSkipped = rodLocusView.getLastSkippedBases(); if ( nSkipped > 0 ) { - GenomeLoc site = rodLocusView.getLocOneBeyondShard(); - AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); - M x = walker.map(null, null, ac); + final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); + final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); + final M x = walker.map(null, null, ac); sum = walker.reduce(x, sum); } } @@ -90,14 +82,14 @@ public class TraverseLoci extends TraversalEngine,Locu /** * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' - * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype + * of sorts, providing a consistent interface so that TraverseLociBase doesn't need to be reimplemented for any new datatype * that comes along. * @param walker walker to interrogate. * @param dataProvider Data which which to drive the locus view. * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. */ private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { - DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); if( dataSource == DataSource.READS ) return new CoveredLocusView(dataProvider); else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java new file mode 100755 index 000000000..1dec3b238 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java @@ -0,0 +1,48 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.LocusView; +import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.GenomeLoc; + +/** + * A simple solution to iterating over all reference positions over a series of genomic locations. + */ +public class TraverseLociLinear extends TraverseLociBase { + + @Override + protected TraverseResults traverse(LocusWalker walker, LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView, T sum) { + // We keep processing while the next reference location is within the interval + boolean done = false; + int numIterations = 0; + + while( locusView.hasNext() && ! done ) { + numIterations++; + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + final boolean keepMeP = walker.filter(tracker, refContext, locus); + if (keepMeP) { + final M x = walker.map(tracker, refContext, locus); + sum = walker.reduce(x, sum); + done = walker.isDone(); + } + + // TODO -- refactor printProgress to separate updating read metrics from printing progress + //printProgress(dataProvider.getShard(),locus.getLocation()); + } + + return new TraverseResults(numIterations, sum); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java new file mode 100755 index 000000000..4e6eb1915 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -0,0 +1,200 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.LocusView; +import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; + +import java.util.Iterator; + +/** + * A simple solution to iterating over all reference positions over a series of genomic locations. + */ +public class TraverseLociNano extends TraverseLociBase { + /** our log, which we want to capture anything from this class */ + private static final boolean DEBUG = false; + private static final int BUFFER_SIZE = 1000; + + final NanoScheduler nanoScheduler; + + public TraverseLociNano(int nThreads) { + nanoScheduler = new NanoScheduler(BUFFER_SIZE, nThreads); + } + + @Override + protected TraverseResults traverse(final LocusWalker walker, + final LocusView locusView, + final LocusReferenceView referenceView, + final ReferenceOrderedView referenceOrderedDataView, + final T sum) { + nanoScheduler.setDebug(DEBUG); + final TraverseLociMap myMap = new TraverseLociMap(walker); + final TraverseLociReduce myReduce = new TraverseLociReduce(walker); + + final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); + final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); + + // todo -- how do I print progress? +// final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; +// final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); +// printProgress(dataProvider.getShard(), locus, aggregatedInputs.size()); + + return new TraverseResults(inputIterator.numIterations, result); + } + + /** + * Create iterator that provides inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + */ + private class MapDataIterator implements Iterator { + final LocusView locusView; + final LocusReferenceView referenceView; + final ReferenceOrderedView referenceOrderedDataView; + int numIterations = 0; + + private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) { + this.locusView = locusView; + this.referenceView = referenceView; + this.referenceOrderedDataView = referenceOrderedDataView; + } + + @Override + public boolean hasNext() { + return locusView.hasNext(); + } + + @Override + public MapData next() { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + //logger.info("Pulling data from MapDataIterator at " + location); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location, refContext); + + numIterations++; + return new MapData(locus, refContext, tracker); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator"); + } + } + + @Override + public void printOnTraversalDone() { + nanoScheduler.shutdown(); + super.printOnTraversalDone(); + } + + /** + * The input data needed for each map call. The read, the reference, and the RODs + */ + private class MapData { + final AlignmentContext alignmentContext; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.alignmentContext = alignmentContext; + this.refContext = refContext; + this.tracker = tracker; + } + + @Override + public String toString() { + return "MapData " + alignmentContext.getLocation(); + } + } + + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseLociMap implements MapFunction { + final LocusWalker walker; + + private TraverseLociMap(LocusWalker walker) { + this.walker = walker; + } + + @Override + public MapResult apply(final MapData data) { + if ( ! walker.isDone() ) { + final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext); + if (keepMeP) { + final M x = walker.map(data.tracker, data.refContext, data.alignmentContext); + return new MapResult(x); + } + } + return SKIP_REDUCE; + } + } + + /** + * ReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseLociReduce implements ReduceFunction { + final LocusWalker walker; + + private TraverseLociReduce(LocusWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; + } + } +} From 8cdeb51b78696340d9303d44342095bb82a40671 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 14:50:06 -0400 Subject: [PATCH 19/45] Cleanup printProgress in TraversalEngine -- Separate updating cumulative traversal metrics from printing progress. There's now an updateCumulativeMetrics function and a printProgress() that only takes a current position -- printProgress now soles relies on the time since the last progress to decide if it will print or not. No longer uses the number of cycles, since this isn't reliable in the case of nano scheduling -- GenomeAnalysisEngine now maintains a pointer to the master cumulative metrics. getCumulativeMetrics never returns null, which was handled in some parts of the code but not others. -- Update all of the traversals to use the new updateCumulativeMetrics, printProgress model -- Added progress callback to nano scheduler. Every bufferSize elements this callback is invoked, allowing us to smoothly update the progress meter in the NanoScheduler -- Rename MapFunction to NanoSchedulerMap and the same for reduce. --- .../sting/gatk/GenomeAnalysisEngine.java | 7 +- .../gatk/traversals/TraversalEngine.java | 152 ++++++++---------- .../traversals/TraverseActiveRegions.java | 3 +- .../gatk/traversals/TraverseDuplicates.java | 3 +- .../gatk/traversals/TraverseLociBase.java | 1 + .../gatk/traversals/TraverseLociLinear.java | 3 +- .../gatk/traversals/TraverseLociNano.java | 25 +-- .../gatk/traversals/TraverseReadPairs.java | 3 +- .../sting/gatk/traversals/TraverseReads.java | 7 +- .../gatk/traversals/TraverseReadsNano.java | 14 +- .../utils/nanoScheduler/NanoScheduler.java | 35 ++-- ...ion.java => NanoSchedulerMapFunction.java} | 2 +- .../NanoSchedulerProgressFunction.java | 12 ++ ....java => NanoSchedulerReduceFunction.java} | 2 +- .../nanoScheduler/NanoSchedulerUnitTest.java | 4 +- 15 files changed, 153 insertions(+), 120 deletions(-) rename public/java/src/org/broadinstitute/sting/utils/nanoScheduler/{MapFunction.java => NanoSchedulerMapFunction.java} (84%) create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java rename public/java/src/org/broadinstitute/sting/utils/nanoScheduler/{ReduceFunction.java => NanoSchedulerReduceFunction.java} (87%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index b9b5e452d..1b4333ce2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -143,6 +143,8 @@ public class GenomeAnalysisEngine { */ private ThreadAllocation threadAllocation; + private ReadMetrics cumulativeMetrics = null; + /** * A currently hacky unique name for this GATK instance */ @@ -1035,7 +1037,10 @@ public class GenomeAnalysisEngine { * owned by the caller; the caller can do with the object what they wish. */ public ReadMetrics getCumulativeMetrics() { - return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics(); + // todo -- probably shouldn't be lazy + if ( cumulativeMetrics == null ) + cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics(); + return cumulativeMetrics; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 198f9342e..4422d49ae 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -44,24 +44,12 @@ import java.util.List; import java.util.Map; public abstract class TraversalEngine,ProviderType extends ShardDataProvider> { + /** our log, which we want to capture anything from this class */ + protected static final Logger logger = Logger.getLogger(TraversalEngine.class); + // Time in milliseconds since we initialized this engine private static final int HISTORY_WINDOW_SIZE = 50; - private static class ProcessingHistory { - double elapsedSeconds; - long unitsProcessed; - long bpProcessed; - GenomeLoc loc; - - public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) { - this.elapsedSeconds = elapsedSeconds; - this.loc = loc; - this.unitsProcessed = unitsProcessed; - this.bpProcessed = bpProcessed; - } - - } - /** lock object to sure updates to history are consistent across threads */ private static final Object lock = new Object(); LinkedList history = new LinkedList(); @@ -70,13 +58,12 @@ public abstract class TraversalEngine,Provide private SimpleTimer timer = null; // How long can we go without printing some progress info? - private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; - private int printProgressCheckCounter = 0; private long lastProgressPrintTime = -1; // When was the last time we printed progress log? - private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds - private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds - private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; - private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; + + private final static long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds + private final static double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; + private final static double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; + private long progressPrintFrequency = 10 * 1000; // in milliseconds private boolean progressMeterInitialized = false; // for performance log @@ -85,15 +72,12 @@ public abstract class TraversalEngine,Provide private File performanceLogFile; private PrintStream performanceLog = null; private long lastPerformanceLogPrintTime = -1; // When was the last time we printed to the performance log? - private final long PERFORMANCE_LOG_PRINT_FREQUENCY = PROGRESS_PRINT_FREQUENCY; // in milliseconds + private final long PERFORMANCE_LOG_PRINT_FREQUENCY = progressPrintFrequency; // in milliseconds /** Size, in bp, of the area we are processing. Updated once in the system in initial for performance reasons */ long targetSize = -1; GenomeLocSortedSet targetIntervals = null; - /** our log, which we want to capture anything from this class */ - protected static final Logger logger = Logger.getLogger(TraversalEngine.class); - protected GenomeAnalysisEngine engine; // ---------------------------------------------------------------------------------------------------- @@ -187,28 +171,34 @@ public abstract class TraversalEngine,Provide } /** - * Forward request to printProgress + * Update the cumulative traversal metrics according to the data in this shard * - * Assumes that one cycle has been completed - * - * @param shard the given shard currently being processed. - * @param loc the location + * @param shard a non-null shard */ - public void printProgress(Shard shard, GenomeLoc loc) { - // A bypass is inserted here for unit testing. - printProgress(loc,shard.getReadMetrics(),false, 1); + public void updateCumulativeMetrics(final Shard shard) { + updateCumulativeMetrics(shard.getReadMetrics()); + } + + /** + * Update the cumulative traversal metrics according to the data in this shard + * + * @param singleTraverseMetrics read metrics object containing the information about a single shard's worth + * of data processing + */ + public void updateCumulativeMetrics(final ReadMetrics singleTraverseMetrics) { + engine.getCumulativeMetrics().incrementMetrics(singleTraverseMetrics); } /** * Forward request to printProgress * - * @param shard the given shard currently being processed. + * Assumes that one cycle has been completed + * * @param loc the location - * @param nElapsedCycles the number of cycles (turns of map) that have occurred since the last call */ - public void printProgress(Shard shard, GenomeLoc loc, int nElapsedCycles) { + public void printProgress(final GenomeLoc loc) { // A bypass is inserted here for unit testing. - printProgress(loc,shard.getReadMetrics(),false, nElapsedCycles); + printProgress(loc, false); } /** @@ -216,18 +206,9 @@ public abstract class TraversalEngine,Provide * every M seconds, for N and M set in global variables. * * @param loc Current location, can be null if you are at the end of the traversal - * @param metrics Data processed since the last cumulative * @param mustPrint If true, will print out info, regardless of nRecords or time interval */ - private synchronized void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint, int nElapsedCycles) { - final int previousPrintCycle = printProgressCheckCounter / PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES; - final int newPrintCycle = (printProgressCheckCounter+nElapsedCycles) / PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES; - - printProgressCheckCounter += nElapsedCycles; // keep track of our number of cycles through printProgress - if ( newPrintCycle == previousPrintCycle && ! mustPrint ) - // don't do any work more often than PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES - return; - + private synchronized void printProgress(final GenomeLoc loc, boolean mustPrint) { if( ! progressMeterInitialized ) { logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]"); logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", @@ -236,37 +217,30 @@ public abstract class TraversalEngine,Provide } final long curTime = timer.currentTime(); - boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, PROGRESS_PRINT_FREQUENCY); + boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency); boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); if ( printProgress || printLog ) { - // getting and appending metrics data actually turns out to be quite a heavyweight - // operation. Postpone it until after determining whether to print the log message. - ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics() != null ? engine.getCumulativeMetrics() : new ReadMetrics(); - if(metrics != null) - cumulativeMetrics.incrementMetrics(metrics); - - final long nRecords = cumulativeMetrics.getNumIterations(); - - ProcessingHistory last = updateHistory(loc,cumulativeMetrics); + final ProcessingHistory last = updateHistory(loc, engine.getCumulativeMetrics()); final AutoFormattingTime elapsed = new AutoFormattingTime(last.elapsedSeconds); - final AutoFormattingTime bpRate = new AutoFormattingTime(secondsPerMillionBP(last)); - final AutoFormattingTime unitRate = new AutoFormattingTime(secondsPerMillionElements(last)); - final double fractionGenomeTargetCompleted = calculateFractionGenomeTargetCompleted(last); + final AutoFormattingTime bpRate = new AutoFormattingTime(last.secondsPerMillionBP()); + final AutoFormattingTime unitRate = new AutoFormattingTime(last.secondsPerMillionElements()); + final double fractionGenomeTargetCompleted = last.calculateFractionGenomeTargetCompleted(targetSize); final AutoFormattingTime estTotalRuntime = new AutoFormattingTime(elapsed.getTimeInSeconds() / fractionGenomeTargetCompleted); final AutoFormattingTime timeToCompletion = new AutoFormattingTime(estTotalRuntime.getTimeInSeconds() - elapsed.getTimeInSeconds()); + final long nRecords = engine.getCumulativeMetrics().getNumIterations(); if ( printProgress ) { lastProgressPrintTime = curTime; // dynamically change the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates if ( estTotalRuntime.getTimeInSeconds() > TWELVE_HOURS_IN_SECONDS ) - PROGRESS_PRINT_FREQUENCY = 60 * 1000; // in milliseconds + progressPrintFrequency = 60 * 1000; // in milliseconds else if ( estTotalRuntime.getTimeInSeconds() > TWO_HOURS_IN_SECONDS ) - PROGRESS_PRINT_FREQUENCY = 30 * 1000; // in milliseconds + progressPrintFrequency = 30 * 1000; // in milliseconds else - PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds + progressPrintFrequency = 10 * 1000; // in milliseconds final String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : Integer.toString(loc.getStart()); logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", @@ -296,7 +270,7 @@ public abstract class TraversalEngine,Provide * @param metrics information about what's been processed already * @return */ - private final ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) { + private ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) { synchronized (lock) { if ( history.size() > HISTORY_WINDOW_SIZE ) history.pop(); @@ -309,26 +283,11 @@ public abstract class TraversalEngine,Provide } } - /** How long in seconds to process 1M traversal units? */ - private final double secondsPerMillionElements(ProcessingHistory last) { - return (last.elapsedSeconds * 1000000.0) / Math.max(last.unitsProcessed, 1); - } - - /** How long in seconds to process 1M bp on the genome? */ - private final double secondsPerMillionBP(ProcessingHistory last) { - return (last.elapsedSeconds * 1000000.0) / Math.max(last.bpProcessed, 1); - } - - /** What fractoin of the target intervals have we covered? */ - private final double calculateFractionGenomeTargetCompleted(ProcessingHistory last) { - return (1.0*last.bpProcessed) / targetSize; - } - /** * Called after a traversal to print out information about the traversal process */ public void printOnTraversalDone() { - printProgress(null, null, true, 1); + printProgress(null, true); final double elapsed = timer == null ? 0 : timer.getElapsedTime(); @@ -389,7 +348,7 @@ public abstract class TraversalEngine,Provide * @return Frequency, in seconds, of performance log writes. */ public long getPerformanceProgressPrintFrequencySeconds() { - return PROGRESS_PRINT_FREQUENCY; + return progressPrintFrequency; } /** @@ -397,6 +356,35 @@ public abstract class TraversalEngine,Provide * @param seconds number of seconds between messages indicating performance frequency. */ public void setPerformanceProgressPrintFrequencySeconds(long seconds) { - PROGRESS_PRINT_FREQUENCY = seconds; + progressPrintFrequency = seconds; + } + + private static class ProcessingHistory { + double elapsedSeconds; + long unitsProcessed; + long bpProcessed; + GenomeLoc loc; + + public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) { + this.elapsedSeconds = elapsedSeconds; + this.loc = loc; + this.unitsProcessed = unitsProcessed; + this.bpProcessed = bpProcessed; + } + + /** How long in seconds to process 1M traversal units? */ + private double secondsPerMillionElements() { + return (elapsedSeconds * 1000000.0) / Math.max(unitsProcessed, 1); + } + + /** How long in seconds to process 1M bp on the genome? */ + private double secondsPerMillionBP() { + return (elapsedSeconds * 1000000.0) / Math.max(bpProcessed, 1); + } + + /** What fractoin of the target intervals have we covered? */ + private double calculateFractionGenomeTargetCompleted(final long targetSize) { + return (1.0*bpProcessed) / targetSize; + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index ecaa15fe9..bbd9346b3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -104,7 +104,8 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); sum = result.reduceResult; dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); + updateCumulativeMetrics(dataProvider.getShard()); } // We have a final map call to execute here to clean up the skipped based from the diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java index 1dec3b238..22381092f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociLinear.java @@ -39,8 +39,7 @@ public class TraverseLociLinear extends TraverseLociBase { done = walker.isDone(); } - // TODO -- refactor printProgress to separate updating read metrics from printing progress - //printProgress(dataProvider.getShard(),locus.getLocation()); + printProgress(locus.getLocation()); } return new TraverseResults(numIterations, sum); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java index 4e6eb1915..73b73c002 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -8,9 +8,10 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerProgressFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerReduceFunction; import java.util.Iterator; @@ -26,6 +27,7 @@ public class TraverseLociNano extends TraverseLociBase { public TraverseLociNano(int nThreads) { nanoScheduler = new NanoScheduler(BUFFER_SIZE, nThreads); + nanoScheduler.setProgressFunction(new TraverseLociProgress()); } @Override @@ -41,11 +43,6 @@ public class TraverseLociNano extends TraverseLociBase { final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); - // todo -- how do I print progress? -// final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; -// final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); -// printProgress(dataProvider.getShard(), locus, aggregatedInputs.size()); - return new TraverseResults(inputIterator.numIterations, result); } @@ -156,7 +153,7 @@ public class TraverseLociNano extends TraverseLociBase { * * Applies walker.map to MapData, returning a MapResult object containing the result */ - private class TraverseLociMap implements MapFunction { + private class TraverseLociMap implements NanoSchedulerMapFunction { final LocusWalker walker; private TraverseLociMap(LocusWalker walker) { @@ -177,11 +174,11 @@ public class TraverseLociNano extends TraverseLociBase { } /** - * ReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * NanoSchedulerReduceFunction for TraverseReads meeting NanoScheduler interface requirements * * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable */ - private class TraverseLociReduce implements ReduceFunction { + private class TraverseLociReduce implements NanoSchedulerReduceFunction { final LocusWalker walker; private TraverseLociReduce(LocusWalker walker) { @@ -197,4 +194,12 @@ public class TraverseLociNano extends TraverseLociBase { return sum; } } + + private class TraverseLociProgress implements NanoSchedulerProgressFunction { + @Override + public void progress(MapData lastProcessedMap) { + if (lastProcessedMap.alignmentContext != null) + printProgress(lastProcessedMap.alignmentContext.getLocation()); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java index ebaac40af..9b076fce4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java @@ -65,7 +65,8 @@ public class TraverseReadPairs extends TraversalEngine extends TraversalEngine,Read sum = walker.reduce(x, sum); } - GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart()); - printProgress(dataProvider.getShard(),locus); + final GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart()); + + updateCumulativeMetrics(dataProvider.getShard()); + printProgress(locus); + done = walker.isDone(); } return sum; diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 4bb700c37..5679747e1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -35,9 +35,9 @@ import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.nanoScheduler.MapFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.sting.utils.nanoScheduler.ReduceFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerReduceFunction; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.LinkedList; @@ -94,7 +94,9 @@ public class TraverseReadsNano extends TraversalEngine, final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); - printProgress(dataProvider.getShard(), locus, aggregatedInputs.size()); + + updateCumulativeMetrics(dataProvider.getShard()); + printProgress(locus); return result; } @@ -189,7 +191,7 @@ public class TraverseReadsNano extends TraversalEngine, * * Applies walker.map to MapData, returning a MapResult object containing the result */ - private class TraverseReadsMap implements MapFunction { + private class TraverseReadsMap implements NanoSchedulerMapFunction { final ReadWalker walker; private TraverseReadsMap(ReadWalker walker) { @@ -209,11 +211,11 @@ public class TraverseReadsNano extends TraversalEngine, } /** - * ReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * NanoSchedulerReduceFunction for TraverseReads meeting NanoScheduler interface requirements * * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable */ - private class TraverseReadsReduce implements ReduceFunction { + private class TraverseReadsReduce implements NanoSchedulerReduceFunction { final ReadWalker walker; private TraverseReadsReduce(ReadWalker walker) { diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index f0e77354f..f0c2a6723 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -54,6 +54,8 @@ public class NanoScheduler { boolean shutdown = false; boolean debug = false; + private NanoSchedulerProgressFunction progressFunction = null; + final SimpleTimer outsideSchedulerTimer = new SimpleTimer("outside"); final SimpleTimer inputTimer = new SimpleTimer("input"); final SimpleTimer mapTimer = new SimpleTimer("map"); @@ -148,6 +150,17 @@ public class NanoScheduler { this.debug = debug; } + /** + * Set the progress callback function to progressFunction + * + * The progress callback is invoked after each buffer size elements have been processed by map/reduce + * + * @param progressFunction a progress function to call, or null if you don't want any progress callback + */ + public void setProgressFunction(final NanoSchedulerProgressFunction progressFunction) { + this.progressFunction = progressFunction; + } + /** * Execute a map/reduce job with this nanoScheduler * @@ -168,9 +181,9 @@ public class NanoScheduler { * @return the last reduce value */ public ReduceType execute(final Iterator inputReader, - final MapFunction map, + final NanoSchedulerMapFunction map, final ReduceType initialValue, - final ReduceFunction reduce) { + final NanoSchedulerReduceFunction reduce) { if ( isShutdown() ) throw new IllegalStateException("execute called on already shutdown NanoScheduler"); if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); @@ -193,9 +206,9 @@ public class NanoScheduler { * @return the reduce result of this map/reduce job */ private ReduceType executeSingleThreaded(final Iterator inputReader, - final MapFunction map, + final NanoSchedulerMapFunction map, final ReduceType initialValue, - final ReduceFunction reduce) { + final NanoSchedulerReduceFunction reduce) { ReduceType sum = initialValue; while ( inputReader.hasNext() ) { final InputType input = inputReader.next(); @@ -211,9 +224,9 @@ public class NanoScheduler { * @return the reduce result of this map/reduce job */ private ReduceType executeMultiThreaded(final Iterator inputReader, - final MapFunction map, + final NanoSchedulerMapFunction map, final ReduceType initialValue, - final ReduceFunction reduce) { + final NanoSchedulerReduceFunction reduce) { debugPrint("Executing nanoScheduler"); ReduceType sum = initialValue; while ( inputReader.hasNext() ) { @@ -228,6 +241,8 @@ public class NanoScheduler { // send off the reduce job, and block until we get at least one reduce result sum = reduceSerial(reduce, mapQueue, sum); debugPrint(" Done with cycle of map/reduce"); + + if ( progressFunction != null ) progressFunction.progress(inputs.get(inputs.size()-1)); } catch (InterruptedException ex) { throw new ReviewedStingException("got execution exception", ex); } catch (ExecutionException ex) { @@ -239,7 +254,7 @@ public class NanoScheduler { } @Requires({"reduce != null", "! mapQueue.isEmpty()"}) - private ReduceType reduceSerial(final ReduceFunction reduce, + private ReduceType reduceSerial(final NanoSchedulerReduceFunction reduce, final Queue> mapQueue, final ReduceType initSum) throws InterruptedException, ExecutionException { @@ -280,7 +295,7 @@ public class NanoScheduler { } @Requires({"map != null", "! inputs.isEmpty()"}) - private Queue> submitMapJobs(final MapFunction map, + private Queue> submitMapJobs(final NanoSchedulerMapFunction map, final ExecutorService executor, final List inputs) { final Queue> mapQueue = new LinkedList>(); @@ -299,10 +314,10 @@ public class NanoScheduler { */ private class CallableMap implements Callable { final InputType input; - final MapFunction map; + final NanoSchedulerMapFunction map; @Requires({"map != null"}) - private CallableMap(final MapFunction map, final InputType inputs) { + private CallableMap(final NanoSchedulerMapFunction map, final InputType inputs) { this.input = inputs; this.map = map; } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java similarity index 84% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java index 440c263b7..ddf4421d2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java @@ -9,7 +9,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Date: 8/24/12 * Time: 9:49 AM */ -public interface MapFunction { +public interface NanoSchedulerMapFunction { /** * Return function on input, returning a value of ResultType * @param input diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java new file mode 100644 index 000000000..8631196a3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java @@ -0,0 +1,12 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 9/4/12 + * Time: 2:10 PM + * To change this template use File | Settings | File Templates. + */ +public interface NanoSchedulerProgressFunction { + public void progress(final InputType lastMapInput); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java similarity index 87% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java index 8f1b0eddd..7e58eeaf9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReduceFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java @@ -7,7 +7,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Date: 8/24/12 * Time: 9:49 AM */ -public interface ReduceFunction { +public interface NanoSchedulerReduceFunction { /** * Combine one with sum into a new ReduceType * @param one the result of a map call on an input element diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 1dcc243f2..0ec3035e2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -21,11 +21,11 @@ import java.util.List; public class NanoSchedulerUnitTest extends BaseTest { public static final int NANO_SCHEDULE_MAX_RUNTIME = 60000; - private static class Map2x implements MapFunction { + private static class Map2x implements NanoSchedulerMapFunction { @Override public Integer apply(Integer input) { return input * 2; } } - private static class ReduceSum implements ReduceFunction { + private static class ReduceSum implements NanoSchedulerReduceFunction { int prevOne = Integer.MIN_VALUE; @Override public Integer apply(Integer one, Integer sum) { From 03dd470ec152c1bf7682ce3afde2141b151acf13 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 14:58:12 -0400 Subject: [PATCH 20/45] Test for progressFunction in NanoScheduler; bugfix for single threaded fast path --- .../utils/nanoScheduler/NanoScheduler.java | 3 +++ .../nanoScheduler/NanoSchedulerUnitTest.java | 22 ++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index f0c2a6723..61d4fdd01 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -210,9 +210,12 @@ public class NanoScheduler { final ReduceType initialValue, final NanoSchedulerReduceFunction reduce) { ReduceType sum = initialValue; + int i = 0; while ( inputReader.hasNext() ) { final InputType input = inputReader.next(); final MapType mapValue = map.apply(input); + if ( i++ % bufferSize == 0 && progressFunction != null ) + progressFunction.progress(input); sum = reduce.apply(mapValue, sum); } return sum; diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 0ec3035e2..3bd006ffe 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -34,6 +34,16 @@ public class NanoSchedulerUnitTest extends BaseTest { } } + private static class ProgressCallback implements NanoSchedulerProgressFunction { + int callBacks = 0; + + @Override + public void progress(Integer lastMapInput) { + callBacks++; + } + } + + private static int sum2x(final int start, final int end) { int sum = 0; for ( int i = start; i < end; i++ ) @@ -62,6 +72,11 @@ public class NanoSchedulerUnitTest extends BaseTest { return ints.iterator(); } + public int nExpectedCallbacks() { + int nElements = Math.max(end - start, 0); + return nElements / bufferSize; + } + public Map2x makeMap() { return new Map2x(); } public Integer initReduce() { return 0; } public ReduceSum makeReduce() { return new ReduceSum(); } @@ -73,7 +88,7 @@ public class NanoSchedulerUnitTest extends BaseTest { for ( final int bufferSize : Arrays.asList(1, 10, 1000, 1000000) ) { for ( final int nt : Arrays.asList(1, 2, 4) ) { for ( final int start : Arrays.asList(0) ) { - for ( final int end : Arrays.asList(1, 2, 11, 10000, 100000) ) { + for ( final int end : Arrays.asList(0, 1, 2, 11, 10000, 100000) ) { exampleTest = new NanoSchedulerBasicTest(bufferSize, nt, start, end); } } @@ -101,12 +116,17 @@ public class NanoSchedulerUnitTest extends BaseTest { final NanoScheduler nanoScheduler = new NanoScheduler(test.bufferSize, test.nThreads); + final ProgressCallback callback = new ProgressCallback(); + nanoScheduler.setProgressFunction(callback); + Assert.assertEquals(nanoScheduler.getBufferSize(), test.bufferSize, "bufferSize argument"); Assert.assertEquals(nanoScheduler.getnThreads(), test.nThreads, "nThreads argument"); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); Assert.assertNotNull(sum); Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); + + Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected"); nanoScheduler.shutdown(); } From a997c99806b49c1ca0efdd4cc9c834df465e7b22 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 17:54:43 -0400 Subject: [PATCH 21/45] Initial NanoScheduler with input producer thread --- .../utils/nanoScheduler/NanoScheduler.java | 109 ++++++++++++++---- .../nanoScheduler/NanoSchedulerUnitTest.java | 3 +- 2 files changed, 86 insertions(+), 26 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 61d4fdd01..4f9fedce3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -5,6 +5,7 @@ import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.AutoFormattingTime; import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Iterator; @@ -74,7 +75,7 @@ public class NanoScheduler { this.bufferSize = bufferSize; this.nThreads = nThreads; - this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads); + this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads-1); // start timing the time spent outside of the nanoScheduler outsideSchedulerTimer.start(); @@ -232,20 +233,31 @@ public class NanoScheduler { final NanoSchedulerReduceFunction reduce) { debugPrint("Executing nanoScheduler"); ReduceType sum = initialValue; - while ( inputReader.hasNext() ) { + boolean done = false; + + final BlockingQueue inputQueue = new LinkedBlockingDeque(bufferSize); + final InputProducer inputProducer = new InputProducer(inputReader, inputQueue); + inputProducer.start(); + + while ( ! done ) { try { - // read in our input values - final List inputs = readInputs(inputReader); - debugPrint("Enqueuing " + inputs.size() + " elements to map"); + final Pair, Boolean> readResults = readInputs(inputQueue); + final List inputs = readResults.getFirst(); + done = readResults.getSecond(); - // send jobs for map - final Queue> mapQueue = submitMapJobs(map, executor, inputs); + if ( ! inputs.isEmpty() ) { + // send jobs for map + final Queue> mapQueue = submitMapJobs(map, executor, inputs); - // send off the reduce job, and block until we get at least one reduce result - sum = reduceSerial(reduce, mapQueue, sum); - debugPrint(" Done with cycle of map/reduce"); + // send off the reduce job, and block until we get at least one reduce result + sum = reduceSerial(reduce, mapQueue, sum); + debugPrint(" Done with cycle of map/reduce"); - if ( progressFunction != null ) progressFunction.progress(inputs.get(inputs.size()-1)); + if ( progressFunction != null ) progressFunction.progress(inputs.get(inputs.size()-1)); + } else { + // we must be done + if ( ! done ) throw new IllegalStateException("Inputs empty but not done"); + } } catch (InterruptedException ex) { throw new ReviewedStingException("got execution exception", ex); } catch (ExecutionException ex) { @@ -267,9 +279,9 @@ public class NanoScheduler { for ( final Future future : mapQueue ) { final MapType value = future.get(); // block until we get the values for this task - if ( TIME_CALLS) reduceTimer.restart(); + if ( TIME_CALLS ) reduceTimer.restart(); sum = reduce.apply(value, sum); - if ( TIME_CALLS) reduceTimer.stop(); + if ( TIME_CALLS ) reduceTimer.stop(); } return sum; @@ -280,21 +292,68 @@ public class NanoScheduler { * * @return a queue of input read in, containing one or more values of InputType read in */ - @Requires("inputReader.hasNext()") - @Ensures("!result.isEmpty()") - private List readInputs(final Iterator inputReader) { + @Requires("inputReader != null") + @Ensures("result != null") + private Pair, Boolean> readInputs(final BlockingQueue inputReader) throws InterruptedException { int n = 0; final List inputs = new LinkedList(); + boolean done = false; - if ( TIME_CALLS) inputTimer.restart(); - while ( inputReader.hasNext() && n < getBufferSize() ) { - final InputType input = inputReader.next(); - inputs.add(input); - n++; + while ( ! done && n < getBufferSize() ) { + final InputDatum input = inputReader.take(); + done = input.isLast(); + if ( ! done ) { + inputs.add(input.datum); + n++; + } } - if ( TIME_CALLS) inputTimer.stop(); - return inputs; + return new Pair, Boolean>(inputs, done); + } + + private class InputProducer extends Thread { + final Iterator inputReader; + final BlockingQueue outputQueue; + + public InputProducer(final Iterator inputReader, final BlockingQueue outputQueue) { + this.inputReader = inputReader; + this.outputQueue = outputQueue; + } + + public void run() { + try { + while ( inputReader.hasNext() ) { + if ( TIME_CALLS ) inputTimer.restart(); + final InputType input = inputReader.next(); + if ( TIME_CALLS ) inputTimer.stop(); + outputQueue.put(new InputDatum(input)); + } + + // add the EOF object so we know we are done + outputQueue.put(new InputDatum()); + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } + } + + private class InputDatum { + final boolean isLast; + final InputType datum; + + private InputDatum(final InputType datum) { + isLast = false; + this.datum = datum; + } + + private InputDatum() { + isLast = true; + this.datum = null; + } + + public boolean isLast() { + return isLast; + } } @Requires({"map != null", "! inputs.isEmpty()"}) @@ -326,10 +385,10 @@ public class NanoScheduler { } @Override public MapType call() throws Exception { - if ( TIME_CALLS) mapTimer.restart(); + if ( TIME_CALLS ) mapTimer.restart(); if ( debug ) debugPrint("\t\tmap " + input); final MapType result = map.apply(input); - if ( TIME_CALLS) mapTimer.stop(); + if ( TIME_CALLS ) mapTimer.stop(); return result; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 3bd006ffe..ddfc3cecd 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -126,7 +126,7 @@ public class NanoSchedulerUnitTest extends BaseTest { Assert.assertNotNull(sum); Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); - Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected"); + Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected. Expected at least " + test.nExpectedCallbacks() + " but saw only " + callback.callBacks); nanoScheduler.shutdown(); } @@ -168,6 +168,7 @@ public class NanoSchedulerUnitTest extends BaseTest { final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); final NanoScheduler nanoScheduler = new NanoScheduler(test.bufferSize, test.nThreads); + nanoScheduler.setDebug(true); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); System.out.printf("Sum = %d, expected =%d%n", sum, test.expectedResult); From c822b7c760245064741acf3c2221a299c89d21cb Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 18:02:40 -0400 Subject: [PATCH 22/45] Fix long-standing NPE in LMS due to inappropriate timing of initialization --- .../sting/gatk/executive/LinearMicroScheduler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 5bcb16c94..740bcb566 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -60,11 +60,12 @@ public class LinearMicroScheduler extends MicroScheduler { boolean done = walker.isDone(); int counter = 0; + + traversalEngine.startTimersIfNecessary(); for (Shard shard : shardStrategy ) { if ( done || shard == null ) // we ran out of shards that aren't owned break; - traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); From 71d9ebcb0d8266152a142b5f9207eec022a7716f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 18:03:05 -0400 Subject: [PATCH 23/45] Fix bug (introduced by me) that didn't include contig in progress meter --- .../broadinstitute/sting/gatk/traversals/TraversalEngine.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 4422d49ae..8c617e4dc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -242,7 +242,7 @@ public abstract class TraversalEngine,Provide else progressPrintFrequency = 10 * 1000; // in milliseconds - final String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : Integer.toString(loc.getStart()); + final String posName = loc == null ? (mustPrint ? "done" : "unmapped reads") : String.format("%s:%d", loc.getContig(), loc.getStart()); logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", posName, nRecords*1.0, elapsed, unitRate, 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); From 1e55475adcce7a9ec5ab8322fe5ed46efe111d1c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 18:07:08 -0400 Subject: [PATCH 24/45] NanoScheduler uses ExecutorService to run input reader thread --- .../utils/nanoScheduler/NanoScheduler.java | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 4f9fedce3..89e44ce93 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -51,7 +51,8 @@ public class NanoScheduler { final int bufferSize; final int nThreads; - final ExecutorService executor; + final ExecutorService inputExecutor; + final ExecutorService mapExecutor; boolean shutdown = false; boolean debug = false; @@ -75,7 +76,8 @@ public class NanoScheduler { this.bufferSize = bufferSize; this.nThreads = nThreads; - this.executor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads-1); + this.mapExecutor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads-1); + this.inputExecutor = Executors.newSingleThreadExecutor(); // start timing the time spent outside of the nanoScheduler outsideSchedulerTimer.start(); @@ -107,10 +109,10 @@ public class NanoScheduler { public void shutdown() { outsideSchedulerTimer.stop(); - if ( executor != null ) { - final List remaining = executor.shutdownNow(); + if ( mapExecutor != null ) { + final List remaining = mapExecutor.shutdownNow(); if ( ! remaining.isEmpty() ) - throw new IllegalStateException("Remaining tasks found in the executor, unexpected behavior!"); + throw new IllegalStateException("Remaining tasks found in the mapExecutor, unexpected behavior!"); } shutdown = true; @@ -236,8 +238,8 @@ public class NanoScheduler { boolean done = false; final BlockingQueue inputQueue = new LinkedBlockingDeque(bufferSize); - final InputProducer inputProducer = new InputProducer(inputReader, inputQueue); - inputProducer.start(); + + inputExecutor.submit(new InputProducer(inputReader, inputQueue)); while ( ! done ) { try { @@ -247,7 +249,7 @@ public class NanoScheduler { if ( ! inputs.isEmpty() ) { // send jobs for map - final Queue> mapQueue = submitMapJobs(map, executor, inputs); + final Queue> mapQueue = submitMapJobs(map, mapExecutor, inputs); // send off the reduce job, and block until we get at least one reduce result sum = reduceSerial(reduce, mapQueue, sum); @@ -311,7 +313,7 @@ public class NanoScheduler { return new Pair, Boolean>(inputs, done); } - private class InputProducer extends Thread { + private class InputProducer implements Runnable { final Iterator inputReader; final BlockingQueue outputQueue; From 9bf1d138d9ba921312b49b00d1627f4feff62c2d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 15:41:52 -0400 Subject: [PATCH 25/45] New GATK argument interface for data and cpu threads -- Closes GSA-515 Nanoscheduler GSA-542 Good interface to nanoScheduler -- Old -nt means dataThreads -- New -cnt (--num_cpu_threads_per_data_thread) gives you n cpu threads for each data thread in the system -- Cleanup logic for handling data and cpu threading in HMS, LMS, and MS -- GATKRunReport reports the total number of threads in use by the GATK, not just the nt value -- Removed the io,cpu tags for nt. Stupid system if you ask me. Cleaned up the GenomeAnalysisEngine and ThreadAllocation handling to be totally straightforward now --- .../sting/gatk/GenomeAnalysisEngine.java | 32 ++++---- .../arguments/GATKArgumentCollection.java | 44 ++++++----- .../executive/HierarchicalMicroScheduler.java | 17 ++-- .../gatk/executive/LinearMicroScheduler.java | 9 +-- .../sting/gatk/executive/MicroScheduler.java | 40 ++++++---- .../io/stubs/VariantContextWriterStub.java | 4 +- .../sting/gatk/phonehome/GATKRunReport.java | 2 +- .../resourcemanagement/ThreadAllocation.java | 78 +++++++++++-------- 8 files changed, 123 insertions(+), 103 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 1b4333ce2..fa28b02cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -400,28 +400,22 @@ public class GenomeAnalysisEngine { * Parse out the thread allocation from the given command-line argument. */ private void determineThreadAllocation() { - Tags tags = parsingEngine.getTags(argCollection.numberOfThreads); + if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads); + if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread); + if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads); - // TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters. - Integer numCPUThreads = null; - if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null) - throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other."); - else if(tags.containsKey("cpu")) - numCPUThreads = Integer.parseInt(tags.getValue("cpu")); - else if(argCollection.numberOfCPUThreads != null) - numCPUThreads = argCollection.numberOfCPUThreads; - - Integer numIOThreads = null; - if(tags.containsKey("io") && argCollection.numberOfIOThreads != null) - throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other."); - else if(tags.containsKey("io")) - numIOThreads = Integer.parseInt(tags.getValue("io")); - else if(argCollection.numberOfIOThreads != null) - numIOThreads = argCollection.numberOfIOThreads; - - this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads, numCPUThreads, numIOThreads, ! argCollection.disableEfficiencyMonitor); + this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads, + argCollection.numberOfCPUThreadsPerDataThread, + argCollection.numberOfIOThreads, + ! argCollection.disableEfficiencyMonitor); } + public int getTotalNumberOfThreads() { + return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads(); + } + + + /** * Allow subclasses and others within this package direct access to the walker manager. * @return The walker manager used by this package. diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 33400bd9e..b9e44d87b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -287,9 +287,32 @@ public class GATKArgumentCollection { @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; - /** How many threads should be allocated to this analysis. */ - @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) - public Integer numberOfThreads = 1; + // -------------------------------------------------------------------------------------------------------------- + // + // Multi-threading arguments + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * How many data threads should be allocated to this analysis? Data threads contains N cpu threads per + * data thread, and act as completely data parallel processing, increasing the memory usage of GATK + * by M data threads. Data threads generally scale extremely effectively, up to 24 cores + */ + @Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false) + public Integer numberOfDataThreads = 1; + + /** + * How many CPU threads should be allocated per data thread? Each CPU thread operates the map + * cycle independently, but may run into earlier scaling problems with IO than data threads. Has + * the benefit of not requiring X times as much memory per thread as data threads do, but rather + * only a constant overhead. + */ + @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "cnt", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false) + public int numberOfCPUThreadsPerDataThread = 1; + + @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false) + @Hidden + public int numberOfIOThreads = 0; /** * By default the GATK monitors its own efficiency, but this can have a itsy-bitsy tiny @@ -299,24 +322,9 @@ public class GATKArgumentCollection { @Argument(fullName = "disableThreadEfficiencyMonitor", shortName = "dtem", doc = "Disable GATK efficiency monitoring", required = false) public Boolean disableEfficiencyMonitor = false; - /** - * The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types. - * TODO: Kill this when I can do a tagged integer in Queue. - */ - @Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false) - @Hidden - public Integer numberOfCPUThreads = null; - @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false) - @Hidden - public Integer numberOfIOThreads = null; - @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false) public Integer numberOfBAMFileHandles = null; - @Argument(fullName="nanoThreads", shortName = "nanoThreads", doc="NanoThreading", required = false) - @Hidden - public int nanoThreads = 1; - @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) public List readGroupBlackList = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 9198d210d..f1d2f7b5b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -76,21 +77,21 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar /** * Create a new hierarchical microscheduler to process the given reads and reference. * - * @param walker the walker used to process the dataset. - * @param reads Reads file(s) to process. - * @param reference Reference for driving the traversal. - * @param nThreadsToUse maximum number of threads to use to do the work + * @param walker the walker used to process the dataset. + * @param reads Reads file(s) to process. + * @param reference Reference for driving the traversal. + * @param threadAllocation How should we apply multi-threaded execution? */ protected HierarchicalMicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, final IndexedFastaSequenceFile reference, final Collection rods, - final int nThreadsToUse, - final boolean monitorThreadPerformance ) { - super(engine, walker, reads, reference, rods, nThreadsToUse); + final ThreadAllocation threadAllocation) { + super(engine, walker, reads, reference, rods, threadAllocation); - if ( monitorThreadPerformance ) { + final int nThreadsToUse = threadAllocation.getNumDataThreads(); + if ( threadAllocation.monitorThreadEfficiency() ) { final EfficiencyMonitoringThreadFactory monitoringThreadFactory = new EfficiencyMonitoringThreadFactory(nThreadsToUse); setThreadEfficiencyMonitor(monitoringThreadFactory); this.threadPool = Executors.newFixedThreadPool(nThreadsToUse, monitoringThreadFactory); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 740bcb566..ceb4a6f9b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.SampleUtils; @@ -39,13 +40,11 @@ public class LinearMicroScheduler extends MicroScheduler { final SAMDataSource reads, final IndexedFastaSequenceFile reference, final Collection rods, - final int numThreads, // may be > 1 if are nanoScheduling - final boolean monitorThreadPerformance ) { - super(engine, walker, reads, reference, rods, numThreads); + final ThreadAllocation threadAllocation) { + super(engine, walker, reads, reference, rods, threadAllocation); - if ( monitorThreadPerformance ) + if ( threadAllocation.monitorThreadEfficiency() ) setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor()); - } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 073a46ee3..bc0d5da96 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -100,27 +100,30 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @return The best-fit microscheduler. */ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if (threadAllocation.getNumCPUThreads() > 1) { + if ( threadAllocation.isRunningInParallelMode() ) + logger.info(String.format("Running the GATK in parallel mode with %d CPU threads for each of %d data threads", + threadAllocation.getNumCPUThreadsPerDataThread(), threadAllocation.getNumDataThreads())); + + if ( threadAllocation.getNumDataThreads() > 1 ) { if (walker.isReduceByInterval()) throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval. Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); - logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads())); - - if ( walker instanceof ReadWalker ) { - if ( ! (walker instanceof ThreadSafeMapReduce) ) badNT(engine, walker); - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + if ( ! (walker instanceof TreeReducible) ) { + throw badNT("nt", engine, walker); } else { - // TODO -- update test for when nano scheduling only is an option - if ( ! (walker instanceof TreeReducible) ) badNT(engine, walker); - return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } } else { - return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads(), threadAllocation.monitorThreadEfficiency()); + if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof ThreadSafeMapReduce) ) + throw badNT("cnt", engine, walker); + return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } } - private static void badNT(final GenomeAnalysisEngine engine, final Walker walker) { - throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution. Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass()))); + private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) { + throw new UserException.BadArgumentValue("nt", + String.format("The analysis %s currently does not support parallel execution with %s. " + + "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg)); } /** @@ -130,24 +133,27 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @param reads The reads. * @param reference The reference. * @param rods the rods to include in the traversal - * @param numThreads the number of threads we are using in the underlying traversal + * @param threadAllocation the allocation of threads to use in the underlying traversal */ protected MicroScheduler(final GenomeAnalysisEngine engine, final Walker walker, final SAMDataSource reads, final IndexedFastaSequenceFile reference, final Collection rods, - final int numThreads) { + final ThreadAllocation threadAllocation) { this.engine = engine; this.reads = reads; this.reference = reference; this.rods = rods; if (walker instanceof ReadWalker) { - traversalEngine = numThreads > 1 ? new TraverseReadsNano(numThreads) : new TraverseReads(); + traversalEngine = threadAllocation.getNumCPUThreadsPerDataThread() > 1 + ? new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()) + : new TraverseReads(); } else if (walker instanceof LocusWalker) { - // TODO -- refactor to use better interface - traversalEngine = engine.getArguments().nanoThreads > 1 ? new TraverseLociNano(engine.getArguments().nanoThreads) : new TraverseLociLinear(); + traversalEngine = threadAllocation.getNumCPUThreadsPerDataThread() > 1 + ? new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()) + : new TraverseLociLinear(); } else if (walker instanceof DuplicateWalker) { traversalEngine = new TraverseDuplicates(); } else if (walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java index 260a7efda..ee1dc63e6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java @@ -32,9 +32,9 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.writer.Options; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory; import java.io.File; @@ -269,7 +269,7 @@ public class VariantContextWriterStub implements Stub, Var * @return */ public boolean alsoWriteBCFForTest() { - return engine.getArguments().numberOfThreads == 1 && // only works single threaded + return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded ! isCompressed() && // for non-compressed outputs getFile() != null && // that are going to disk engine.getArguments().generateShadowBCF; // and we actually want to do it diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 6f3f175a2..51fed470f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -218,7 +218,7 @@ public class GATKRunReport { // if there was an exception, capture it this.mException = e == null ? null : new ExceptionToXML(e); - numThreads = engine.getArguments().numberOfThreads; + numThreads = engine.getTotalNumberOfThreads(); percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU); percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING); percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING); diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java index caae55ac5..f958c9db8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -24,7 +24,7 @@ package org.broadinstitute.sting.gatk.resourcemanagement; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; /** * Models how threads are distributed between various components of the GATK. @@ -33,7 +33,12 @@ public class ThreadAllocation { /** * The number of CPU threads to be used by the GATK. */ - private final int numCPUThreads; + private final int numDataThreads; + + /** + * The number of CPU threads per data thread for GATK processing + */ + private final int numCPUThreadsPerDataThread; /** * Number of threads to devote exclusively to IO. Default is 0. @@ -45,8 +50,12 @@ public class ThreadAllocation { */ private final boolean monitorEfficiency; - public int getNumCPUThreads() { - return numCPUThreads; + public int getNumDataThreads() { + return numDataThreads; + } + + public int getNumCPUThreadsPerDataThread() { + return numCPUThreadsPerDataThread; } public int getNumIOThreads() { @@ -57,47 +66,50 @@ public class ThreadAllocation { return monitorEfficiency; } + /** + * Are we running in parallel mode? + * + * @return true if any parallel processing is enabled + */ + public boolean isRunningInParallelMode() { + return getTotalNumThreads() > 1; + } + + /** + * What is the total number of threads in use by the GATK? + * + * @return the sum of all thread allocations in this object + */ + public int getTotalNumThreads() { + return getNumDataThreads() + getNumCPUThreadsPerDataThread() + getNumIOThreads(); + } + /** * Construct the default thread allocation. */ public ThreadAllocation() { - this(1, null, null, false); + this(1, 1, 0, false); } /** * Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads. * (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread). - * @param totalThreads Complete number of threads to allocate. - * @param numCPUThreads Total number of threads allocated to the traversal. + * @param numDataThreads Total number of threads allocated to the traversal. + * @param numCPUThreadsPerDataThread The number of CPU threads per data thread to allocate * @param numIOThreads Total number of threads allocated exclusively to IO. + * @param monitorEfficiency should we monitor threading efficiency in the GATK? */ - public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads, final boolean monitorEfficiency) { - // If no allocation information is present, allocate all threads to CPU - if(numCPUThreads == null && numIOThreads == null) { - this.numCPUThreads = totalThreads; - this.numIOThreads = 0; - } - // If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads). - else if(numIOThreads == null) { - if(numCPUThreads > totalThreads) - throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads)); - this.numCPUThreads = numCPUThreads; - this.numIOThreads = totalThreads - numCPUThreads; - } - // If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread). - else if(numCPUThreads == null) { - if(numIOThreads > totalThreads) - throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads)); - this.numCPUThreads = Math.max(1,totalThreads-numIOThreads); - this.numIOThreads = numIOThreads; - } - else { - if(numCPUThreads + numIOThreads != totalThreads) - throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads)); - this.numCPUThreads = numCPUThreads; - this.numIOThreads = numIOThreads; - } + public ThreadAllocation(final int numDataThreads, + final int numCPUThreadsPerDataThread, + final int numIOThreads, + final boolean monitorEfficiency) { + if ( numDataThreads < 1 ) throw new ReviewedStingException("numDataThreads cannot be less than 1, but saw " + numDataThreads); + if ( numCPUThreadsPerDataThread < 1 ) throw new ReviewedStingException("numCPUThreadsPerDataThread cannot be less than 1, but saw " + numCPUThreadsPerDataThread); + if ( numIOThreads < 0 ) throw new ReviewedStingException("numIOThreads cannot be less than 0, but saw " + numIOThreads); + this.numDataThreads = numDataThreads; + this.numCPUThreadsPerDataThread = numCPUThreadsPerDataThread; + this.numIOThreads = numIOThreads; this.monitorEfficiency = monitorEfficiency; } } From 225f3a0ebe380ce8283c4adc4e46fcfa91b2155b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 16:35:00 -0400 Subject: [PATCH 26/45] Update integration test system to allow us to differentiate between testing data and cpu parallelism --- .../org/broadinstitute/sting/WalkerTest.java | 37 +++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 7e38c00f3..660259ca8 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -40,13 +40,13 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.VariantContextTestProvider; - -import java.io.*; - import org.testng.Assert; import org.testng.annotations.AfterSuite; import org.testng.annotations.BeforeMethod; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.PrintStream; import java.text.SimpleDateFormat; import java.util.*; @@ -251,20 +251,43 @@ public class WalkerTest extends BaseTest { return false; } - protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { - return executeTest(name, spec, Arrays.asList(1, 4)); + public enum ParallelTestType { + TREE_REDUCIBLE, + NANO_SCHEDULED, + BOTH } - protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List parallelThreads) { + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec, ParallelTestType testType) { + final List ntThreads = testType == ParallelTestType.TREE_REDUCIBLE || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); + final List cntThreads = testType == ParallelTestType.NANO_SCHEDULED || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); + + return executeTest(name, spec, ntThreads, cntThreads); + } + + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { + return executeTestParallel(name, spec, ParallelTestType.BOTH); + } + + protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List ntThreads, List cpuThreads) { String originalArgs = spec.args; Pair, List> results = null; - for ( int nt : parallelThreads ) { + boolean ran1 = false; + for ( int nt : ntThreads ) { String extra = nt == 1 ? "" : (" -nt " + nt); + ran1 = ran1 || nt == 1; spec.args = originalArgs + extra; results = executeTest(name + "-nt-" + nt, spec); } + for ( int cnt : cpuThreads ) { + if ( cnt != 1 ) { + String extra = " -cnt " + cnt; + spec.args = originalArgs + extra; + results = executeTest(name + "-cnt-" + cnt, spec); + } + } + return results; } From dddf148a595af445d3a9e6ab66bf20a6d8dc3e93 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 16:35:32 -0400 Subject: [PATCH 27/45] Fixed bug in ThreadAllocation getTotalNumberOfThreads -- It isnt data + cpu its data * cpu threads. --- .../sting/gatk/resourcemanagement/ThreadAllocation.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java index f958c9db8..c86f06c25 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -81,7 +81,7 @@ public class ThreadAllocation { * @return the sum of all thread allocations in this object */ public int getTotalNumThreads() { - return getNumDataThreads() + getNumCPUThreadsPerDataThread() + getNumIOThreads(); + return getNumDataThreads() * getNumCPUThreadsPerDataThread() + getNumIOThreads(); } /** From c5f1ceaa95d17b9aedd9b2e9a33d7d516fee95b8 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 16:38:21 -0400 Subject: [PATCH 28/45] All read and loci traversals go through NanoScheduler now -- The NanoScheduler is doing a good job at tracking important information like time spent in map/reduce/input etc. -- Can be disabled with static boolean in MicroScheduler if we have problems -- See GSA-515 Nanoscheduler GSA-549 Retire TraverseReads and TraverseLoci after testing confirms nano scheduler version in single threaded version is fine --- .../sting/gatk/executive/MicroScheduler.java | 8 +++++--- .../utils/nanoScheduler/NanoScheduler.java | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index bc0d5da96..490f44470 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -59,6 +59,8 @@ import java.util.Collection; /** Shards and schedules data in manageable chunks. */ public abstract class MicroScheduler implements MicroSchedulerMBean { + // TODO -- remove me and retire non nano scheduled versions of traversals + private final static boolean USE_NANOSCHEDULER_FOR_EVERYTHING = true; protected static final Logger logger = Logger.getLogger(MicroScheduler.class); /** @@ -101,7 +103,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { */ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { if ( threadAllocation.isRunningInParallelMode() ) - logger.info(String.format("Running the GATK in parallel mode with %d CPU threads for each of %d data threads", + logger.info(String.format("Running the GATK in parallel mode with %d CPU thread(s) for each of %d data thread(s)", threadAllocation.getNumCPUThreadsPerDataThread(), threadAllocation.getNumDataThreads())); if ( threadAllocation.getNumDataThreads() > 1 ) { @@ -147,11 +149,11 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { this.rods = rods; if (walker instanceof ReadWalker) { - traversalEngine = threadAllocation.getNumCPUThreadsPerDataThread() > 1 + traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ? new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread()) : new TraverseReads(); } else if (walker instanceof LocusWalker) { - traversalEngine = threadAllocation.getNumCPUThreadsPerDataThread() > 1 + traversalEngine = USE_NANOSCHEDULER_FOR_EVERYTHING || threadAllocation.getNumCPUThreadsPerDataThread() > 1 ? new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread()) : new TraverseLociLinear(); } else if (walker instanceof DuplicateWalker) { diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 89e44ce93..ade6dcaf5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -193,6 +193,7 @@ public class NanoScheduler { if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); outsideSchedulerTimer.stop(); + ReduceType result; if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { result = executeSingleThreaded(inputReader, map, initialValue, reduce); @@ -214,13 +215,29 @@ public class NanoScheduler { final NanoSchedulerReduceFunction reduce) { ReduceType sum = initialValue; int i = 0; + + // start timer to ensure that both hasNext and next are caught by the timer + if ( TIME_CALLS ) inputTimer.restart(); while ( inputReader.hasNext() ) { final InputType input = inputReader.next(); + if ( TIME_CALLS ) inputTimer.stop(); + + // map + if ( TIME_CALLS ) mapTimer.restart(); final MapType mapValue = map.apply(input); + if ( TIME_CALLS ) mapTimer.stop(); + if ( i++ % bufferSize == 0 && progressFunction != null ) progressFunction.progress(input); + + // reduce + if ( TIME_CALLS ) reduceTimer.restart(); sum = reduce.apply(mapValue, sum); + if ( TIME_CALLS ) reduceTimer.stop(); + + if ( TIME_CALLS ) inputTimer.restart(); } + return sum; } From e11915aa0aa901d44e73a7d44c2ba1b707e42d21 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 17:37:56 -0400 Subject: [PATCH 29/45] GSA-515 Nanoscheduler GSA-550 ThreadSafeMapReduce shouldn't be super interface of TreeReducible --- .../broadinstitute/sting/gatk/executive/MicroScheduler.java | 2 +- .../src/org/broadinstitute/sting/gatk/walkers/FlagStat.java | 2 +- .../{ThreadSafeMapReduce.java => NanoSchedulable.java} | 2 +- .../src/org/broadinstitute/sting/gatk/walkers/Pileup.java | 2 +- .../src/org/broadinstitute/sting/gatk/walkers/PrintReads.java | 2 +- .../org/broadinstitute/sting/gatk/walkers/TreeReducible.java | 2 +- .../sting/gatk/walkers/bqsr/BaseRecalibrator.java | 2 +- .../sting/gatk/walkers/genotyper/UnifiedGenotyper.java | 2 +- .../org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java | 3 ++- .../org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java | 3 ++- .../org/broadinstitute/sting/gatk/walkers/qc/CountReads.java | 4 ++-- 11 files changed, 14 insertions(+), 12 deletions(-) rename public/java/src/org/broadinstitute/sting/gatk/walkers/{ThreadSafeMapReduce.java => NanoSchedulable.java} (97%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 490f44470..1da712e8a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -116,7 +116,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } } else { - if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof ThreadSafeMapReduce) ) + if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) ) throw badNT("cnt", engine, walker); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java index 14d14aca5..b4ef66aaf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/FlagStat.java @@ -45,7 +45,7 @@ import java.text.NumberFormat; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS}) -public class FlagStat extends ReadWalker implements ThreadSafeMapReduce { +public class FlagStat extends ReadWalker implements NanoSchedulable { @Output PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java index 1ce469f8c..731ce7e4e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ThreadSafeMapReduce.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java @@ -27,5 +27,5 @@ package org.broadinstitute.sting.gatk.walkers; * declare that their map function is thread-safe and so multiple * map calls can be run in parallel in the same JVM instance. */ -public interface ThreadSafeMapReduce { +public interface NanoSchedulable { } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java index 607c83966..a3efea9f1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java @@ -52,7 +52,7 @@ import java.util.List; * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class Pileup extends LocusWalker implements TreeReducible { +public class Pileup extends LocusWalker implements TreeReducible, NanoSchedulable { private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java index 4118617fc..37176cbf9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java @@ -93,7 +93,7 @@ import java.util.*; @ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class PrintReads extends ReadWalker implements ThreadSafeMapReduce { +public class PrintReads extends ReadWalker implements NanoSchedulable { @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) SAMFileWriter out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java index 8621c0e9d..c950e07e4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/TreeReducible.java @@ -13,7 +13,7 @@ package org.broadinstitute.sting.gatk.walkers; * shards of the data can reduce with each other, and the composite result * can be reduced with other composite results. */ -public interface TreeReducible extends ThreadSafeMapReduce { +public interface TreeReducible { /** * A composite, 'reduce of reduces' function. * @param lhs 'left-most' portion of data in the composite reduce. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 443b493be..43aa85a05 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -109,7 +109,7 @@ import java.util.ArrayList; @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file @Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality @PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta -public class BaseRecalibrator extends LocusWalker implements TreeReducible { +public class BaseRecalibrator extends LocusWalker implements TreeReducible, NanoSchedulable { @ArgumentCollection private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 93928a780..32ceff715 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -125,7 +125,7 @@ import java.util.*; // TODO -- When LocusIteratorByState gets cleaned up, we should enable multiple @By sources: // TODO -- @By( {DataSource.READS, DataSource.REFERENCE_ORDERED_DATA} ) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) -public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatible { +public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatible, NanoSchedulable { @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java index bd10eab87..cd295f26e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -40,7 +41,7 @@ import java.io.PrintStream; * */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class CountLoci extends LocusWalker implements TreeReducible { +public class CountLoci extends LocusWalker implements TreeReducible, NanoSchedulable { @Output(doc="Write count to this file instead of STDOUT") PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java index 9915d617e..ab37a2322 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.GenomeLoc; @@ -73,7 +74,7 @@ import java.util.*; * */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class CountRODs extends RodWalker, Long>> implements TreeReducible, Long>> { +public class CountRODs extends RodWalker, Long>> implements TreeReducible, Long>>, NanoSchedulable { @Output public PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 856ea77f5..301fa5b9b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -4,9 +4,9 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.ThreadSafeMapReduce; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class CountReads extends ReadWalker implements ThreadSafeMapReduce { +public class CountReads extends ReadWalker implements NanoSchedulable { public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; } From 574a8f710b5b193f7a2d4299b5f9222605aa6ff7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 17:40:02 -0400 Subject: [PATCH 30/45] Add static boolean controlled output of individual map call timing to nanoSecond resolution --- .../sting/utils/nanoScheduler/NanoScheduler.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index ade6dcaf5..24db0f7dc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -47,6 +47,7 @@ import java.util.concurrent.*; public class NanoScheduler { private final static Logger logger = Logger.getLogger(NanoScheduler.class); private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; + private final static boolean LOG_MAP_TIMES = false; private final static boolean TIME_CALLS = true; final int bufferSize; @@ -224,7 +225,9 @@ public class NanoScheduler { // map if ( TIME_CALLS ) mapTimer.restart(); + final long preMapTime = LOG_MAP_TIMES ? 0 : mapTimer.currentTimeNano(); final MapType mapValue = map.apply(input); + if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (mapTimer.currentTimeNano() - preMapTime)); if ( TIME_CALLS ) mapTimer.stop(); if ( i++ % bufferSize == 0 && progressFunction != null ) From 228bac75e48b390e886d9fdbf222978aaad1fc2b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 20:57:49 -0400 Subject: [PATCH 31/45] By default do only NT tests in integration tests --- public/java/test/org/broadinstitute/sting/WalkerTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 660259ca8..bcfd00aed 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -265,7 +265,7 @@ public class WalkerTest extends BaseTest { } protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { - return executeTestParallel(name, spec, ParallelTestType.BOTH); + return executeTestParallel(name, spec, ParallelTestType.TREE_REDUCIBLE); } protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List ntThreads, List cpuThreads) { From 0bd2a872faf1d71911a85c9748e3cfcc426bf6df Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Sep 2012 07:26:01 -0400 Subject: [PATCH 32/45] Done GSA-282: Unindexed traversals crash if a read goes off the end of a contig -- Already fixed in the codebase. Added unindexed bam and integration tests to ensure this is fine going forward. --- .../walkers/PileupWalkerIntegrationTest.java | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index 667b325ed..e16ef3125 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -16,13 +16,27 @@ public class PileupWalkerIntegrationTest extends WalkerTest { executeTest("Testing the standard (no-indel) pileup on three merged FHS pools with 27 deletions in 969 bases", spec); } + + + private final static String SingleReadAligningOffChromosome1MD5 = "4a45fe1f85aaa8c4158782f2b6dee2bd"; @Test public void testSingleReadAligningOffChromosome1() { String gatk_args = "-T Pileup " + " -I " + privateTestDir + "readOffb37contig1.bam" + " -R " + b37KGReference + " -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList("4a45fe1f85aaa8c4158782f2b6dee2bd")); + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(SingleReadAligningOffChromosome1MD5)); executeTest("Testing single read spanning off chromosome 1", spec); } + + @Test + public void testSingleReadAligningOffChromosome1NoIndex() { + String gatk_args = "-T Pileup " + + " -I " + privateTestDir + "readOffb37contig1.noIndex.bam" + + " -R " + b37KGReference + + " -U ALLOW_UNINDEXED_BAM" + + " -o %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(SingleReadAligningOffChromosome1MD5)); + executeTest("Testing single read spanning off chromosome 1 unindexed", spec); + } } From 1b064805ed31c6532abf7d55d2e641388aad42c0 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 21:13:19 -0400 Subject: [PATCH 33/45] Renaming -cnt to -nct for consistency --- .../sting/gatk/arguments/GATKArgumentCollection.java | 2 +- .../sting/gatk/executive/MicroScheduler.java | 2 +- public/java/test/org/broadinstitute/sting/WalkerTest.java | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index b9e44d87b..b8a7334b3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -307,7 +307,7 @@ public class GATKArgumentCollection { * the benefit of not requiring X times as much memory per thread as data threads do, but rather * only a constant overhead. */ - @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "cnt", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false) + @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false) public int numberOfCPUThreadsPerDataThread = 1; @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 1da712e8a..46d6b5882 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -117,7 +117,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } } else { if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) ) - throw badNT("cnt", engine, walker); + throw badNT("nct", engine, walker); return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation); } } diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index bcfd00aed..fa9f9e8a7 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -280,11 +280,11 @@ public class WalkerTest extends BaseTest { results = executeTest(name + "-nt-" + nt, spec); } - for ( int cnt : cpuThreads ) { - if ( cnt != 1 ) { - String extra = " -cnt " + cnt; + for ( int nct : cpuThreads ) { + if ( nct != 1 ) { + String extra = " -nct " + nct; spec.args = originalArgs + extra; - results = executeTest(name + "-cnt-" + cnt, spec); + results = executeTest(name + "-cnt-" + nct, spec); } } From 5ab5d8dee8754f0a8b545971b48644226934017a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 5 Sep 2012 22:08:34 -0400 Subject: [PATCH 34/45] Give EfficiencyMonitoringThreadFactoryUnitTest longer to complete its tests --- .../EfficiencyMonitoringThreadFactoryUnitTest.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index 6544b9845..d8da274ce 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -34,14 +34,17 @@ import org.testng.annotations.Test; import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.concurrent.*; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; /** * Tests for the state monitoring thread factory. */ public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { // the duration of the tests -- 100 ms is tolerable given the number of tests we are doing - private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 1000; + private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 10000; private final static int MAX_THREADS = 4; final static Object GLOBAL_LOCK = new Object(); From 6df6c1abd538f5616fb624236e0f9cd36a0871ea Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 6 Sep 2012 13:14:18 -0400 Subject: [PATCH 36/45] Fix for PBT to stop NPE when there are no likelihoods present --- .../sting/gatk/walkers/phasing/PhaseByTransmission.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index bbd4bf92f..00acf854a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -541,7 +541,7 @@ public class PhaseByTransmission extends RodWalker, HashMa //Get a Map of genotype likelihoods. //In case of null, unavailable or no call, all likelihoods are 1/3. private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ - if(genotype == null || !genotype.isCalled()){ + if(genotype == null || !genotype.isCalled() || genotype.getLikelihoods() == null){ EnumMap likelihoods = new EnumMap(GenotypeType.class); likelihoods.put(GenotypeType.HOM_REF,1.0/3.0); likelihoods.put(GenotypeType.HET,1.0/3.0); From cb84a6473f19597d6ab220915fdd102002c0f352 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Thu, 24 May 2012 09:17:11 -0400 Subject: [PATCH 37/45] Downsampling: experimental engine integration -Off by default; engine fork isolates new code paths from old code paths, so no integration tests change yet -Experimental implementation is currently BROKEN due to a serious issue involving file spans. No one can/should use the experimental features until I've patched this issue. -There are temporarily two independent versions of LocusIteratorByState. Anyone changing one version should port the change to the other (if possible), and anyone adding unit tests for one version should add the same unit tests for the other (again, if possible). This situation will hopefully be extremely temporary, and last only until the experimental implementation is proven. --- .../reducereads/SlidingWindow.java | 2 +- .../sting/gatk/DownsamplingMethod.java | 52 -- .../sting/gatk/GenomeAnalysisEngine.java | 29 +- .../sting/gatk/ReadProperties.java | 1 + .../sting/gatk/WalkerManager.java | 12 +- .../arguments/GATKArgumentCollection.java | 40 +- .../gatk/datasources/providers/LocusView.java | 9 +- .../gatk/datasources/reads/SAMDataSource.java | 90 ++- .../{ => downsampling}/DownsampleType.java | 2 +- .../sting/gatk/downsampling/Downsampler.java | 73 +- .../gatk/downsampling/DownsamplingMethod.java | 153 +++++ .../DownsamplingReadsIterator.java | 47 +- .../downsampling/FractionalDownsampler.java | 42 +- .../FractionalDownsamplerFactory.java | 45 ++ .../downsampling/LevelingDownsampler.java | 212 ++++++ .../PerSampleDownsamplingReadsIterator.java | 202 ++++++ .../downsampling/PositionalDownsampler.java | 259 ------- .../gatk/downsampling/ReadsDownsampler.java | 17 +- .../downsampling/ReadsDownsamplerFactory.java | 37 + .../downsampling/ReservoirDownsampler.java | 37 +- .../ReservoirDownsamplerFactory.java | 45 ++ .../SimplePositionalDownsampler.java | 169 +++++ .../SimplePositionalDownsamplerFactory.java | 45 ++ .../sting/gatk/executive/WindowMaker.java | 9 +- ...tor.java => LegacyDownsampleIterator.java} | 4 +- .../gatk/iterators/LocusIteratorByState.java | 4 +- .../LocusIteratorByStateExperimental.java | 649 ++++++++++++++++++ .../gatk/iterators/VerifyingSamIterator.java | 4 +- .../sting/gatk/walkers/Downsample.java | 2 +- .../walkers/coverage/DepthOfCoverage.java | 2 +- .../walkers/genotyper/UnifiedGenotyper.java | 2 +- .../broadinstitute/sting/utils/MathUtils.java | 12 +- .../pileup/AbstractReadBackedPileup.java | 2 + .../sam/ArtificialMultiSampleReadStream.java | 86 +++ .../utils/sam/ArtificialSAMFileReader.java | 27 + .../sting/utils/sam/ArtificialSAMUtils.java | 24 + .../sam/ArtificialSingleSampleReadStream.java | 212 ++++++ ...ificialSingleSampleReadStreamAnalyzer.java | 281 ++++++++ .../reads/DownsamplerBenchmark.java | 5 +- .../reads/SAMDataSourceUnitTest.java | 171 ++++- .../DownsamplingReadsIteratorUnitTest.java | 161 +++-- .../FractionalDownsamplerUnitTest.java | 178 +++-- .../LevelingDownsamplerUnitTest.java | 163 +++++ ...mpleDownsamplingReadsIteratorUnitTest.java | 298 ++++++++ .../PositionalDownsamplerUnitTest.java | 357 ---------- ...ificialSingleSampleReadStreamAnalyzer.java | 126 ++++ .../ReservoirDownsamplerUnitTest.java | 129 ++++ .../SimplePositionalDownsamplerUnitTest.java | 330 +++++++++ ...usIteratorByStateExperimentalUnitTest.java | 546 +++++++++++++++ .../VerifyingSamIteratorUnitTest.java | 13 +- ...> LegacyReservoirDownsamplerUnitTest.java} | 2 +- ...ificialSingleSampleReadStreamUnitTest.java | 161 +++++ 52 files changed, 4701 insertions(+), 879 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java rename public/java/src/org/broadinstitute/sting/gatk/{ => downsampling}/DownsampleType.java (75%) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java rename public/java/src/org/broadinstitute/sting/gatk/iterators/{DownsampleIterator.java => LegacyDownsampleIterator.java} (88%) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java delete mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java rename public/java/test/org/broadinstitute/sting/utils/{ReservoirDownsamplerUnitTest.java => LegacyReservoirDownsamplerUnitTest.java} (99%) create mode 100644 public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index bdb9ef843..d2fc08c62 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -546,7 +546,7 @@ public class SlidingWindow { FractionalDownsampler downsampler = new FractionalDownsampler(fraction); downsampler.submit(allReads); - return downsampler.consumeDownsampledItems(); + return downsampler.consumeFinalizedItems(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java deleted file mode 100644 index 6d9e79156..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.broadinstitute.sting.gatk; - -import org.broadinstitute.sting.utils.exceptions.UserException; - -/** - * Describes the method for downsampling reads at a given locus. - * - * @author hanna - * @version 0.1 - */ - -public class DownsamplingMethod { - /** - * Type of downsampling to perform. - */ - public final DownsampleType type; - - /** - * Actual downsampling target is specified as an integer number of reads. - */ - public final Integer toCoverage; - - /** - * Actual downsampling target is specified as a fraction of total available reads. - */ - public final Double toFraction; - - /** - * Expresses no downsampling applied at all. - */ - public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null); - - public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) { - // Do some basic sanity checks on the downsampling parameters passed in. - - // Can't leave toFraction and toCoverage null unless type is experimental naive duplicate eliminator. - if(type != DownsampleType.NONE && toFraction == null && toCoverage == null) - throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling."); - - // Fraction and coverage cannot both be specified. - if(toFraction != null && toCoverage != null) - throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one."); - - // Experimental by sample downsampling does not work with a fraction of reads. - if(type == DownsampleType.BY_SAMPLE && toFraction != null) - throw new UserException.CommandLineException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method"); - - this.type = type; - this.toCoverage = toCoverage; - this.toFraction = toFraction; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index fa28b02cd..3ce8a92b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; @@ -441,14 +442,18 @@ public class GenomeAnalysisEngine { protected DownsamplingMethod getDownsamplingMethod() { GATKArgumentCollection argCollection = this.getArguments(); - DownsamplingMethod method; - if(argCollection.getDownsamplingMethod() != null) - method = argCollection.getDownsamplingMethod(); - else if(WalkerManager.getDownsamplingMethod(walker) != null) - method = WalkerManager.getDownsamplingMethod(walker); - else - method = GATKArgumentCollection.getDefaultDownsamplingMethod(); - return method; + boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling; + + // until the file pointer bug with the experimental downsamplers is fixed, disallow running with experimental downsampling + if ( useExperimentalDownsampling ) { + throw new UserException("The experimental downsampling implementation is currently crippled by a file-pointer-related bug. Until this bug is fixed, it's not safe (or possible) for anyone to use the experimental implementation!"); + } + + DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod(); + DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling); + DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling); + + return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod); } protected void setDownsamplingMethod(DownsamplingMethod method) { @@ -821,11 +826,13 @@ public class GenomeAnalysisEngine { * @return A data source for the given set of reads. */ private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) { - DownsamplingMethod method = getDownsamplingMethod(); + DownsamplingMethod downsamplingMethod = getDownsamplingMethod(); // Synchronize the method back into the collection so that it shows up when // interrogating for the downsample method during command line recreation. - setDownsamplingMethod(method); + setDownsamplingMethod(downsamplingMethod); + + logger.info(downsamplingMethod); if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); @@ -843,7 +850,7 @@ public class GenomeAnalysisEngine { argCollection.useOriginalBaseQualities, argCollection.strictnessLevel, argCollection.readBufferSize, - method, + downsamplingMethod, new ValidationExclusion(Arrays.asList(argCollection.unsafe)), filters, readTransformers, diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java index b2d4d202d..e1ada93cc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java @@ -4,6 +4,7 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java index ae59ce438..fbacbddc4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java @@ -27,6 +27,8 @@ package org.broadinstitute.sting.gatk; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -304,9 +306,10 @@ public class WalkerManager extends PluginManager { * downsampling method is specified on the command-line, the command-line version will * be used instead. * @param walkerClass The class of the walker to interrogate. + * @param useExperimentalDownsampling If true, use the experimental downsampling implementation * @return The downsampling method, as specified by the walker. Null if none exists. */ - public static DownsamplingMethod getDownsamplingMethod(Class walkerClass) { + public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useExperimentalDownsampling) { DownsamplingMethod downsamplingMethod = null; if( walkerClass.isAnnotationPresent(Downsample.class) ) { @@ -314,7 +317,7 @@ public class WalkerManager extends PluginManager { DownsampleType type = downsampleParameters.by(); Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; - downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction); + downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling); } return downsamplingMethod; @@ -333,10 +336,11 @@ public class WalkerManager extends PluginManager { * downsampling method is specified on the command-line, the command-line version will * be used instead. * @param walker The walker to interrogate. + * @param useExperimentalDownsampling If true, use the experimental downsampling implementation * @return The downsampling method, as specified by the walker. Null if none exists. */ - public static DownsamplingMethod getDownsamplingMethod(Walker walker) { - return getDownsamplingMethod(walker.getClass()); + public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) { + return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index b8a7334b3..44817379a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -31,8 +31,8 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.IntervalBinding; -import org.broadinstitute.sting.gatk.DownsampleType; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.QualityUtils; @@ -140,15 +140,11 @@ public class GATKArgumentCollection { @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) public boolean nonDeterministicRandomSeed = false; - /** - * The override mechanism in the GATK, by default, populates the command-line arguments, then - * the defaults from the walker annotations. Unfortunately, walker annotations should be trumped - * by a user explicitly specifying command-line arguments. - * TODO: Change the GATK so that walker defaults are loaded first, then command-line arguments. - */ - private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; - private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000; - + // -------------------------------------------------------------------------------------------------------------- + // + // Downsampling Arguments + // + // -------------------------------------------------------------------------------------------------------------- @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false) public DownsampleType downsamplingType = null; @@ -158,17 +154,20 @@ public class GATKArgumentCollection { @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false) public Integer downsampleCoverage = null; + @Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false) + @Hidden + public boolean enableExperimentalDownsampling = false; + /** * Gets the downsampling method explicitly specified by the user. If the user didn't specify * a default downsampling mechanism, return the default. * @return The explicitly specified downsampling mechanism, or the default if none exists. */ public DownsamplingMethod getDownsamplingMethod() { - if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null) + if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null ) return null; - if(downsamplingType == null && downsampleCoverage != null) - return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,downsampleCoverage,null); - return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction); + + return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling); } /** @@ -178,9 +177,11 @@ public class GATKArgumentCollection { public void setDownsamplingMethod(DownsamplingMethod method) { if (method == null) throw new IllegalArgumentException("method is null"); + downsamplingType = method.type; downsampleCoverage = method.toCoverage; downsampleFraction = method.toFraction; + enableExperimentalDownsampling = method.useExperimentalDownsampling; } // -------------------------------------------------------------------------------------------------------------- @@ -208,15 +209,6 @@ public class GATKArgumentCollection { @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false) public File performanceLog = null; - /** - * Gets the default downsampling method, returned if the user didn't specify any downsampling - * method. - * @return The default downsampling mechanism, or null if none exists. - */ - public static DownsamplingMethod getDefaultDownsamplingMethod() { - return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null); - } - @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false) public Boolean useOriginalBaseQualities = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java index a3ce6dd27..cd3403f2f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java @@ -1,6 +1,6 @@ package org.broadinstitute.sting.gatk.datasources.providers; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.iterators.LocusIterator; @@ -135,8 +135,13 @@ public abstract class LocusView extends LocusIterator implements View { // Cache the current and apply filtering. AlignmentContext current = nextLocus; - if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) + + // The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling: + if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling && + sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) { + current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage ); + } // Indicate that the next operation will need to advance. nextLocus = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 7d027438b..437813f19 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -30,7 +30,9 @@ import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.*; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -152,6 +154,8 @@ public class SAMDataSource { */ private final ThreadAllocation threadAllocation; + private final boolean expandShardsForDownsampling; + /** * Create a new SAM data source given the supplied read metadata. * @param samFiles list of reads files. @@ -302,6 +306,11 @@ public class SAMDataSource { includeReadsWithDeletionAtLoci, defaultBaseQualities); + expandShardsForDownsampling = readProperties.getDownsamplingMethod() != null && + readProperties.getDownsamplingMethod().useExperimentalDownsampling && + readProperties.getDownsamplingMethod().type != DownsampleType.NONE && + readProperties.getDownsamplingMethod().toCoverage != null; + // cache the read group id (original) -> read group id (merged) // and read group id (merged) -> read group id (original) mappings. for(SAMReaderID id: readerIDs) { @@ -457,6 +466,16 @@ public class SAMDataSource { } } + /** + * Are we expanding shards as necessary to prevent shard boundaries from occurring at improper places? + * + * @return true if we are using expanded shards, otherwise false + */ + public boolean usingExpandedShards() { + return expandShardsForDownsampling; + } + + /** * Fill the given buffering shard with reads. * @param shard Shard to fill. @@ -484,6 +503,31 @@ public class SAMDataSource { } } + // If the reads are sorted in coordinate order, ensure that all reads + // having the same alignment start become part of the same shard, to allow + // downsampling to work better across shard boundaries. Note that because our + // read stream has already been fed through the positional downsampler, which + // ensures that at each alignment start position there are no more than dcov + // reads, we're in no danger of accidentally creating a disproportionately huge + // shard + if ( expandShardsForDownsampling && sortOrder == SAMFileHeader.SortOrder.coordinate ) { + while ( iterator.hasNext() ) { + SAMRecord additionalRead = iterator.next(); + + // Stop filling the shard as soon as we encounter a read having a different + // alignment start or contig from the last read added in the earlier loop + // above, or an unmapped read + if ( read == null || + additionalRead.getReadUnmappedFlag() || + ! additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) || + additionalRead.getAlignmentStart() != read.getAlignmentStart() ) { + break; + } + shard.addRead(additionalRead); + noteFilePositionUpdate(positionUpdates, additionalRead); + } + } + // If the reads are sorted in queryname order, ensure that all reads // having the same queryname become part of the same shard. if(sortOrder == SAMFileHeader.SortOrder.queryname) { @@ -578,6 +622,7 @@ public class SAMDataSource { iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator); if(shard.getGenomeLocs().size() > 0) iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); + iteratorMap.put(readers.getReader(id), iterator); } @@ -660,20 +705,25 @@ public class SAMDataSource { List readTransformers, byte defaultBaseQualities) { - // *********************************************************************************** // - // * NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * // - // * (otherwise we will process something that we may end up throwing away) * // - // *********************************************************************************** // + // ************************************************************************************************ // + // * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * // + // * (otherwise we will process something that we may end up throwing away) * // + // ************************************************************************************************ // - if (downsamplingFraction != null) - wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction); + wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); + + if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) { + wrappedIterator = applyDownsamplingIterator(wrappedIterator); + } + + // Use the old fractional downsampler only if we're not using experimental downsampling: + if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null ) + wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction); // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, // verify the read ordering by applying a sort order iterator if (!noValidationOfReadOrder && enableVerification) - wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator); - - wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters)); + wrappedIterator = new VerifyingSamIterator(wrappedIterator); if (useOriginalBaseQualities || defaultBaseQualities >= 0) // only wrap if we are replacing the original qualities or using a default base quality @@ -688,6 +738,26 @@ public class SAMDataSource { return wrappedIterator; } + protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) { + if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) { + ReadsDownsamplerFactory downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ? + new SimplePositionalDownsamplerFactory(readProperties.getDownsamplingMethod().toCoverage) : + new FractionalDownsamplerFactory(readProperties.getDownsamplingMethod().toFraction); + + return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory); + } + else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) { + ReadsDownsampler downsampler = readProperties.getDownsamplingMethod().toCoverage != null ? + new SimplePositionalDownsampler(readProperties.getDownsamplingMethod().toCoverage) : + new FractionalDownsampler(readProperties.getDownsamplingMethod().toFraction); + + return new DownsamplingReadsIterator(wrappedIterator, downsampler); + } + + return wrappedIterator; + } + + private class SAMResourcePool { /** * How many entries can be cached in this resource pool? diff --git a/public/java/src/org/broadinstitute/sting/gatk/DownsampleType.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java similarity index 75% rename from public/java/src/org/broadinstitute/sting/gatk/DownsampleType.java rename to public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java index 3fabf6e0d..c3d17436a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/DownsampleType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsampleType.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk; +package org.broadinstitute.sting.gatk.downsampling; /** * Type of downsampling method to invoke. diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java index 5fb99b2bc..f5741af4e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/Downsampler.java @@ -28,49 +28,92 @@ import java.util.Collection; import java.util.List; /** - * The basic downsampler API, with no reads-specific operations + * The basic downsampler API, with no reads-specific operations. + * + * Downsamplers that extend this interface rather than the ReadsDownsampler interface can handle + * any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a + * PerSampleDownsamplingReadsIterator. * * @author David Roazen */ public interface Downsampler { - /* - * Submit one item to the downsampler for consideration . Some downsamplers will be able to determine + /** + * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine * immediately whether the item survives the downsampling process, while others will need to see * more items before making that determination. + * + * @param item the individual item to submit to the downsampler for consideration */ public void submit( T item ); - /* - * Submit a collection of items to the downsampler for consideration. + /** + * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling + * submit() on each individual item in the collection. + * + * @param items the collection of items to submit to the downsampler for consideration */ public void submit( Collection items ); - /* + /** * Are there items that have survived the downsampling process waiting to be retrieved? + * + * @return true if this downsampler has > 0 finalized items, otherwise false */ - public boolean hasDownsampledItems(); + public boolean hasFinalizedItems(); - /* - * Return (and remove) all items that have survived downsampling and are waiting to be retrieved. + /** + * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved. + * + * @return a list of all finalized items this downsampler contains, or an empty list if there are none */ - public List consumeDownsampledItems(); + public List consumeFinalizedItems(); - /* + /** * Are there items stored in this downsampler that it doesn't yet know whether they will * ultimately survive the downsampling process? + * + * @return true if this downsampler has > 0 pending items, otherwise false */ public boolean hasPendingItems(); - /* + /** + * Peek at the first finalized item stored in this downsampler (or null if there are no finalized items) + * + * @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call), + * or null if there are none + */ + public T peekFinalized(); + + /** + * Peek at the first pending item stored in this downsampler (or null if there are no pending items) + * + * @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call), + * or null if there are none + */ + public T peekPending(); + + /** + * Returns the number of items discarded (so far) during the downsampling process + * + * @return the number of items that have been submitted to this downsampler and discarded in the process of + * downsampling + */ + public int getNumberOfDiscardedItems(); + + /** * Used to tell the downsampler that no more items will be submitted to it, and that it should * finalize any pending items. */ public void signalEndOfInput(); - /* - * Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state - * information. + /** + * Empty the downsampler of all finalized/pending items */ public void clear(); + + /** + * Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items + */ + public void reset(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java new file mode 100644 index 000000000..ae1d98ce0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.exceptions.UserException; + +/** + * Describes the method for downsampling reads at a given locus. + */ + +public class DownsamplingMethod { + /** + * Type of downsampling to perform. + */ + public final DownsampleType type; + + /** + * Actual downsampling target is specified as an integer number of reads. + */ + public final Integer toCoverage; + + /** + * Actual downsampling target is specified as a fraction of total available reads. + */ + public final Double toFraction; + + /** + * Use the new experimental downsampling? + */ + public final boolean useExperimentalDownsampling; + + /** + * Expresses no downsampling applied at all. + */ + public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null,false); + + /** + * Default type to use if no type is specified + */ + public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; + + /** + * Default target coverage for locus-based traversals + */ + public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000; + + public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) { + this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE; + this.toCoverage = toCoverage; + this.toFraction = toFraction; + this.useExperimentalDownsampling = useExperimentalDownsampling; + + if ( type == DownsampleType.NONE ) { + toCoverage = null; + toFraction = null; + } + + validate(); + } + + private void validate() { + // Can't leave toFraction and toCoverage null unless type is NONE + if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null ) + throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling."); + + // Fraction and coverage cannot both be specified. + if ( toFraction != null && toCoverage != null ) + throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one."); + + // toCoverage must be > 0 when specified + if ( toCoverage != null && toCoverage <= 0 ) { + throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage"); + } + + // toFraction must be >= 0.0 and <= 1.0 when specified + if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) { + throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads"); + } + + // Some restrictions only exist for the old downsampling implementation: + if ( ! useExperimentalDownsampling ) { + // By sample downsampling does not work with a fraction of reads in the old downsampling implementation + if( type == DownsampleType.BY_SAMPLE && toFraction != null ) + throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method"); + } + + // Some restrictions only exist for the new downsampling implementation: + if ( useExperimentalDownsampling ) { + if ( type == DownsampleType.ALL_READS && toCoverage != null ) { + throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation"); + } + } + } + + public String toString() { + StringBuilder builder = new StringBuilder("Downsampling Settings: "); + + if ( type == DownsampleType.NONE ) { + builder.append("No downsampling"); + } + else { + builder.append(String.format("Method: %s ", type)); + + if ( toCoverage != null ) { + builder.append(String.format("Target Coverage: %d ", toCoverage)); + } + else { + builder.append(String.format("Target Fraction: %.2f ", toFraction)); + } + + if ( useExperimentalDownsampling ) { + builder.append("Using Experimental Downsampling"); + } + } + + return builder.toString(); + } + + public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) { + if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) { + return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE, + null, useExperimentalDownsampling); + } + else { + return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java index bccc2e946..c8fbc829c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java @@ -33,7 +33,8 @@ import java.util.NoSuchElementException; /** - * StingSAMIterator wrapper around our generic reads downsampler interface + * StingSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style + * downsampler interface to a pull model. * * @author David Roazen */ @@ -42,35 +43,50 @@ public class DownsamplingReadsIterator implements StingSAMIterator { private StingSAMIterator nestedSAMIterator; private ReadsDownsampler downsampler; private Collection downsampledReadsCache; - private Iterator downsampledReadsCacheIterator; + private SAMRecord nextRead = null; + private Iterator downsampledReadsCacheIterator = null; + /** + * @param iter wrapped iterator from which this iterator will pull reads + * @param downsampler downsampler through which the reads will be fed + */ public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler downsampler ) { nestedSAMIterator = iter; this.downsampler = downsampler; - fillDownsampledReadsCache(); + + advanceToNextRead(); } public boolean hasNext() { - if ( downsampledReadsCacheIterator.hasNext() ) { - return true; - } - else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) { - return false; - } - - return true; + return nextRead != null; } public SAMRecord next() { - if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) { + if ( nextRead == null ) { throw new NoSuchElementException("next() called when there are no more items"); } - return downsampledReadsCacheIterator.next(); + SAMRecord toReturn = nextRead; + advanceToNextRead(); + + return toReturn; + } + + private void advanceToNextRead() { + if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) { + nextRead = null; + } + else { + nextRead = downsampledReadsCacheIterator.next(); + } + } + + private boolean readyToReleaseReads() { + return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext(); } private boolean fillDownsampledReadsCache() { - while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) { + while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) { downsampler.submit(nestedSAMIterator.next()); } @@ -78,7 +94,8 @@ public class DownsamplingReadsIterator implements StingSAMIterator { downsampler.signalEndOfInput(); } - downsampledReadsCache = downsampler.consumeDownsampledItems(); + // use returned collection directly rather than make a copy, for speed + downsampledReadsCache = downsampler.consumeFinalizedItems(); downsampledReadsCacheIterator = downsampledReadsCache.iterator(); return downsampledReadsCacheIterator.hasNext(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java index d5d529c9f..8901ae525 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java @@ -33,7 +33,10 @@ import java.util.Collection; import java.util.List; /** - * Fractional Downsampler: selects a specified fraction of the reads for inclusion + * Fractional Downsampler: selects a specified fraction of the reads for inclusion. + * + * Since the selection is done randomly, the actual fraction of reads retained may be slightly + * more or less than the requested fraction, depending on the total number of reads submitted. * * @author David Roazen */ @@ -43,8 +46,16 @@ public class FractionalDownsampler implements ReadsDownsamp private int cutoffForInclusion; + private int numDiscardedItems; + private static final int RANDOM_POOL_SIZE = 10000; + /** + * Construct a FractionalDownsampler + * + * @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive). + * Actual number of reads preserved may differ randomly. + */ public FractionalDownsampler( double fraction ) { if ( fraction < 0.0 || fraction > 1.0 ) { throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive"); @@ -52,12 +63,16 @@ public class FractionalDownsampler implements ReadsDownsamp cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE); clear(); + reset(); } public void submit( T newRead ) { if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) { selectedReads.add(newRead); } + else { + numDiscardedItems++; + } } public void submit( Collection newReads ) { @@ -66,11 +81,12 @@ public class FractionalDownsampler implements ReadsDownsamp } } - public boolean hasDownsampledItems() { + public boolean hasFinalizedItems() { return selectedReads.size() > 0; } - public List consumeDownsampledItems() { + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed List downsampledItems = selectedReads; clear(); return downsampledItems; @@ -80,6 +96,18 @@ public class FractionalDownsampler implements ReadsDownsamp return false; } + public T peekFinalized() { + return selectedReads.isEmpty() ? null : selectedReads.get(0); + } + + public T peekPending() { + return null; + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + public void signalEndOfInput() { // NO-OP } @@ -88,7 +116,15 @@ public class FractionalDownsampler implements ReadsDownsamp selectedReads = new ArrayList(); } + public void reset() { + numDiscardedItems = 0; + } + public boolean requiresCoordinateSortOrder() { return false; } + + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java new file mode 100644 index 000000000..7a7c9e91e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating FractionalDownsamplers on demand + * + * @author David Roazen + */ +public class FractionalDownsamplerFactory implements ReadsDownsamplerFactory { + + private double fraction; + + public FractionalDownsamplerFactory( double fraction ) { + this.fraction = fraction; + } + + public ReadsDownsampler newInstance() { + return new FractionalDownsampler(fraction); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java new file mode 100644 index 000000000..73d69140d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.MathUtils; + +import java.util.*; + +/** + * Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from + * the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling + * does not occur until all Lists have been submitted and signalEndOfInput() is called. + * + * The Lists should be LinkedLists for maximum efficiency during item removal, however other + * kinds of Lists are also accepted (albeit at a slight performance penalty). + * + * Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface, + * the Lists need not contain reads. However this downsampler may not be wrapped within one of the + * DownsamplingReadsIterators + * + * @param the List type representing the stacks to be leveled + * @param the type of the elements of each List + * + * @author David Roazen + */ +public class LevelingDownsampler, E> implements Downsampler { + + private int targetSize; + + private List groups; + + private boolean groupsAreFinalized; + + private int numDiscardedItems; + + /** + * Construct a LevelingDownsampler + * + * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed + * this value -- if it does, items are removed from Lists evenly until the total size + * is <= this value + */ + public LevelingDownsampler( int targetSize ) { + this.targetSize = targetSize; + clear(); + reset(); + } + + public void submit( T item ) { + groups.add(item); + } + + public void submit( Collection items ){ + groups.addAll(items); + } + + public boolean hasFinalizedItems() { + return groupsAreFinalized && groups.size() > 0; + } + + public List consumeFinalizedItems() { + if ( ! hasFinalizedItems() ) { + return new ArrayList(); + } + + // pass by reference rather than make a copy, for speed + List toReturn = groups; + clear(); + return toReturn; + } + + public boolean hasPendingItems() { + return ! groupsAreFinalized && groups.size() > 0; + } + + public T peekFinalized() { + return hasFinalizedItems() ? groups.get(0) : null; + } + + public T peekPending() { + return hasPendingItems() ? groups.get(0) : null; + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + + public void signalEndOfInput() { + levelGroups(); + groupsAreFinalized = true; + } + + public void clear() { + groups = new ArrayList(); + groupsAreFinalized = false; + } + + public void reset() { + numDiscardedItems = 0; + } + + private void levelGroups() { + int totalSize = 0; + int[] groupSizes = new int[groups.size()]; + int currentGroupIndex = 0; + + for ( T group : groups ) { + groupSizes[currentGroupIndex] = group.size(); + totalSize += groupSizes[currentGroupIndex]; + currentGroupIndex++; + } + + if ( totalSize <= targetSize ) { + return; // no need to eliminate any items + } + + // We will try to remove exactly this many items, however we will refuse to allow any + // one group to fall below size 1, and so might end up removing fewer items than this + int numItemsToRemove = totalSize - targetSize; + + currentGroupIndex = 0; + int numConsecutiveUmodifiableGroups = 0; + + // Continue until we've either removed all the items we wanted to, or we can't + // remove any more items without violating the constraint that all groups must + // be left with at least one item + while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) { + if ( groupSizes[currentGroupIndex] > 1 ) { + groupSizes[currentGroupIndex]--; + numItemsToRemove--; + numConsecutiveUmodifiableGroups = 0; + } + else { + numConsecutiveUmodifiableGroups++; + } + + currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length; + } + + // Now we actually go through and reduce each group to its new count as specified in groupSizes + currentGroupIndex = 0; + for ( T group : groups ) { + downsampleOneGroup(group, groupSizes[currentGroupIndex]); + currentGroupIndex++; + } + } + + private void downsampleOneGroup( T group, int numItemsToKeep ) { + if ( numItemsToKeep >= group.size() ) { + return; + } + + numDiscardedItems += group.size() - numItemsToKeep; + + BitSet itemsToKeep = new BitSet(group.size()); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) { + itemsToKeep.set(selectedIndex); + } + + int currentIndex = 0; + + // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator + if ( group instanceof LinkedList ) { + Iterator iter = group.iterator(); + while ( iter.hasNext() ) { + iter.next(); + + if ( ! itemsToKeep.get(currentIndex) ) { + iter.remove(); + } + + currentIndex++; + } + } + // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather + // than suffer O(n^2) of item shifting + else { + List keptItems = new ArrayList(numItemsToKeep); + + for ( E item : group ) { + if ( itemsToKeep.get(currentIndex) ) { + keptItems.add(item); + } + currentIndex++; + } + group.clear(); + group.addAll(keptItems); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java new file mode 100644 index 000000000..8b2034460 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMRecordComparator; +import net.sf.samtools.SAMRecordCoordinateComparator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; + +import java.util.*; + + +/** + * StingSAMIterator wrapper around our generic reads downsampler interface + * that downsamples reads for each sample independently, and then re-assembles + * the reads back into a single merged stream. + * + * @author David Roazen + */ +public class PerSampleDownsamplingReadsIterator implements StingSAMIterator { + + private StingSAMIterator nestedSAMIterator; + private ReadsDownsamplerFactory downsamplerFactory; + private Map> perSampleDownsamplers; + private PriorityQueue orderedDownsampledReadsCache; + private SAMRecord nextRead = null; + private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator(); + private SAMRecord earliestPendingRead = null; + private ReadsDownsampler earliestPendingDownsampler = null; + + // Initial size of our cache of finalized reads + private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096; + + // The number of positional changes that can occur in the read stream before all downsamplers + // should be informed of the current position (guards against samples with relatively sparse reads + // getting stuck in a pending state): + private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3; // TODO: experiment with this value + + /** + * @param iter wrapped iterator from which this iterator will pull reads + * @param downsamplerFactory factory used to create new downsamplers as needed + */ + public PerSampleDownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsamplerFactory downsamplerFactory ) { + nestedSAMIterator = iter; + this.downsamplerFactory = downsamplerFactory; + perSampleDownsamplers = new HashMap>(); + orderedDownsampledReadsCache = new PriorityQueue(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator); + + advanceToNextRead(); + } + + public boolean hasNext() { + return nextRead != null; + } + + public SAMRecord next() { + if ( nextRead == null ) { + throw new NoSuchElementException("next() called when there are no more items"); + } + + SAMRecord toReturn = nextRead; + advanceToNextRead(); + + return toReturn; + } + + private void advanceToNextRead() { + if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) { + nextRead = null; + } + else { + nextRead = orderedDownsampledReadsCache.poll(); + } + } + + private boolean readyToReleaseReads() { + if ( orderedDownsampledReadsCache.isEmpty() ) { + return false; + } + + return earliestPendingRead == null || + readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0; + } + + private void updateEarliestPendingRead( ReadsDownsampler currentDownsampler ) { + // If there is no recorded earliest pending read and this downsampler has pending items, + // then this downsampler's first pending item becomes the new earliest pending read: + if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) { + earliestPendingRead = currentDownsampler.peekPending(); + earliestPendingDownsampler = currentDownsampler; + } + // In all other cases, we only need to update the earliest pending read when the downsampler + // associated with it experiences a change in its pending reads, since by assuming a sorted + // read stream we're assured that each downsampler's earliest pending read will only increase + // in genomic position over time. + // + // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers + // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))), + // TODO: but need to verify this empirically. + else if ( currentDownsampler == earliestPendingDownsampler && + (! currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) { + + earliestPendingRead = null; + earliestPendingDownsampler = null; + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + if ( perSampleDownsampler.hasPendingItems() && + (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) { + + earliestPendingRead = perSampleDownsampler.peekPending(); + earliestPendingDownsampler = perSampleDownsampler; + } + } + } + } + + private boolean fillDownsampledReadsCache() { + SAMRecord prevRead = null; + int numPositionalChanges = 0; + + // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue + // can be released without violating global sort order + while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) { + SAMRecord read = nestedSAMIterator.next(); + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + + ReadsDownsampler thisSampleDownsampler = perSampleDownsamplers.get(sampleName); + if ( thisSampleDownsampler == null ) { + thisSampleDownsampler = downsamplerFactory.newInstance(); + perSampleDownsamplers.put(sampleName, thisSampleDownsampler); + } + + thisSampleDownsampler.submit(read); + updateEarliestPendingRead(thisSampleDownsampler); + + if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) { + numPositionalChanges++; + } + + // If the number of times we've changed position exceeds a certain threshold, inform all + // downsamplers of the current position in the read stream. This is to prevent downsamplers + // for samples with sparser reads than others from getting stuck too long in a pending state. + if ( numPositionalChanges > DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL ) { + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + perSampleDownsampler.signalNoMoreReadsBefore(read); + updateEarliestPendingRead(perSampleDownsampler); + } + } + + prevRead = read; + } + + if ( ! nestedSAMIterator.hasNext() ) { + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + perSampleDownsampler.signalEndOfInput(); + } + earliestPendingRead = null; + earliestPendingDownsampler = null; + } + + for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) { + if ( perSampleDownsampler.hasFinalizedItems() ) { + orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems()); + } + } + + return readyToReleaseReads(); + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + public void close() { + nestedSAMIterator.close(); + } + + public Iterator iterator() { + return this; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java deleted file mode 100644 index f29c7728c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.*; - -/** - * Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions - * - * @author David Roazen - */ -public class PositionalDownsampler implements ReadsDownsampler { - - private int targetCoverage; - - private ReservoirDownsampler reservoir; - - private int currentContigIndex; - - private int currentAlignmentStart; - - private LinkedList pendingReads; - - private ArrayList finalizedReads; - - public PositionalDownsampler ( int targetCoverage ) { - this.targetCoverage = targetCoverage; - clear(); - } - - public void submit ( T newRead ) { - if ( readIsPastCurrentPosition(newRead) ) { - updateAndDownsamplePendingReads(); - } - - reservoir.submit(newRead); - updateCurrentPosition(newRead); - } - - public void submit ( Collection newReads ) { - for ( T read : newReads ) { - submit(read); - } - } - - public boolean hasDownsampledItems() { - return finalizedReads.size() > 0; - } - - public List consumeDownsampledItems() { - List toReturn = finalizedReads; - finalizedReads = new ArrayList(); - return toReturn; - } - - public boolean hasPendingItems() { - return pendingReads.size() > 0; - } - - public void signalEndOfInput() { - updateAndDownsamplePendingReads(); - - for ( PositionalReadGrouping group : pendingReads ) { - group.finalizeAllActiveReads(); - finalizedReads.addAll(group.getFinalizedReads()); - } - - pendingReads.clear(); - } - - public void clear() { - reservoir = new ReservoirDownsampler(targetCoverage); - pendingReads = new LinkedList(); - finalizedReads = new ArrayList(); - } - - public boolean requiresCoordinateSortOrder() { - return true; - } - - private void updateCurrentPosition ( T read ) { - currentContigIndex = read.getReferenceIndex(); - currentAlignmentStart = read.getAlignmentStart(); - } - - private boolean readIsPastCurrentPosition ( T read ) { - return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart; - } - - private void updateAndDownsamplePendingReads() { - finalizeOutOfScopeReads(); - - List oldLocusReads = reservoir.consumeDownsampledItems(); - pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart)); - - downsampleOverlappingGroups(); - } - - private void finalizeOutOfScopeReads() { - Iterator iter = pendingReads.iterator(); - boolean noPrecedingUnfinalizedGroups = true; - - while ( iter.hasNext() ) { - PositionalReadGrouping currentGroup = iter.next(); - currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart); - - if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) { - iter.remove(); - finalizedReads.addAll(currentGroup.getFinalizedReads()); - } - else { - noPrecedingUnfinalizedGroups = false; - } - } - } - - private void downsampleOverlappingGroups() { - int[] groupReadCounts = new int[pendingReads.size()]; - int totalCoverage = 0; - int numActiveGroups = 0; - int currentGroup = 0; - - for ( PositionalReadGrouping group : pendingReads ) { - groupReadCounts[currentGroup] = group.numActiveReads(); - totalCoverage += groupReadCounts[currentGroup]; - - if ( groupReadCounts[currentGroup] > 0 ) { - numActiveGroups++; - } - - currentGroup++; - } - - if ( totalCoverage <= targetCoverage ) { - return; - } - - int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups); - currentGroup = 0; - - while ( numReadsToRemove > 0 ) { - if ( groupReadCounts[currentGroup] > 1 ) { - groupReadCounts[currentGroup]--; - numReadsToRemove--; - } - - currentGroup = (currentGroup + 1) % groupReadCounts.length; - } - - currentGroup = 0; - for ( PositionalReadGrouping group : pendingReads ) { - if ( ! group.isFinalized() ) { - group.downsampleActiveReads(groupReadCounts[currentGroup]); - } - currentGroup++; - } - } - - private class PositionalReadGrouping { - private List activeReads; - private List finalizedReads; - - private int contig; - private int alignmentStart; - - public PositionalReadGrouping( Collection reads, int contig, int alignmentStart ) { - activeReads = new LinkedList(reads); - finalizedReads = new ArrayList(); - this.contig = contig; - this.alignmentStart = alignmentStart; - } - - public int numActiveReads() { - return activeReads.size(); - } - - public boolean isFinalized() { - return activeReads.size() == 0; - } - - public List getFinalizedReads() { - return finalizedReads; - } - - public void finalizeActiveReadsBeforePosition( int contig, int position ) { - if ( this.contig != contig ) { - finalizeAllActiveReads(); - return; - } - - Iterator iter = activeReads.iterator(); - - while ( iter.hasNext() ) { - T read = iter.next(); - if ( read.getAlignmentEnd() < position ) { - iter.remove(); - finalizedReads.add(read); - } - } - } - - public void finalizeAllActiveReads() { - finalizedReads.addAll(activeReads); - activeReads.clear(); - } - - public void downsampleActiveReads( int numReadsToKeep ) { - if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) { - throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads", - numReadsToKeep, activeReads.size())); - } - - BitSet itemsToKeep = new BitSet(activeReads.size()); - for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) { - itemsToKeep.set(selectedIndex); - } - - int currentIndex = 0; - Iterator iter = activeReads.iterator(); - - while ( iter.hasNext() ) { - T read = iter.next(); - - if ( ! itemsToKeep.get(currentIndex) ) { - iter.remove(); - } - - currentIndex++; - } - } - - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java index f78aaf4bf..3ff6f4454 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java @@ -33,8 +33,23 @@ import net.sf.samtools.SAMRecord; */ public interface ReadsDownsampler extends Downsampler { - /* + /** * Does this downsampler require that reads be fed to it in coordinate order? + * + * @return true if reads must be submitted to this downsampler in coordinate order, otherwise false */ public boolean requiresCoordinateSortOrder(); + + /** + * Tell this downsampler that no more reads located before the provided read (according to + * the sort order of the read stream) will be fed to it. + * + * Allows position-aware downsamplers to finalize pending reads earlier than they would + * otherwise be able to, particularly when doing per-sample downsampling and reads for + * certain samples are sparser than average. + * + * @param read the downsampler will assume that no reads located before this read will ever + * be submitted to it in the future + */ + public void signalNoMoreReadsBefore( T read ); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java new file mode 100644 index 000000000..2fa32497b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular + * downsampler, all sharing the same construction parameters. + * + * @author David Roazen + */ +public interface ReadsDownsamplerFactory { + public ReadsDownsampler newInstance(); +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java index cb40c7042..bab4734c4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java @@ -48,6 +48,14 @@ public class ReservoirDownsampler implements ReadsDownsampl private int totalReadsSeen; + private int numDiscardedItems; + + /** + * Construct a ReservoirDownsampler + * + * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained + * after downsampling will be min(totalReads, targetSampleSize) + */ public ReservoirDownsampler ( int targetSampleSize ) { if ( targetSampleSize <= 0 ) { throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0"); @@ -55,6 +63,7 @@ public class ReservoirDownsampler implements ReadsDownsampl this.targetSampleSize = targetSampleSize; clear(); + reset(); } public void submit ( T newRead ) { @@ -68,6 +77,7 @@ public class ReservoirDownsampler implements ReadsDownsampl if ( randomSlot < targetSampleSize ) { reservoir.set(randomSlot, newRead); } + numDiscardedItems++; } } @@ -77,11 +87,12 @@ public class ReservoirDownsampler implements ReadsDownsampl } } - public boolean hasDownsampledItems() { + public boolean hasFinalizedItems() { return reservoir.size() > 0; } - public List consumeDownsampledItems() { + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed List downsampledItems = reservoir; clear(); return downsampledItems; @@ -91,16 +102,36 @@ public class ReservoirDownsampler implements ReadsDownsampl return false; } + public T peekFinalized() { + return reservoir.isEmpty() ? null : reservoir.get(0); + } + + public T peekPending() { + return null; + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + public void signalEndOfInput() { // NO-OP } public void clear() { reservoir = new ArrayList(targetSampleSize); - totalReadsSeen = 0; + totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below + } + + public void reset() { + numDiscardedItems = 0; } public boolean requiresCoordinateSortOrder() { return false; } + + public void signalNoMoreReadsBefore( T read ) { + // NO-OP + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java new file mode 100644 index 000000000..040f0c788 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating ReservoirDownsamplers on demand + * + * @author David Roazen + */ +public class ReservoirDownsamplerFactory implements ReadsDownsamplerFactory { + + private int targetSampleSize; + + public ReservoirDownsamplerFactory( int targetSampleSize ) { + this.targetSampleSize = targetSampleSize; + } + + public ReadsDownsampler newInstance() { + return new ReservoirDownsampler(targetSampleSize); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java new file mode 100644 index 000000000..30affc2b3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +import java.util.*; + +/** + * Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage + * using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time. + * + * @author David Roazen + */ +public class SimplePositionalDownsampler implements ReadsDownsampler { + + private int targetCoverage; + + private ReservoirDownsampler reservoir; + + private int currentContigIndex; + + private int currentAlignmentStart; + + private boolean positionEstablished; + + private boolean unmappedReadsReached; + + private ArrayList finalizedReads; + + private int numDiscardedItems; + + /** + * Construct a SimplePositionalDownsampler + * + * @param targetCoverage Maximum number of reads that may share any given alignment start position + */ + public SimplePositionalDownsampler( int targetCoverage ) { + this.targetCoverage = targetCoverage; + reservoir = new ReservoirDownsampler(targetCoverage); + finalizedReads = new ArrayList(); + clear(); + reset(); + } + + public void submit( T newRead ) { + updatePositionalState(newRead); + + if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream + finalizedReads.add(newRead); + } + else { + int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems(); + reservoir.submit(newRead); + numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems; + } + } + + public void submit( Collection newReads ) { + for ( T read : newReads ) { + submit(read); + } + } + + public boolean hasFinalizedItems() { + return finalizedReads.size() > 0; + } + + public List consumeFinalizedItems() { + // pass by reference rather than make a copy, for speed + List toReturn = finalizedReads; + finalizedReads = new ArrayList(); + return toReturn; + } + + public boolean hasPendingItems() { + return reservoir.hasFinalizedItems(); + } + + public T peekFinalized() { + return finalizedReads.isEmpty() ? null : finalizedReads.get(0); + } + + public T peekPending() { + return reservoir.peekFinalized(); + } + + public int getNumberOfDiscardedItems() { + return numDiscardedItems; + } + + public void signalEndOfInput() { + finalizeReservoir(); + } + + public void clear() { + reservoir.clear(); + reservoir.reset(); + finalizedReads.clear(); + positionEstablished = false; + unmappedReadsReached = false; + } + + public void reset() { + numDiscardedItems = 0; + } + + public boolean requiresCoordinateSortOrder() { + return true; + } + + public void signalNoMoreReadsBefore( T read ) { + updatePositionalState(read); + } + + private void updatePositionalState( T newRead ) { + if ( readIsPastCurrentPosition(newRead) ) { + if ( reservoir.hasFinalizedItems() ) { + finalizeReservoir(); + } + + setCurrentPosition(newRead); + + if ( newRead.getReadUnmappedFlag() ) { + unmappedReadsReached = true; + } + } + } + + private void setCurrentPosition( T read ) { + currentContigIndex = read.getReferenceIndex(); + currentAlignmentStart = read.getAlignmentStart(); + positionEstablished = true; + } + + private boolean readIsPastCurrentPosition( T read ) { + return ! positionEstablished || + read.getReferenceIndex() > currentContigIndex || + read.getAlignmentStart() > currentAlignmentStart || + (read.getReadUnmappedFlag() && ! unmappedReadsReached); + } + + private void finalizeReservoir() { + finalizedReads.addAll(reservoir.consumeFinalizedItems()); + reservoir.reset(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java new file mode 100644 index 000000000..fcc18b16b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMRecord; + +/** + * Factory for creating SimplePositionalDownsamplers on demand + * + * @author David Roazen + */ +public class SimplePositionalDownsamplerFactory implements ReadsDownsamplerFactory { + + private int targetCoverage; + + public SimplePositionalDownsamplerFactory( int targetCoverage ) { + this.targetCoverage = targetCoverage; + } + + public ReadsDownsampler newInstance() { + return new SimplePositionalDownsampler(targetCoverage); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index da11d36dd..6c0dc9769 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; +import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -81,7 +82,13 @@ public class WindowMaker implements Iterable, I public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, Collection sampleNames) { this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); + + // Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested: + this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ? + new PeekableIterator(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames)) + : + new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); + this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java similarity index 88% rename from public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java rename to public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java index 835748ff0..c0de06b49 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java @@ -6,13 +6,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import java.util.Iterator; -public class DownsampleIterator implements StingSAMIterator { +public class LegacyDownsampleIterator implements StingSAMIterator { StingSAMIterator it; int cutoff; SAMRecord next; - public DownsampleIterator(StingSAMIterator it, double fraction) { + public LegacyDownsampleIterator(StingSAMIterator it, double fraction) { this.it = it; cutoff = (int)(fraction * 10000); next = getNextRecord(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 64f914064..46e84798a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -31,8 +31,8 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.DownsampleType; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLoc; diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java new file mode 100755 index 000000000..557cbd009 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java @@ -0,0 +1,649 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.picard.util.PeekableIterator; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.Downsampler; +import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.util.*; + +/** + * Iterator that traverses a SAM File, accumulating information on a per-locus basis + */ +public class LocusIteratorByStateExperimental extends LocusIterator { + /** + * our log, which we want to capture anything from this class + */ + private static Logger logger = Logger.getLogger(LocusIteratorByState.class); + + // ----------------------------------------------------------------------------------------------------------------- + // + // member fields + // + // ----------------------------------------------------------------------------------------------------------------- + + /** + * Used to create new GenomeLocs. + */ + private final GenomeLocParser genomeLocParser; + private final ArrayList samples; + private final ReadStateManager readStates; + + protected static class SAMRecordState { + SAMRecord read; + int readOffset = -1; // how far are we offset from the start of the read bases? + int genomeOffset = -1; // how far are we offset from the alignment start on the genome? + + Cigar cigar = null; + int cigarOffset = -1; + CigarElement curElement = null; + int nCigarElements = 0; + + int cigarElementCounter = -1; // how far are we into a single cigarElement + + // The logical model for generating extended events is as follows: the "record state" implements the traversal + // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This + // can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the + // deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or + // if the deletion just started *right before* the current reference base the record state is pointing to upon the return from + // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended + // events immediately preceding the current reference base). + + public SAMRecordState(SAMRecord read) { + this.read = read; + cigar = read.getCigar(); + nCigarElements = cigar.numCigarElements(); + + //System.out.printf("Creating a SAMRecordState: %s%n", this); + } + + public SAMRecord getRead() { + return read; + } + + /** + * What is our current offset in the read's bases that aligns us with the reference genome? + * + * @return + */ + public int getReadOffset() { + return readOffset; + } + + /** + * What is the current offset w.r.t. the alignment state that aligns us to the readOffset? + * + * @return + */ + public int getGenomeOffset() { + return genomeOffset; + } + + public int getGenomePosition() { + return read.getAlignmentStart() + getGenomeOffset(); + } + + public GenomeLoc getLocation(GenomeLocParser genomeLocParser) { + return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition()); + } + + public CigarOperator getCurrentCigarOperator() { + return curElement.getOperator(); + } + + public String toString() { + return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement); + } + + public CigarElement peekForwardOnGenome() { + return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement ); + } + + public CigarElement peekBackwardOnGenome() { + return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement ); + } + + + public CigarOperator stepForwardOnGenome() { + // we enter this method with readOffset = index of the last processed base on the read + // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion + + + if (curElement == null || ++cigarElementCounter > curElement.getLength()) { + cigarOffset++; + if (cigarOffset < nCigarElements) { + curElement = cigar.getCigarElement(cigarOffset); + cigarElementCounter = 0; + // next line: guards against cigar elements of length 0; when new cigar element is retrieved, + // we reenter in order to re-check cigarElementCounter against curElement's length + return stepForwardOnGenome(); + } else { + if (curElement != null && curElement.getOperator() == CigarOperator.D) + throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); + + // Reads that contain indels model the genomeOffset as the following base in the reference. Because + // we fall into this else block only when indels end the read, increment genomeOffset such that the + // current offset of this read is the next ref base after the end of the indel. This position will + // model a point on the reference somewhere after the end of the read. + genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here: + // we do step forward on the ref, and by returning null we also indicate that we are past the read end. + + return null; + } + } + + boolean done = false; + switch (curElement.getOperator()) { + case H: // ignore hard clips + case P: // ignore pads + cigarElementCounter = curElement.getLength(); + break; + case I: // insertion w.r.t. the reference + case S: // soft clip + cigarElementCounter = curElement.getLength(); + readOffset += curElement.getLength(); + break; + case D: // deletion w.r.t. the reference + if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string + throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar"); + // should be the same as N case + genomeOffset++; + done = true; + break; + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + genomeOffset++; + done = true; + break; + case M: + case EQ: + case X: + readOffset++; + genomeOffset++; + done = true; + break; + default: + throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator()); + } + + return done ? curElement.getOperator() : stepForwardOnGenome(); + } + } + + //final boolean DEBUG = false; + //final boolean DEBUG2 = false && DEBUG; + private ReadProperties readInfo; + private AlignmentContext nextAlignmentContext; + private boolean performLevelingDownsampling; + + // ----------------------------------------------------------------------------------------------------------------- + // + // constructors and other basic operations + // + // ----------------------------------------------------------------------------------------------------------------- + + public LocusIteratorByStateExperimental(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) { + this.readInfo = readInformation; + this.genomeLocParser = genomeLocParser; + this.samples = new ArrayList(samples); + this.readStates = new ReadStateManager(samIterator); + + this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null && + readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE && + readInfo.getDownsamplingMethod().toCoverage != null; + + // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when + // there's no read data. So we need to throw this error only when samIterator.hasNext() is true + if (this.samples.isEmpty() && samIterator.hasNext()) { + throw new IllegalArgumentException("samples list must not be empty"); + } + } + + /** + * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list + * for the system. + */ + public final static Collection sampleListForSAMWithoutReadGroups() { + List samples = new ArrayList(); + samples.add(null); + return samples; + } + + public Iterator iterator() { + return this; + } + + public void close() { + //this.it.close(); + } + + public boolean hasNext() { + lazyLoadNextAlignmentContext(); + return (nextAlignmentContext != null); + //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r); + } + + private GenomeLoc getLocation() { + return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser); + } + + // ----------------------------------------------------------------------------------------------------------------- + // + // next() routine and associated collection operations + // + // ----------------------------------------------------------------------------------------------------------------- + public AlignmentContext next() { + lazyLoadNextAlignmentContext(); + if (!hasNext()) + throw new NoSuchElementException("LocusIteratorByState: out of elements."); + AlignmentContext currentAlignmentContext = nextAlignmentContext; + nextAlignmentContext = null; + return currentAlignmentContext; + } + + /** + * Creates the next alignment context from the given state. Note that this is implemented as a lazy load method. + * nextAlignmentContext MUST BE null in order for this method to advance to the next entry. + */ + private void lazyLoadNextAlignmentContext() { + while (nextAlignmentContext == null && readStates.hasNext()) { + readStates.collectPendingReads(); + + final GenomeLoc location = getLocation(); + final Map fullPileup = new HashMap(); + + // TODO: How can you determine here whether the current pileup has been downsampled? + boolean hasBeenSampled = false; + + for (final String sample : samples) { + final Iterator iterator = readStates.iterator(sample); + final List pile = new ArrayList(readStates.size(sample)); + + int size = 0; // number of elements in this sample's pileup + int nDeletions = 0; // number of deletions in this sample's pileup + int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0) + + while (iterator.hasNext()) { + final SAMRecordState state = iterator.next(); // state object with the read/offset information + final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read + final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element + final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element + final boolean isSingleElementCigar = nextElement == lastElement; + final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator + final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator + int readOffset = state.getReadOffset(); // the base offset on this read + + final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION; + final boolean isAfterDeletion = lastOp == CigarOperator.DELETION; + final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION; + final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION && !isSingleElementCigar; + final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart()); + + int nextElementLength = nextElement.getLength(); + + if (op == CigarOperator.N) // N's are never added to any pileup + continue; + + if (op == CigarOperator.D) { + // TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix + if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so + pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1)); + size++; + nDeletions++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; + } + } + else { + if (!filterBaseInRead(read, location.getStart())) { + String insertedBaseString = null; + if (nextOp == CigarOperator.I) { + final int insertionOffset = isSingleElementCigar ? 0 : 1; + // TODO -- someone please implement a better fix for the single element insertion CIGAR! + if (isSingleElementCigar) + readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases! + insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength())); + } + + pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength)); + size++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; + } + } + } + + if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup + fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads)); + } + + updateReadStates(); // critical - must be called after we get the current state offsets and location + if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done + nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled); + } + } + + // fast testing of position + private boolean readIsPastCurrentPosition(SAMRecord read) { + if (readStates.isEmpty()) + return false; + else { + SAMRecordState state = readStates.getFirst(); + SAMRecord ourRead = state.getRead(); + return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition(); + } + } + + /** + * Generic place to put per-base filters appropriate to LocusIteratorByState + * + * @param rec + * @param pos + * @return + */ + private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) { + return ReadUtils.isBaseInsideAdaptor(rec, pos); + } + + private void updateReadStates() { + for (final String sample : samples) { + Iterator it = readStates.iterator(sample); + while (it.hasNext()) { + SAMRecordState state = it.next(); + CigarOperator op = state.stepForwardOnGenome(); + if (op == null) { + // we discard the read only when we are past its end AND indel at the end of the read (if any) was + // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe + // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag. + it.remove(); // we've stepped off the end of the object + } + } + } + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + protected class ReadStateManager { + private final PeekableIterator iterator; + private final SamplePartitioner samplePartitioner; + private final Map readStatesBySample = new HashMap(); + private int totalReadStates = 0; + + public ReadStateManager(Iterator source) { + this.iterator = new PeekableIterator(source); + + for (final String sample : samples) { + readStatesBySample.put(sample, new PerSampleReadStateManager()); + } + + samplePartitioner = new SamplePartitioner(); + } + + /** + * Returns a iterator over all the reads associated with the given sample. Note that remove() is implemented + * for this iterator; if present, total read states will be decremented. + * + * @param sample The sample. + * @return Iterator over the reads associated with that sample. + */ + public Iterator iterator(final String sample) { + return new Iterator() { + private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); + + public boolean hasNext() { + return wrappedIterator.hasNext(); + } + + public SAMRecordState next() { + return wrappedIterator.next(); + } + + public void remove() { + wrappedIterator.remove(); + } + }; + } + + public boolean isEmpty() { + return totalReadStates == 0; + } + + /** + * Retrieves the total number of reads in the manager across all samples. + * + * @return Total number of reads over all samples. + */ + public int size() { + return totalReadStates; + } + + /** + * Retrieves the total number of reads in the manager in the given sample. + * + * @param sample The sample. + * @return Total number of reads in the given sample. + */ + public int size(final String sample) { + return readStatesBySample.get(sample).size(); + } + + public SAMRecordState getFirst() { + for (final String sample : samples) { + PerSampleReadStateManager reads = readStatesBySample.get(sample); + if (!reads.isEmpty()) + return reads.peek(); + } + return null; + } + + public boolean hasNext() { + return totalReadStates > 0 || iterator.hasNext(); + } + + public void collectPendingReads() { + if (!iterator.hasNext()) + return; + + if (readStates.size() == 0) { + int firstContigIndex = iterator.peek().getReferenceIndex(); + int firstAlignmentStart = iterator.peek().getAlignmentStart(); + while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) { + samplePartitioner.submitRead(iterator.next()); + } + } else { + // Fast fail in the case that the read is past the current position. + if (readIsPastCurrentPosition(iterator.peek())) + return; + + while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { + samplePartitioner.submitRead(iterator.next()); + } + } + + for (final String sample : samples) { + Collection newReads = samplePartitioner.getReadsForSample(sample); + PerSampleReadStateManager statesBySample = readStatesBySample.get(sample); + addReadsToSample(statesBySample, newReads); + } + + samplePartitioner.reset(); + } + + /** + * Add reads with the given sample name to the given hanger entry. + * + * @param readStates The list of read states to add this collection of reads. + * @param reads Reads to add. Selected reads will be pulled from this source. + */ + private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) { + if (reads.isEmpty()) + return; + + Collection newReadStates = new LinkedList(); + + for (SAMRecord read : reads) { + SAMRecordState state = new SAMRecordState(read); + state.stepForwardOnGenome(); + newReadStates.add(state); + } + + readStates.addStatesAtNextAlignmentStart(newReadStates); + } + + protected class PerSampleReadStateManager implements Iterable { + private List> readStatesByAlignmentStart = new LinkedList>(); + private int thisSampleReadStates = 0; + private Downsampler> levelingDownsampler = + performLevelingDownsampling ? + new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) : + null; + + public void addStatesAtNextAlignmentStart(Collection states) { + if ( states.isEmpty() ) { + return; + } + + readStatesByAlignmentStart.add(new LinkedList(states)); + thisSampleReadStates += states.size(); + totalReadStates += states.size(); + + if ( levelingDownsampler != null ) { + levelingDownsampler.submit(readStatesByAlignmentStart); + levelingDownsampler.signalEndOfInput(); + + thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems(); + + // use returned List directly rather than make a copy, for efficiency's sake + readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems(); + levelingDownsampler.reset(); + } + } + + public boolean isEmpty() { + return readStatesByAlignmentStart.isEmpty(); + } + + public SAMRecordState peek() { + return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek(); + } + + public int size() { + return thisSampleReadStates; + } + + public Iterator iterator() { + return new Iterator() { + private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator(); + private LinkedList currentPositionReadStates = null; + private Iterator currentPositionReadStatesIterator = null; + + public boolean hasNext() { + return alignmentStartIterator.hasNext() || + (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext()); + } + + public SAMRecordState next() { + if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) { + currentPositionReadStates = alignmentStartIterator.next(); + currentPositionReadStatesIterator = currentPositionReadStates.iterator(); + } + + return currentPositionReadStatesIterator.next(); + } + + public void remove() { + currentPositionReadStatesIterator.remove(); + thisSampleReadStates--; + totalReadStates--; + + if ( currentPositionReadStates.isEmpty() ) { + alignmentStartIterator.remove(); + } + } + }; + } + } + } + + /** + * Note: stores reads by sample ID string, not by sample object + */ + private class SamplePartitioner { + private Map> readsBySample; + private long readsSeen = 0; + + public SamplePartitioner() { + readsBySample = new HashMap>(); + + for ( String sample : samples ) { + readsBySample.put(sample, new ArrayList()); + } + } + + public void submitRead(SAMRecord read) { + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + if (readsBySample.containsKey(sampleName)) + readsBySample.get(sampleName).add(read); + readsSeen++; + } + + public long getNumReadsSeen() { + return readsSeen; + } + + public Collection getReadsForSample(String sampleName) { + if ( ! readsBySample.containsKey(sampleName) ) + throw new NoSuchElementException("Sample name not found"); + return readsBySample.get(sampleName); + } + + public void reset() { + for ( Collection perSampleReads : readsBySample.values() ) + perSampleReads.clear(); + readsSeen = 0; + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index 3ffe95e8b..9578bba56 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -10,13 +10,11 @@ import java.util.Iterator; * Verifies that the incoming stream of reads is correctly sorted */ public class VerifyingSamIterator implements StingSAMIterator { - private GenomeLocParser genomeLocParser; StingSAMIterator it; SAMRecord last = null; boolean checkOrderP = true; - public VerifyingSamIterator(GenomeLocParser genomeLocParser,StingSAMIterator it) { - this.genomeLocParser = genomeLocParser; + public VerifyingSamIterator(StingSAMIterator it) { this.it = it; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java index d662b0092..de2cd836c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Downsample.java @@ -1,6 +1,6 @@ package org.broadinstitute.sting.gatk.walkers; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import java.lang.annotation.*; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index c5b043b7a..44b0d74ca 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -30,7 +30,7 @@ import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 32ceff715..0d1997252 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 0c096ea73..759ec1cc6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -75,6 +75,17 @@ public class MathUtils { } } + /** + * Get a random int between min and max (inclusive) using the global GATK random number generator + * + * @param min lower bound of the range + * @param max upper bound of the range + * @return a random int >= min and <= max + */ + public static int randomIntegerInRange( int min, int max ) { + return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min; + } + // A fast implementation of the Math.round() method. This method does not perform // under/overflow checking, so this shouldn't be used in the general case (but is fine // if one is already make those checks before calling in to the rounding). @@ -1655,5 +1666,4 @@ public class MathUtils { return result; } - } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 3d986f666..ed6fc46bb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -613,6 +613,8 @@ public abstract class AbstractReadBackedPileup { + + private Collection perSampleArtificialReadStreams; + private MergingSamRecordIterator mergingIterator; + + public ArtificialMultiSampleReadStream( Collection perSampleArtificialReadStreams ) { + if ( perSampleArtificialReadStreams == null || perSampleArtificialReadStreams.isEmpty() ) { + throw new ReviewedStingException("Can't create an ArtificialMultiSampleReadStream out of 0 ArtificialSingleSampleReadStreams"); + } + + this.perSampleArtificialReadStreams = perSampleArtificialReadStreams; + } + + public Iterator iterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return mergingIterator; + } + + public StingSAMIterator getStingSAMIterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return StingSAMIteratorAdapter.adapt(mergingIterator); + } + + private void initialize() { + Collection perSampleSAMReaders = new ArrayList(perSampleArtificialReadStreams.size()); + Collection headers = new ArrayList(perSampleArtificialReadStreams.size()); + + for ( ArtificialSingleSampleReadStream readStream : perSampleArtificialReadStreams ) { + Collection thisStreamReads = readStream.makeReads(); + + SAMFileReader reader = new ArtificialSAMFileReader(readStream.getHeader(), + thisStreamReads.toArray(new SAMRecord[thisStreamReads.size()])); + perSampleSAMReaders.add(reader); + headers.add(reader.getFileHeader()); + } + + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); + mergingIterator = new MergingSamRecordIterator(headerMerger, perSampleSAMReaders, true); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java index adf60b16b..0b5fa391d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMFileReader.java @@ -40,8 +40,11 @@ public class ArtificialSAMFileReader extends SAMFileReader { */ private final List reads; + private SAMFileHeader customHeader = null; + /** * Construct an artificial SAM file reader. + * @param sequenceDictionary sequence dictionary used to initialize our GenomeLocParser * @param reads Reads to use as backing data source. */ public ArtificialSAMFileReader(SAMSequenceDictionary sequenceDictionary,SAMRecord... reads) { @@ -50,6 +53,30 @@ public class ArtificialSAMFileReader extends SAMFileReader { this.reads = Arrays.asList(reads); } + /** + * Construct an artificial SAM file reader with the given SAM file header + * + * @param customHeader Header that should be returned by calls to getFileHeader() on this reader + * @param reads Reads to use as backing data source. + */ + public ArtificialSAMFileReader( SAMFileHeader customHeader, SAMRecord... reads ) { + super(createEmptyInputStream(),true); + + this.customHeader = customHeader; + this.genomeLocParser = new GenomeLocParser(customHeader.getSequenceDictionary()); + this.reads = Arrays.asList(reads); + } + + + @Override + public SAMFileHeader getFileHeader() { + if ( customHeader != null ) { + return customHeader; + } + + return super.getFileHeader(); + } + /** * @{inheritDoc} */ diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index d0211db07..0859957a3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -276,6 +276,30 @@ public class ArtificialSAMUtils { return Arrays.asList(left, right); } + /** + * Create a collection of identical artificial reads based on the parameters. The cigar string for each + * read will be *M, where * is the length of the read. + * + * Useful for testing things like positional downsampling where you care only about the position and + * number of reads, and not the other attributes. + * + * @param stackSize number of identical reads to create + * @param header the SAM header to associate each read with + * @param name name associated with each read + * @param refIndex the reference index, i.e. what chromosome to associate them with + * @param alignmentStart where to start each alignment + * @param length the length of each read + * + * @return a collection of stackSize reads all sharing the above properties + */ + public static Collection createStackOfIdenticalArtificialReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + Collection stack = new ArrayList(stackSize); + for ( int i = 1; i <= stackSize; i++ ) { + stack.add(createArtificialRead(header, name, refIndex, alignmentStart, length)); + } + return stack; + } + /** * create an iterator containing the specified read piles * diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java new file mode 100644 index 000000000..a9480692b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; + +/** + * An artificial stream of reads from a single read group/sample with configurable characteristics + * such as: + * + * -the number of contigs that the reads should be distributed across + * -number of "stacks" of reads sharing the same alignment start position per contig + * -the min/max number of reads in each stack (exact values chosen randomly from this range) + * -the min/max distance between stack start positions (exact values chosen randomly from this range) + * -the min/max length of each read (exact values chosen randomly from this range) + * -the number of unmapped reads + * + * The cigar string for all reads will be *M, where * is the length of the read. + * + * @author David Roazen + */ +public class ArtificialSingleSampleReadStream implements Iterable { + private SAMFileHeader header; + private String readGroupID; + private int numContigs; + private int numStacksPerContig; + private int minReadsPerStack; + private int maxReadsPerStack; + private int minDistanceBetweenStacks; + private int maxDistanceBetweenStacks; + private int minReadLength; + private int maxReadLength; + private int numUnmappedReads; + + private static final String READ_GROUP_TAG = "RG"; + + public ArtificialSingleSampleReadStream( SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + this.header = header; + this.readGroupID = readGroupID; + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + this.minReadLength = minReadLength; + this.maxReadLength = maxReadLength; + this.numUnmappedReads = numUnmappedReads; + + validateStreamParameters(); + } + + private void validateStreamParameters() { + if ( header == null || readGroupID == null ) { + throw new ReviewedStingException("null SAMFileHeader or read group ID") ; + } + + if ( header.getReadGroup(readGroupID) == null ) { + throw new ReviewedStingException(String.format("Read group %s not found in SAMFileHeader", readGroupID)); + } + + if ( numContigs < 0 || numStacksPerContig < 0 || minReadsPerStack < 0 || maxReadsPerStack < 0 || + minDistanceBetweenStacks < 0 || maxDistanceBetweenStacks < 0 || minReadLength < 0 || maxReadLength < 0 || + numUnmappedReads < 0 ) { + throw new ReviewedStingException("Read stream parameters must be >= 0"); + } + + if ( (numContigs == 0 && numStacksPerContig != 0) || (numContigs != 0 && numStacksPerContig == 0) ) { + throw new ReviewedStingException("numContigs and numStacksPerContig must either both be > 0, or both be 0"); + } + + if ( minReadsPerStack > maxReadsPerStack ) { + throw new ReviewedStingException("minReadsPerStack > maxReadsPerStack"); + } + + if ( minDistanceBetweenStacks > maxDistanceBetweenStacks ) { + throw new ReviewedStingException("minDistanceBetweenStacks > maxDistanceBetweenStacks"); + } + + if ( minReadLength > maxReadLength ) { + throw new ReviewedStingException("minReadLength > maxReadLength"); + } + } + + public Iterator iterator() { + return makeReads().iterator(); + } + + public StingSAMIterator getStingSAMIterator() { + return StingSAMIteratorAdapter.adapt(iterator()); + } + + public Collection makeReads() { + Collection reads = new ArrayList(numContigs * numStacksPerContig * maxReadsPerStack); + + for ( int contig = 0; contig < numContigs; contig++ ) { + int alignmentStart = 1; + + for ( int stack = 0; stack < numStacksPerContig; stack++ ) { + reads.addAll(makeReadStack(contig, alignmentStart, MathUtils.randomIntegerInRange(minReadsPerStack, maxReadsPerStack))); + alignmentStart += MathUtils.randomIntegerInRange(minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + if ( numUnmappedReads > 0 ) { + reads.addAll(makeReadStack(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START, numUnmappedReads)); + } + + return reads; + } + + private Collection makeReadStack( int contig, int alignmentStart, int stackSize ) { + Collection readStack = new ArrayList(stackSize); + + for ( int i = 0; i < stackSize; i++ ) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, + "foo", + contig, + alignmentStart, + MathUtils.randomIntegerInRange(minReadLength, maxReadLength)); + read.setAttribute(READ_GROUP_TAG, readGroupID); + readStack.add(read); + } + + return readStack; + } + + public SAMFileHeader getHeader() { + return header; + } + + public String getReadGroupID() { + return readGroupID; + } + + public int getNumContigs() { + return numContigs; + } + + public int getNumStacksPerContig() { + return numStacksPerContig; + } + + public int getMinReadsPerStack() { + return minReadsPerStack; + } + + public int getMaxReadsPerStack() { + return maxReadsPerStack; + } + + public int getMinDistanceBetweenStacks() { + return minDistanceBetweenStacks; + } + + public int getMaxDistanceBetweenStacks() { + return maxDistanceBetweenStacks; + } + + public int getMinReadLength() { + return minReadLength; + } + + public int getMaxReadLength() { + return maxReadLength; + } + + public int getNumUnmappedReads() { + return numUnmappedReads; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java new file mode 100644 index 000000000..a4d7c5146 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.List; + +/** + * A class for analyzing and validating the read stream produced by an ArtificialSingleSampleReadStream. + * + * Collects various statistics about the stream of reads it's fed, and validates the stream + * by checking whether the collected statistics match the nominal properties of the stream. + * + * Subclasses are expected to override the validate() method in order to check whether an artificial + * read stream has been *transformed* in some way (eg., by downsampling or some other process), rather + * than merely checking whether the stream matches its original properties. + * + * Usage is simple: + * + * ArtificialSingleSampleReadStreamAnalyzer analyzer = new ArtificialSingleSampleReadStreamAnalyzer(originalStream); + * analyzer.analyze(originalOrTransformedStream); + * analyzer.validate(); // override this method if you want to check whether the stream has been transformed + * // in a certain way relative to the original stream + * + * @author David Roazen + */ +public class ArtificialSingleSampleReadStreamAnalyzer { + protected ArtificialSingleSampleReadStream originalStream; + protected SAMRecord lastRead; + protected int totalReads; + protected boolean allSamplesMatch; + protected int numContigs; + protected List stacksPerContig; + protected Integer minReadsPerStack; + protected Integer maxReadsPerStack; + protected Integer minDistanceBetweenStacks; + protected Integer maxDistanceBetweenStacks; + protected Integer minReadLength; + protected Integer maxReadLength; + protected int numUnmappedReads; + + protected int currentContigNumStacks; + protected int currentStackNumReads; + + /** + * Construct a new read stream analyzer, providing an ArtificialSingleSampleReadStream that will + * serve as the basis for comparison after the analysis is complete. + * + * @param originalStream the original ArtificialSingleSampleReadStream upon which the stream + * that will be fed to the analyzer is based + */ + public ArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream ) { + this.originalStream = originalStream; + reset(); + } + + /** + * Reset all read stream statistics collected by this analyzer to prepare for a fresh run + */ + public void reset() { + lastRead = null; + totalReads = 0; + allSamplesMatch = true; + numContigs = 0; + stacksPerContig = new ArrayList(); + minReadsPerStack = null; + maxReadsPerStack = null; + minDistanceBetweenStacks = null; + maxDistanceBetweenStacks = null; + minReadLength = null; + maxReadLength = null; + numUnmappedReads = 0; + currentContigNumStacks = 0; + currentStackNumReads = 0; + } + + /** + * Collect statistics on the stream of reads passed in + * + * @param stream the stream of reads to analyze + */ + public void analyze( Iterable stream ) { + for ( SAMRecord read : stream ) { + update(read); + } + finalizeStats(); + } + + /** + * Validate the stream by checking whether our collected statistics match the properties of the + * original stream. Throws a ReviewedStingException if the stream is invalid. + * + * Override this method if you want to check whether the stream has been transformed in some + * way relative to the original stream. + */ + public void validate() { + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { + if ( totalReads != 0 ) { + throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); + } + return; // no further validation needed for the 0-reads case + } + else if ( totalReads == 0 ) { + throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); + } + + if ( ! allSamplesMatch ) { + throw new ReviewedStingException("some reads had the wrong sample"); + } + + if ( numContigs != originalStream.getNumContigs() ) { + throw new ReviewedStingException("number of contigs not correct"); + } + + if ( stacksPerContig.size() != originalStream.getNumContigs() ) { + throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", + stacksPerContig.size(), originalStream.getNumContigs())); + } + + for ( int contigStackCount : stacksPerContig ) { + if ( contigStackCount != originalStream.getNumStacksPerContig() ) { + throw new ReviewedStingException("contig had incorrect number of stacks"); + } + } + + if ( originalStream.getNumStacksPerContig() > 0 ) { + if ( minReadsPerStack < originalStream.getMinReadsPerStack() ) { + throw new ReviewedStingException("stack had fewer than the minimum number of reads"); + } + if ( maxReadsPerStack > originalStream.getMaxReadsPerStack() ) { + throw new ReviewedStingException("stack had more than the maximum number of reads"); + } + } + else if ( minReadsPerStack != null || maxReadsPerStack != null ) { + throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); + } + + if ( originalStream.getNumStacksPerContig() > 1 ) { + if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by less than the minimum distance"); + } + if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by more than the maximum distance"); + } + } + else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { + throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); + } + + if ( minReadLength < originalStream.getMinReadLength() ) { + throw new ReviewedStingException("read was shorter than the minimum allowed length"); + } + if ( maxReadLength > originalStream.getMaxReadLength() ) { + throw new ReviewedStingException("read was longer than the maximum allowed length"); + } + + if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { + throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", + originalStream.getNumUnmappedReads(), numUnmappedReads)); + } + + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && + numUnmappedReads != totalReads ) { + throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); + } + } + + public void update( SAMRecord read ) { + if ( read.getReadUnmappedFlag() ) { + numUnmappedReads++; + + if ( numUnmappedReads == 1 && lastRead != null ) { + processContigChange(); + numContigs--; + } + } + else if ( lastRead == null ) { + numContigs = 1; + currentContigNumStacks = 1; + currentStackNumReads = 1; + } + else if ( ! read.getReferenceIndex().equals(lastRead.getReferenceIndex()) ) { + processContigChange(); + } + else if ( read.getAlignmentStart() != lastRead.getAlignmentStart() ) { + processStackChangeWithinContig(read); + } + else { + currentStackNumReads++; + } + + updateReadLength(read.getReadLength()); + allSamplesMatch = allSamplesMatch && readHasCorrectSample(read); + totalReads++; + + lastRead = read; + } + + + private void processContigChange() { + numContigs++; + + stacksPerContig.add(currentContigNumStacks); + currentContigNumStacks = 1; + + updateReadsPerStack(currentStackNumReads); + currentStackNumReads = 1; + } + + private void processStackChangeWithinContig( SAMRecord read ) { + currentContigNumStacks++; + + updateReadsPerStack(currentStackNumReads); + currentStackNumReads = 1; + + updateDistanceBetweenStacks(read.getAlignmentStart() - lastRead.getAlignmentStart()); + } + + private void updateReadsPerStack( int stackReadCount ) { + if ( minReadsPerStack == null || stackReadCount < minReadsPerStack ) { + minReadsPerStack = stackReadCount; + } + if ( maxReadsPerStack == null || stackReadCount > maxReadsPerStack ) { + maxReadsPerStack = stackReadCount; + } + } + + private void updateDistanceBetweenStacks( int stackDistance ) { + if ( minDistanceBetweenStacks == null || stackDistance < minDistanceBetweenStacks ) { + minDistanceBetweenStacks = stackDistance; + } + if ( maxDistanceBetweenStacks == null || stackDistance > maxDistanceBetweenStacks ) { + maxDistanceBetweenStacks = stackDistance; + } + } + + private void updateReadLength( int readLength ) { + if ( minReadLength == null || readLength < minReadLength ) { + minReadLength = readLength; + } + if ( maxReadLength == null || readLength > maxReadLength ) { + maxReadLength = readLength; + } + } + + private boolean readHasCorrectSample( SAMRecord read ) { + return originalStream.getReadGroupID().equals(read.getAttribute("RG")); + } + + public void finalizeStats() { + if ( lastRead != null && ! lastRead.getReadUnmappedFlag() ) { + stacksPerContig.add(currentContigNumStacks); + updateReadsPerStack(currentStackNumReads); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 5aeb741ec..d2bfabacf 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -29,7 +29,7 @@ import net.sf.picard.filter.FilteringIterator; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.qc.CountLoci; import org.broadinstitute.sting.utils.GenomeLocParser; import java.util.Collections; @@ -97,7 +98,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { }, PER_SAMPLE { @Override - DownsamplingMethod create() { return GATKArgumentCollection.getDefaultDownsamplingMethod(); } + DownsamplingMethod create() { return DownsamplingMethod.getDefaultDownsamplingMethod(new CountLoci(), false); } }; abstract DownsamplingMethod create(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index 730b3f410..9df849940 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -25,36 +25,40 @@ package org.broadinstitute.sting.gatk.datasources.reads; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMProgramRecord; -import net.sf.samtools.SAMRecord; +import net.sf.samtools.*; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import org.testng.Assert; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.List; import static org.testng.Assert.*; /** - * @author aaron - * @version 1.0 - * @date Apr 8, 2009 *

    * Class SAMDataSourceUnitTest *

    @@ -66,6 +70,161 @@ public class SAMDataSourceUnitTest extends BaseTest { private IndexedFastaSequenceFile seq; private GenomeLocParser genomeLocParser; + + /*********************************** + * Tests for the fillShard() method + ***********************************/ + + /** + * Tests to ensure that the fillShard() method does not place shard boundaries at inappropriate places, + * such as within an alignment start position + */ + private static class SAMDataSourceFillShardBoundaryTest extends TestDataProvider { + private int numContigs; + private int numStacksPerContig; + private int stackSize; + private int numUnmappedReads; + private DownsamplingMethod downsamplingMethod; + + private SAMFileHeader header; + + public SAMDataSourceFillShardBoundaryTest( int numContigs, + int numStacksPerContig, + int stackSize, + int numUnmappedReads, + int downsamplingTargetCoverage ) { + super(SAMDataSourceFillShardBoundaryTest.class); + + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.stackSize = stackSize; + this.numUnmappedReads = numUnmappedReads; + + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true); + + setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", + getClass().getSimpleName(), numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage)); + } + + public void run() { + SAMDataSource dataSource = new SAMDataSource(Arrays.asList(createTestBAM()), + new ThreadAllocation(), + null, + new GenomeLocParser(header.getSequenceDictionary()), + false, + SAMFileReader.ValidationStringency.SILENT, + null, + downsamplingMethod, + new ValidationExclusion(), + new ArrayList(), + false); + + Assert.assertTrue(dataSource.usingExpandedShards()); + + Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); + + SAMRecord readAtEndOfLastShard = null; + + for ( Shard shard : shardIterator ) { + int numContigsThisShard = 0; + SAMRecord lastRead = null; + + for ( SAMRecord read : shard.iterator() ) { + if ( lastRead == null ) { + numContigsThisShard = 1; + } + else if ( ! read.getReadUnmappedFlag() && ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) { + numContigsThisShard++; + } + + // If the last read from the previous shard is not unmapped, we have to make sure + // that no reads in this shard start at the same position + if ( readAtEndOfLastShard != null && ! readAtEndOfLastShard.getReadUnmappedFlag() ) { + Assert.assertFalse(readAtEndOfLastShard.getReferenceIndex().equals(read.getReferenceIndex()) && + readAtEndOfLastShard.getAlignmentStart() == read.getAlignmentStart(), + String.format("Reads from alignment start position %d:%d are split across multiple shards", + read.getReferenceIndex(), read.getAlignmentStart())); + } + + lastRead = read; + } + + // There should never be reads from more than 1 contig in a shard (ignoring unmapped reads) + Assert.assertTrue(numContigsThisShard == 1, "found a shard with reads from multiple contigs"); + + readAtEndOfLastShard = lastRead; + } + } + + private SAMReaderID createTestBAM() { + header = ArtificialSAMUtils.createArtificialSamHeader(numContigs, 1, 100000); + SAMReadGroupRecord readGroup = new SAMReadGroupRecord("foo"); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + ArtificialSingleSampleReadStream artificialReads = new ArtificialSingleSampleReadStream(header, + "foo", + numContigs, + numStacksPerContig, + stackSize, + stackSize, + 1, + 100, + 50, + 150, + numUnmappedReads); + + File testBAMFile; + try { + testBAMFile = File.createTempFile("SAMDataSourceFillShardBoundaryTest", ".bam"); + testBAMFile.deleteOnExit(); + } + catch ( IOException e ) { + throw new ReviewedStingException(String.format("Failed to create temp bam file for test %s. %s", this, e.getMessage())); + } + + SAMFileWriter bamWriter = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, true, testBAMFile); + for ( SAMRecord read : artificialReads ) { + bamWriter.addAlignment(read); + } + bamWriter.close(); + + return new SAMReaderID(testBAMFile, new Tags()); + } + } + + @DataProvider(name = "SAMDataSourceFillShardTestDataProvider") + public Object[][] createSAMDataSourceFillShardBoundaryTests() { + // Take downsampling out of the equation for these tests -- we are only interested in whether the + // shard boundaries occur at the right places in the read stream, and removing downsampling as a + // factor simplifies that task (note that we still need to provide a specific downsampling method with + // experimental downsampling enabled to trigger the shard expansion behavior, for now) + int downsamplingTargetCoverage = ReadShard.MAX_READS * 10; + + for ( int numContigs = 1; numContigs <= 3; numContigs++ ) { + for ( int numStacksPerContig : Arrays.asList(1, 2, 4) ) { + // Use crucial read shard boundary values as the stack sizes + for ( int stackSize : Arrays.asList(ReadShard.MAX_READS / 2, ReadShard.MAX_READS / 2 + 10, ReadShard.MAX_READS, ReadShard.MAX_READS - 1, ReadShard.MAX_READS + 1, ReadShard.MAX_READS * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, ReadShard.MAX_READS / 2, ReadShard.MAX_READS * 2) ) { + new SAMDataSourceFillShardBoundaryTest(numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage); + } + } + } + } + + return SAMDataSourceFillShardBoundaryTest.getTests(SAMDataSourceFillShardBoundaryTest.class); + } + + // TODO: re-enable these tests once the issues with filepointer ordering + the downsamplers are worked out + @Test(dataProvider = "SAMDataSourceFillShardTestDataProvider", enabled = false) + public void testSAMDataSourceFillShard( SAMDataSourceFillShardBoundaryTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } + + + // TODO: the legacy tests below should really be replaced with a more comprehensive suite of tests for SAMDataSource + /** * This function does the setup of our parser, before each method call. *

    diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java index b0de78b97..b0a8ff065 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java @@ -1,73 +1,138 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; -import java.util.Collection; +import java.util.Arrays; -public class DownsamplingReadsIteratorUnitTest { +public class DownsamplingReadsIteratorUnitTest extends BaseTest { - @Test - public void testDownsamplingIteratorWithPositionalDownsampling() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + private static class DownsamplingReadsIteratorTest extends TestDataProvider { + private DownsamplingReadsIterator downsamplingIter; + private int targetCoverage; + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; - Collection reads = new ArrayList(); + public DownsamplingReadsIteratorTest( ArtificialSingleSampleReadStream stream, int targetCoverage ) { + super(DownsamplingReadsIteratorTest.class); - reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 1, 100)); - reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 50, 100)); + this.stream = stream; + this.targetCoverage = targetCoverage; - StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); - - Assert.assertTrue(iter.hasNext()); - SAMRecord previous = iter.next(); - int count = 1; - - while ( iter.hasNext() ) { - SAMRecord current = iter.next(); - Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); - count++; - previous = current; + setName(String.format("%s: targetCoverage=%d numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + getClass().getSimpleName(), + targetCoverage, + stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); } - Assert.assertEquals(count, 1000); + public void run() { + streamAnalyzer = new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(stream, targetCoverage); + downsamplingIter = new DownsamplingReadsIterator(stream.getStingSAMIterator(), new SimplePositionalDownsampler(targetCoverage)); + + streamAnalyzer.analyze(downsamplingIter); + + // Check whether the observed properties of the downsampled stream are what they should be + streamAnalyzer.validate(); + + // Allow memory used by this test to be reclaimed + stream = null; + streamAnalyzer = null; + downsamplingIter = null; + } } - @Test - public void testDownsamplingIteratorNoEffectiveDownsampling() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + @DataProvider(name = "DownsamplingReadsIteratorTestDataProvider") + public Object[][] createDownsamplingReadsIteratorTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(5, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); - Collection reads = new ArrayList(); + // Values that don't vary across tests + int targetCoverage = 10; + int minReadLength = 50; + int maxReadLength = 100; + int minDistanceBetweenStacks = 1; + int maxDistanceBetweenStacks = maxReadLength + 1; - reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); - reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); + GenomeAnalysisEngine.resetRandomGenerator(); - StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); - - Assert.assertTrue(iter.hasNext()); - SAMRecord previous = iter.next(); - int count = 1; - - while ( iter.hasNext() ) { - SAMRecord current = iter.next(); - Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); - count++; - previous = current; + // brute force testing! + for ( int numContigs : Arrays.asList(1, 2, 5) ) { + for ( int stacksPerContig : Arrays.asList(1, 2, 10) ) { + for ( int minReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int maxReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, 1, targetCoverage, targetCoverage * 2) ) { + // Only interested in sane read stream configurations here + if ( minReadsPerStack <= maxReadsPerStack ) { + new DownsamplingReadsIteratorTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads), + targetCoverage); + } + } + } + } + } } - Assert.assertEquals(count, 600); + return DownsamplingReadsIteratorTest.getTests(DownsamplingReadsIteratorTest.class); } - private ArrayList createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { - ArrayList stack = new ArrayList(stackSize); - for ( int i = 1; i <= stackSize; i++ ) { - stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); - } - return stack; + @Test(dataProvider = "DownsamplingReadsIteratorTestDataProvider") + public void runDownsamplingReadsIteratorTest( DownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java index 0f4bae555..3bf1096b1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java @@ -1,65 +1,157 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; import java.util.List; -public class FractionalDownsamplerUnitTest { +public class FractionalDownsamplerUnitTest extends BaseTest { - @Test - public void test100PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(1.0); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + private static class FractionalDownsamplerTest extends TestDataProvider { + double fraction; + int totalReads; + int expectedMinNumReadsAfterDownsampling; + int expectedMaxNumReadsAfterDownsampling; + int expectedMinDiscardedItems; + int expectedMaxDiscardedItems; - downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); + private static final double EXPECTED_ACCURACY = 0.05; // should be accurate to within +/- this percent - List downsampledReads = downsampler.consumeDownsampledItems(); + public FractionalDownsamplerTest( double fraction, int totalReads ) { + super(FractionalDownsamplerTest.class); - Assert.assertTrue(downsampledReads.size() == 1000); - } + this.fraction = fraction; + this.totalReads = totalReads; - @Test - public void test0PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(0.0); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + calculateExpectations(); - downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.isEmpty()); - } - - @Test - public void test50PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(0.5); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - downsampler.submit(createRandomReads(5000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.size() >= 2000 && downsampledReads.size() <= 3000); - } - - private List createRandomReads( int numReads, SAMFileHeader header, String name, int contigIndex, int maxAlignmentStart, int maxLength ) { - List reads = new ArrayList(numReads); - - for ( int i = 1; i <= numReads; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, name, contigIndex, - GenomeAnalysisEngine.getRandomGenerator().nextInt(maxAlignmentStart) + 1, - GenomeAnalysisEngine.getRandomGenerator().nextInt(maxLength) + 1)); + setName(String.format("%s: fraction=%.2f totalReads=%d expectedMinNumReadsAfterDownsampling=%d expectedMaxNumReadsAfterDownsampling=%d", + getClass().getSimpleName(), fraction, totalReads, expectedMinNumReadsAfterDownsampling, expectedMaxNumReadsAfterDownsampling)); } - return reads; + private void calculateExpectations() { + // Require an exact match in the 0% and 100% cases + if ( fraction == 0.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = 0; + expectedMinDiscardedItems = expectedMaxDiscardedItems = totalReads; + } + else if ( fraction == 1.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = totalReads; + expectedMinDiscardedItems = expectedMaxDiscardedItems = 0; + } + else { + expectedMinNumReadsAfterDownsampling = Math.max((int)((fraction - EXPECTED_ACCURACY) * totalReads), 0); + expectedMaxNumReadsAfterDownsampling = Math.min((int) ((fraction + EXPECTED_ACCURACY) * totalReads), totalReads); + expectedMinDiscardedItems = totalReads - expectedMaxNumReadsAfterDownsampling; + expectedMaxDiscardedItems = totalReads - expectedMinNumReadsAfterDownsampling; + } + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "FractionalDownsamplerTestDataProvider") + public Object[][] createFractionalDownsamplerTestData() { + for ( double fraction : Arrays.asList(0.0, 0.25, 0.5, 0.75, 1.0) ) { + for ( int totalReads : Arrays.asList(0, 1000, 10000) ) { + new FractionalDownsamplerTest(fraction, totalReads); + } + } + + return FractionalDownsamplerTest.getTests(FractionalDownsamplerTest.class); + } + + @Test(dataProvider = "FractionalDownsamplerTestDataProvider") + public void runFractionalDownsamplerTest( FractionalDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new FractionalDownsampler(test.fraction); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertTrue(downsampledReads.size() >= test.expectedMinNumReadsAfterDownsampling && + downsampledReads.size() <= test.expectedMaxNumReadsAfterDownsampling); + + Assert.assertTrue(downsampler.getNumberOfDiscardedItems() >= test.expectedMinDiscardedItems && + downsampler.getNumberOfDiscardedItems() <= test.expectedMaxDiscardedItems); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size()); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java new file mode 100644 index 000000000..2717d014c --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; + +public class LevelingDownsamplerUnitTest extends BaseTest { + + private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider { + public enum DataStructure { LINKED_LIST, ARRAY_LIST } + + int targetSize; + int numStacks; + int stackSize; + DataStructure dataStructure; + int expectedSize; + + public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) { + super(LevelingDownsamplerUniformStacksTest.class); + + this.targetSize = targetSize; + this.numStacks = numStacks; + this.stackSize = stackSize; + this.dataStructure = dataStructure; + expectedSize = calculateExpectedDownsampledStackSize(); + + setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d", + getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize)); + } + + public Collection> createStacks() { + Collection> stacks = new ArrayList>(); + + for ( int i = 1; i <= numStacks; i++ ) { + List stack = dataStructure == DataStructure.LINKED_LIST ? new LinkedList() : new ArrayList(); + + for ( int j = 1; j <= stackSize; j++ ) { + stack.add(new Object()); + } + + stacks.add(stack); + } + + return stacks; + } + + private int calculateExpectedDownsampledStackSize() { + int numItemsToRemove = numStacks * stackSize - targetSize; + + if ( numStacks == 0 ) { + return 0; + } + else if ( numItemsToRemove <= 0 ) { + return stackSize; + } + + return Math.max(1, stackSize - (numItemsToRemove / numStacks)); + } + } + + @DataProvider(name = "UniformStacksDataProvider") + public Object[][] createUniformStacksTestData() { + for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) { + for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) { + new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure); + } + } + } + } + + return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class); + } + + @Test( dataProvider = "UniformStacksDataProvider" ) + public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + Downsampler> downsampler = new LevelingDownsampler, Object>(test.targetSize); + + downsampler.submit(test.createStacks()); + + if ( test.numStacks > 0 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List> downsampledStacks = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledStacks.size(), test.numStacks); + + int totalRemainingItems = 0; + for ( List stack : downsampledStacks ) { + Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1); + totalRemainingItems += stack.size(); + } + + int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); + int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; + + Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + + Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java new file mode 100644 index 000000000..b9022900b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.VerifyingSamIterator; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialMultiSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class PerSampleDownsamplingReadsIteratorUnitTest extends BaseTest { + + private static class PerSampleDownsamplingReadsIteratorTest extends TestDataProvider { + + // TODO: tests should distinguish between variance across samples and variance within a sample + + private enum StreamDensity { + SPARSE (MAX_READ_LENGTH, MAX_READ_LENGTH * 2), + DENSE (1, MIN_READ_LENGTH), + MIXED (1, MAX_READ_LENGTH * 2), + UNIFORM_DENSE (1, 1), + UNIFORM_SPARSE (MAX_READ_LENGTH * 2, MAX_READ_LENGTH * 2); + + int minDistanceBetweenStacks; + int maxDistanceBetweenStacks; + + StreamDensity( int minDistanceBetweenStacks, int maxDistanceBetweenStacks ) { + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + } + + public String toString() { + return String.format("StreamDensity:%d-%d", minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + private enum StreamStackDepth { + NON_UNIFORM_LOW (1, 5), + NON_UNIFORM_HIGH (15, 20), + NON_UNIFORM_MIXED (1, 20), + UNIFORM_SINGLE (1, 1), + UNIFORM_LOW (2, 2), + UNIFORM_HIGH (20, 20), + UNIFORM_MEDIUM (10, 10); // should set target coverage to this value for testing + + int minReadsPerStack; + int maxReadsPerStack; + + StreamStackDepth( int minReadsPerStack, int maxReadsPerStack ) { + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + } + + public boolean isUniform() { + return minReadsPerStack == maxReadsPerStack; + } + + public String toString() { + return String.format("StreamStackDepth:%d-%d", minReadsPerStack, maxReadsPerStack); + } + } + + private enum StreamStacksPerContig { + UNIFORM(20, 20), + NON_UNIFORM(1, 30); + + int minStacksPerContig; + int maxStacksPerContig; + + StreamStacksPerContig( int minStacksPerContig, int maxStacksPerContig ) { + this.minStacksPerContig = minStacksPerContig; + this.maxStacksPerContig = maxStacksPerContig; + } + + public boolean isUniform() { + return minStacksPerContig == maxStacksPerContig; + } + + public String toString() { + return String.format("StreamStacksPerContig:%d-%d", minStacksPerContig, maxStacksPerContig); + } + } + + // Not interested in testing multiple ranges for the read lengths, as none of our current + // downsamplers are affected by read length + private static final int MIN_READ_LENGTH = 50; + private static final int MAX_READ_LENGTH = 150; + + private ReadsDownsamplerFactory downsamplerFactory; + private int targetCoverage; + private int numSamples; + private int minContigs; + private int maxContigs; + private StreamDensity streamDensity; + private StreamStackDepth streamStackDepth; + private StreamStacksPerContig streamStacksPerContig; + private double unmappedReadsFraction; + private int unmappedReadsCount; + private boolean verifySortedness; + + private ArtificialMultiSampleReadStream mergedReadStream; + private Map perSampleArtificialReadStreams; + private Map perSampleStreamAnalyzers; + private SAMFileHeader header; + + public PerSampleDownsamplingReadsIteratorTest( ReadsDownsamplerFactory downsamplerFactory, + int targetCoverage, + int numSamples, + int minContigs, + int maxContigs, + StreamDensity streamDensity, + StreamStackDepth streamStackDepth, + StreamStacksPerContig streamStacksPerContig, + double unmappedReadsFraction, + int unmappedReadsCount, + boolean verifySortedness ) { + super(PerSampleDownsamplingReadsIteratorTest.class); + + this.downsamplerFactory = downsamplerFactory; + this.targetCoverage = targetCoverage; + this.numSamples = numSamples; + this.minContigs = minContigs; + this.maxContigs = maxContigs; + this.streamDensity = streamDensity; + this.streamStackDepth = streamStackDepth; + this.streamStacksPerContig = streamStacksPerContig; + this.unmappedReadsFraction = unmappedReadsFraction; + this.unmappedReadsCount = unmappedReadsCount; + this.verifySortedness = verifySortedness; + + header = createHeader(); + createReadStreams(); + + setName(String.format("%s: targetCoverage=%d numSamples=%d minContigs=%d maxContigs=%d %s %s %s unmappedReadsFraction=%.2f unmappedReadsCount=%d verifySortedness=%b", + getClass().getSimpleName(), targetCoverage, numSamples, minContigs, maxContigs, streamDensity, streamStackDepth, streamStacksPerContig, unmappedReadsFraction, unmappedReadsCount, verifySortedness)); + } + + private SAMFileHeader createHeader() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(maxContigs, 1, (streamDensity.maxDistanceBetweenStacks + MAX_READ_LENGTH) * streamStacksPerContig.maxStacksPerContig + 100000); + List readGroups = new ArrayList(numSamples); + List sampleNames = new ArrayList(numSamples); + + for ( int i = 0; i < numSamples; i++ ) { + readGroups.add("ReadGroup" + i); + sampleNames.add("Sample" + i); + } + + return ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroups, sampleNames); + } + + private void createReadStreams() { + perSampleArtificialReadStreams = new HashMap(numSamples); + perSampleStreamAnalyzers = new HashMap(numSamples); + + for (SAMReadGroupRecord readGroup : header.getReadGroups() ) { + String readGroupID = readGroup.getReadGroupId(); + String sampleName = readGroup.getSample(); + + int thisSampleNumContigs = MathUtils.randomIntegerInRange(minContigs, maxContigs); + int thisSampleStacksPerContig = MathUtils.randomIntegerInRange(streamStacksPerContig.minStacksPerContig, streamStacksPerContig.maxStacksPerContig); + + int thisSampleNumUnmappedReads = GenomeAnalysisEngine.getRandomGenerator().nextDouble() < unmappedReadsFraction ? unmappedReadsCount : 0; + + ArtificialSingleSampleReadStream thisSampleStream = new ArtificialSingleSampleReadStream(header, + readGroupID, + thisSampleNumContigs, + thisSampleStacksPerContig, + streamStackDepth.minReadsPerStack, + streamStackDepth.maxReadsPerStack, + streamDensity.minDistanceBetweenStacks, + streamDensity.maxDistanceBetweenStacks, + MIN_READ_LENGTH, + MAX_READ_LENGTH, + thisSampleNumUnmappedReads); + perSampleArtificialReadStreams.put(sampleName, thisSampleStream); + perSampleStreamAnalyzers.put(sampleName, new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(thisSampleStream, targetCoverage)); + } + + mergedReadStream = new ArtificialMultiSampleReadStream(perSampleArtificialReadStreams.values()); + } + + public void run() { + StingSAMIterator downsamplingIter = new PerSampleDownsamplingReadsIterator(mergedReadStream.getStingSAMIterator(), downsamplerFactory); + + if ( verifySortedness ) { + downsamplingIter = new VerifyingSamIterator(downsamplingIter); + } + + while ( downsamplingIter.hasNext() ) { + SAMRecord read = downsamplingIter.next(); + String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; + + ArtificialSingleSampleReadStreamAnalyzer analyzer = perSampleStreamAnalyzers.get(sampleName); + if ( analyzer != null ) { + analyzer.update(read); + } + else { + throw new ReviewedStingException("bug: stream analyzer for sample " + sampleName + " not found"); + } + } + + for ( Map.Entry analyzerEntry : perSampleStreamAnalyzers.entrySet() ) { + ArtificialSingleSampleReadStreamAnalyzer analyzer = analyzerEntry.getValue(); + analyzer.finalizeStats(); + + // Validate the downsampled read stream for each sample individually + analyzer.validate(); + } + + // Allow memory used by this test to be reclaimed: + mergedReadStream = null; + perSampleArtificialReadStreams = null; + perSampleStreamAnalyzers = null; + } + } + + @DataProvider(name = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public Object[][] createPerSampleDownsamplingReadsIteratorTests() { + + GenomeAnalysisEngine.resetRandomGenerator(); + + // Some values don't vary across tests + int targetCoverage = PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.UNIFORM_MEDIUM.minReadsPerStack; + ReadsDownsamplerFactory downsamplerFactory = new SimplePositionalDownsamplerFactory(targetCoverage); + int maxContigs = 3; + boolean verifySortedness = true; + + for ( int numSamples : Arrays.asList(1, 2, 10) ) { + for ( int minContigs = 1; minContigs <= maxContigs; minContigs++ ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamDensity streamDensity : PerSampleDownsamplingReadsIteratorTest.StreamDensity.values() ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamStackDepth streamStackDepth : PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.values() ) { + for (PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig streamStacksPerContig : PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig.values() ) { + for ( double unmappedReadsFraction : Arrays.asList(0.0, 1.0, 0.5) ) { + for ( int unmappedReadsCount : Arrays.asList(1, 50) ) { + new PerSampleDownsamplingReadsIteratorTest(downsamplerFactory, + targetCoverage, + numSamples, + minContigs, + maxContigs, + streamDensity, + streamStackDepth, + streamStacksPerContig, + unmappedReadsFraction, + unmappedReadsCount, + verifySortedness); + } + } + } + } + } + } + } + + return PerSampleDownsamplingReadsIteratorTest.getTests(PerSampleDownsamplingReadsIteratorTest.class); + } + + @Test(dataProvider = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public void runPerSampleDownsamplingReadsIteratorTest( PerSampleDownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java deleted file mode 100644 index b1d8e45c9..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java +++ /dev/null @@ -1,357 +0,0 @@ -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.*; - -// TODO: generalize these tests so that all possible arrangements of 1-4 stacks can be tested -public class PositionalDownsamplerUnitTest extends BaseTest { - - /** - * ------- - * ------- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeOverlappingIdenticalStacks() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ------- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeNonOverlappingIdenticalStacks() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 201, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 301, 100)); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeNonOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) == 1000); - Assert.assertTrue(downsampledStackSizes.get(1) == 1000); - Assert.assertTrue(downsampledStackSizes.get(2) == 1000); - } - - /** - * --- - * --- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeStacksWithShortStackAtBeginning() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 20, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackAtBeginning: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ------- - * --- - * --- - * ------- - * ------- - */ - @Test - public void testThreeStacksWithShortStackInMiddle() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 75, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackInMiddle: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------ - * ------ - * ------- - * ------- - * --- - * --- - */ - @Test - public void testThreeStacksWithShortStackAtEnd() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 135, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackAtEnd: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ---- - * ------- - * ---- - * ------- - * ------- - */ - @Test - public void testThreePartiallyOverlappingStacks() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 1, 100, 50)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 75, 100, 50)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(2000, header, "foo", 0, 150, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreePartiallyOverlappingStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - - // TODO: need to examine per-base coverage here - } - - @Test - public void testNoDownsamplingRequired() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 25, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testNoDownsamplingRequired: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) == 300); - Assert.assertTrue(downsampledStackSizes.get(1) == 300); - Assert.assertTrue(downsampledStackSizes.get(2) == 300); - } - - @Test - public void testGATKSAMRecordSupport() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - List reads = new ArrayList(); - for ( int i = 0; i < 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - List downsampledReads = downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.size() == 10); - } - - private ArrayList createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { - ArrayList stack = new ArrayList(stackSize); - for ( int i = 1; i <= stackSize; i++ ) { - stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); - } - return stack; - } - - private ArrayList createStackOfVaryingReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int firstLength, int secondLength ) { - ArrayList stack = createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, firstLength); - stack.addAll(createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, secondLength)); - return stack; - } - - private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { - List stackSizes = new ArrayList(); - Iterator iter = downsampledReads.iterator(); - Assert.assertTrue(iter.hasNext()); - - SAMRecord previousRead = iter.next(); - int currentStackSize = 1; - - while ( iter.hasNext() ) { - SAMRecord currentRead = iter.next(); - - if ( ! currentRead.getReferenceIndex().equals(previousRead.getReferenceIndex()) || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { - stackSizes.add(currentStackSize); - currentStackSize = 1; - } - else if ( currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { - Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); - } - else { - currentStackSize++; - } - - previousRead = currentRead; - } - - stackSizes.add(currentStackSize); - return stackSizes; - } -} - diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java new file mode 100644 index 000000000..9cbd0db8a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; + +/** + * Class for analyzing an artificial read stream that has been positionally downsampled, and verifying + * that the downsampling was done correctly without changing the stream in unexpected ways. + * + * @author David Roazen + */ +public class PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer extends ArtificialSingleSampleReadStreamAnalyzer { + private int targetCoverage; + + public PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream, int targetCoverage ) { + super(originalStream); + this.targetCoverage = targetCoverage; + } + + /** + * Overridden validate() method that checks for the effects of positional downsampling in addition to checking + * for whether the original properties of the stream not affected by downsampling have been preserved + */ + @Override + public void validate() { + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { + if ( totalReads != 0 ) { + throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); + } + return; // no further validation needed for the 0-reads case + } + else if ( totalReads == 0 ) { + throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); + } + + if ( ! allSamplesMatch ) { + throw new ReviewedStingException("some reads had the wrong sample"); + } + + if ( numContigs != originalStream.getNumContigs() ) { + throw new ReviewedStingException("number of contigs not correct"); + } + + if ( stacksPerContig.size() != originalStream.getNumContigs() ) { + throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", + stacksPerContig.size(), originalStream.getNumContigs())); + } + + for ( int contigStackCount : stacksPerContig ) { + if ( contigStackCount != originalStream.getNumStacksPerContig() ) { + throw new ReviewedStingException("contig had incorrect number of stacks"); + } + } + + if ( originalStream.getNumStacksPerContig() > 0 ) { + + // Check for the effects of positional downsampling: + int stackMinimumAfterDownsampling = Math.min(targetCoverage, originalStream.getMinReadsPerStack()); + int stackMaximumAfterDownsampling = targetCoverage; + + if ( minReadsPerStack < stackMinimumAfterDownsampling ) { + throw new ReviewedStingException("stack had fewer than the minimum number of reads after downsampling"); + } + if ( maxReadsPerStack > stackMaximumAfterDownsampling ) { + throw new ReviewedStingException("stack had more than the maximum number of reads after downsampling"); + } + } + else if ( minReadsPerStack != null || maxReadsPerStack != null ) { + throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); + } + + if ( originalStream.getNumStacksPerContig() > 1 ) { + if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by less than the minimum distance"); + } + if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by more than the maximum distance"); + } + } + else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { + throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); + } + + if ( minReadLength < originalStream.getMinReadLength() ) { + throw new ReviewedStingException("read was shorter than the minimum allowed length"); + } + if ( maxReadLength > originalStream.getMaxReadLength() ) { + throw new ReviewedStingException("read was longer than the maximum allowed length"); + } + + if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { + throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", + originalStream.getNumUnmappedReads(), numUnmappedReads)); + } + + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && + numUnmappedReads != totalReads ) { + throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java new file mode 100644 index 000000000..75d0448c4 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class ReservoirDownsamplerUnitTest extends BaseTest { + + private static class ReservoirDownsamplerTest extends TestDataProvider { + int reservoirSize; + int totalReads; + int expectedNumReadsAfterDownsampling; + int expectedNumDiscardedItems; + + public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) { + super(ReservoirDownsamplerTest.class); + + this.reservoirSize = reservoirSize; + this.totalReads = totalReads; + + expectedNumReadsAfterDownsampling = Math.min(reservoirSize, totalReads); + expectedNumDiscardedItems = totalReads <= reservoirSize ? 0 : totalReads - reservoirSize; + + setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d", + getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems)); + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "ReservoirDownsamplerTestDataProvider") + public Object[][] createReservoirDownsamplerTestData() { + for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, 0); + for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, totalReads); + } + } + + return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class); + } + + @Test(dataProvider = "ReservoirDownsamplerTestDataProvider") + public void testReservoirDownsampler( ReservoirDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new ReservoirDownsampler(test.reservoirSize); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); + Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java new file mode 100644 index 000000000..5dc41b4a0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.*; + +public class SimplePositionalDownsamplerUnitTest extends BaseTest { + + private static class SimplePositionalDownsamplerTest extends TestDataProvider { + int targetCoverage; + int numStacks; + List stackSizes; + List expectedStackSizes; + boolean multipleContigs; + int totalInitialReads; + + public SimplePositionalDownsamplerTest( int targetCoverage, List stackSizes, boolean multipleContigs ) { + super(SimplePositionalDownsamplerTest.class); + + this.targetCoverage = targetCoverage; + this.numStacks = stackSizes.size(); + this.stackSizes = stackSizes; + this.multipleContigs = multipleContigs; + + calculateExpectedDownsampledStackSizes(); + + totalInitialReads = 0; + for ( Integer stackSize : stackSizes ) { + totalInitialReads += stackSize; + } + + setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b", + getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs)); + } + + public Collection createReads() { + Collection reads = new ArrayList(); + SAMFileHeader header = multipleContigs ? + ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) : + ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + int refIndex = 0; + int alignmentStart = 1; + int readLength = 100; + + for ( int i = 0; i < numStacks; i++ ) { + if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) { + refIndex++; + } + + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo", + refIndex, alignmentStart, readLength)); + + alignmentStart += 10; + } + + return reads; + } + + private void calculateExpectedDownsampledStackSizes() { + expectedStackSizes = new ArrayList(numStacks); + + for ( Integer stackSize : stackSizes ) { + int expectedSize = targetCoverage >= stackSize ? stackSize : targetCoverage; + expectedStackSizes.add(expectedSize); + } + } + } + + @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider") + public Object[][] createSimplePositionalDownsamplerTestData() { + GenomeAnalysisEngine.resetRandomGenerator(); + + for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) { + for ( int contigs = 1; contigs <= 2; contigs++ ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + List stackSizes = new ArrayList(numStacks); + for ( int stack = 1; stack <= numStacks; stack++ ) { + stackSizes.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(targetCoverage * 2) + 1); + } + new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1); + } + } + } + + return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class); + } + + @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" ) + public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new SimplePositionalDownsampler(test.targetCoverage); + + downsampler.submit(test.createReads()); + + if ( test.numStacks > 1 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else if ( test.numStacks == 1 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + if ( test.numStacks == 0 ) { + Assert.assertTrue(downsampledReads.isEmpty()); + } + else { + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads); + + Assert.assertEquals(downsampledStackSizes.size(), test.numStacks); + Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes); + + int numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size(); + int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems(); + Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); + } + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } + + private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { + List stackSizes = new ArrayList(); + + if ( downsampledReads.isEmpty() ) { + return stackSizes; + } + + Iterator iter = downsampledReads.iterator(); + Assert.assertTrue(iter.hasNext()); + + SAMRecord previousRead = iter.next(); + int currentStackSize = 1; + + while ( iter.hasNext() ) { + SAMRecord currentRead = iter.next(); + + if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { + stackSizes.add(currentStackSize); + currentStackSize = 1; + } + else if ( currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { + Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); + } + else { + currentStackSize++; + } + + previousRead = currentRead; + } + + stackSizes.add(currentStackSize); + return stackSizes; + } + + @Test + public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100)); + downsampler.submit(readStack); + + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + + SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100); + downsampler.signalNoMoreReadsBefore(laterRead); + + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), readStack.size()); + } + + @Test + public void testBasicUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : readStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(readStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), readStack.size()); + + for ( SAMRecord read: downsampledReads ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + } + + @Test + public void testMixedMappedAndUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection mappedReadStack = new ArrayList(); + mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100)); + for ( SAMRecord read : mappedReadStack ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + + Collection unmappedReadStack = new ArrayList(); + unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : unmappedReadStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(mappedReadStack); + downsampler.submit(unmappedReadStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), 300); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100); + + int count = 1; + for ( SAMRecord read: downsampledReads ) { + if ( count <= 100 ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + else { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + count++; + } + } + + @Test + public void testGATKSAMRecordSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + List reads = new ArrayList(); + for ( int i = 0; i < 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), 10); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java new file mode 100644 index 000000000..c148bcf84 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java @@ -0,0 +1,546 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * testing of the experimental version of LocusIteratorByState + */ +public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { + private static SAMFileHeader header; + private LocusIteratorByStateExperimental li; + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + private final LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { + return new LocusIteratorByStateExperimental(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups()); + } + + private static ReadProperties createTestReadProperties() { + return createTestReadProperties(null); + } + + private static ReadProperties createTestReadProperties( DownsamplingMethod downsamplingMethod ) { + return new ReadProperties( + Collections.emptyList(), + new SAMFileHeader(), + false, + SAMFileReader.ValidationStringency.STRICT, + downsamplingMethod, + new ValidationExclusion(), + Collections.emptyList(), + Collections.emptyList(), + false, + (byte) -1 + ); + } + + private static class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; + + public FakeCloseableIterator(Iterator it) { + iterator = it; + } + + @Override + public void close() { + return; + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } + } + + @Test + public void testXandEQOperators() { + final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] bases2 = new byte[] {'A','A','A','C','A','A','A','A','A','C'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10); + r1.setReadBases(bases1); + r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r1.setCigarString("10M"); + + SAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10); + r2.setReadBases(bases2); + r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r2.setCigarString("3=1X5=1X"); + + SAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10); + r3.setReadBases(bases2); + r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r3.setCigarString("3=1X5M1X"); + + SAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10); + r4.setReadBases(bases2); + r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r4.setCigarString("10M"); + + List reads = Arrays.asList(r1, r2, r3, r4); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + Assert.assertEquals(pileup.depthOfCoverage(), 4); + } + } + + @Test + public void testIndelsInRegularPileup() { + final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] indelBases = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); + before.setReadBases(bases); + before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + before.setCigarString("10M"); + + SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10); + during.setReadBases(indelBases); + during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + during.setCigarString("4M2I6M"); + + SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10); + after.setReadBases(bases); + after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + after.setCigarString("10M"); + + List reads = Arrays.asList(before, during, after); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + boolean foundIndel = false; + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10); + for (PileupElement p : pileup) { + if (p.isBeforeInsertion()) { + foundIndel = true; + Assert.assertEquals(p.getEventLength(), 2, "Wrong event length"); + Assert.assertEquals(p.getEventBases(), "CT", "Inserted bases are incorrect"); + break; + } + } + + } + + Assert.assertTrue(foundIndel,"Indel in pileup not found"); + } + + @Test + public void testWholeIndelReadInIsolation() { + final int firstLocus = 44367789; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76)); + indelOnlyRead.setCigarString("76I"); + + List reads = Arrays.asList(indelOnlyRead); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, readAttributes); + + // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read + // and considers it to be an indel-containing read. + Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled"); + AlignmentContext alignmentContext = li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location."); + ReadBackedPileup basePileup = alignmentContext.getBasePileup(); + Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); + Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) do + * not negatively influence the ordering of the pileup. + */ + @Test + public void testWholeIndelRead() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76); + leadingRead.setReadBases(Utils.dupBytes((byte)'A',76)); + leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + leadingRead.setCigarString("1M75I"); + + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + indelOnlyRead.setCigarString("76I"); + + SAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76); + fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76)); + fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76)); + fullMatchAfterIndel.setCigarString("75I1M"); + + List reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + int currentLocus = firstLocus; + int numAlignmentContextsFound = 0; + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect"); + + if(currentLocus == firstLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus); + } + else if(currentLocus == secondLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),2,"Wrong number of reads at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(0),indelOnlyRead,"indelOnlyRead absent from pileup at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(1),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus); + } + + currentLocus++; + numAlignmentContextsFound++; + } + + Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly + */ + @Test + public void testWholeIndelReadRepresentedTest() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1); + read1.setReadBases(Utils.dupBytes((byte) 'A', 1)); + read1.setBaseQualities(Utils.dupBytes((byte) '@', 1)); + read1.setCigarString("1I"); + + List reads = Arrays.asList(read1); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "A"); + } + + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10); + read2.setReadBases(Utils.dupBytes((byte) 'A', 10)); + read2.setBaseQualities(Utils.dupBytes((byte) '@', 10)); + read2.setCigarString("10I"); + + reads = Arrays.asList(read2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA"); + } + } + + //////////////////////////////////////////// + // comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////// + + private static final int IS_BEFORE_DELETED_BASE_FLAG = 1; + private static final int IS_BEFORE_DELETION_START_FLAG = 2; + private static final int IS_AFTER_DELETED_BASE_FLAG = 4; + private static final int IS_AFTER_DELETION_END_FLAG = 8; + private static final int IS_BEFORE_INSERTION_FLAG = 16; + private static final int IS_AFTER_INSERTION_FLAG = 32; + private static final int IS_NEXT_TO_SOFTCLIP_FLAG = 64; + + private static class LIBSTest { + + + final String cigar; + final int readLength; + final List offsets; + final List flags; + + private LIBSTest(final String cigar, final int readLength, final List offsets, final List flags) { + this.cigar = cigar; + this.readLength = readLength; + this.offsets = offsets; + this.flags = flags; + } + } + + @DataProvider(name = "LIBSTest") + public Object[][] createLIBSTestData() { + return new Object[][]{ + {new LIBSTest("1I", 1, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("10I", 10, Arrays.asList(0), Arrays.asList(IS_BEFORE_INSERTION_FLAG))}, + {new LIBSTest("2M2I2M", 6, Arrays.asList(0,1,4,5), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG,IS_AFTER_INSERTION_FLAG,0))}, + {new LIBSTest("2M2I", 4, Arrays.asList(0,1), Arrays.asList(0,IS_BEFORE_INSERTION_FLAG))}, + //TODO -- uncomment these when LIBS is fixed + //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, + //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, + //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))}, + {new LIBSTest("1M2D2M", 3, Arrays.asList(0,1,2), Arrays.asList(IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG,0))}, + {new LIBSTest("1S1M", 2, Arrays.asList(1), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1M1S", 2, Arrays.asList(0), Arrays.asList(IS_NEXT_TO_SOFTCLIP_FLAG))}, + {new LIBSTest("1S1M1I", 3, Arrays.asList(1), Arrays.asList(IS_BEFORE_INSERTION_FLAG | IS_NEXT_TO_SOFTCLIP_FLAG))} + }; + } + + @Test(dataProvider = "LIBSTest") + public void testLIBS(LIBSTest params) { + final int locus = 44367788; + + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength)); + read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength)); + read.setCigarString(params.cigar); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(Arrays.asList(read), createTestReadProperties()); + + int offset = 0; + while ( li.hasNext() ) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + + final int flag = params.flags.get(offset); + Assert.assertEquals(pe.isBeforeDeletedBase(), (flag & IS_BEFORE_DELETED_BASE_FLAG) != 0); + Assert.assertEquals(pe.isBeforeDeletionStart(), (flag & IS_BEFORE_DELETION_START_FLAG) != 0); + Assert.assertEquals(pe.isAfterDeletedBase(), (flag & IS_AFTER_DELETED_BASE_FLAG) != 0); + Assert.assertEquals(pe.isAfterDeletionEnd(), (flag & IS_AFTER_DELETION_END_FLAG) != 0); + Assert.assertEquals(pe.isBeforeInsertion(), (flag & IS_BEFORE_INSERTION_FLAG) != 0); + Assert.assertEquals(pe.isAfterInsertion(), (flag & IS_AFTER_INSERTION_FLAG) != 0); + Assert.assertEquals(pe.isNextToSoftClip(), (flag & IS_NEXT_TO_SOFTCLIP_FLAG) != 0); + + Assert.assertEquals(pe.getOffset(), params.offsets.get(offset).intValue()); + + offset++; + } + } + + //////////////////////////////////////////////// + // End comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////////// + + + /////////////////////////////////////// + // Read State Manager Tests // + /////////////////////////////////////// + + private class PerSampleReadStateManagerTest extends TestDataProvider { + private List readCountsPerAlignmentStart; + private List reads; + private List> recordStatesByAlignmentStart; + private int removalInterval; + + public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { + super(PerSampleReadStateManagerTest.class); + + this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; + this.removalInterval = removalInterval; + + reads = new ArrayList(); + recordStatesByAlignmentStart = new ArrayList>(); + + setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", + getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); + } + + public void run() { + LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList(), createTestReadProperties()); + LocusIteratorByStateExperimental.ReadStateManager readStateManager = + libs.new ReadStateManager(new ArrayList().iterator()); + LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = + readStateManager.new PerSampleReadStateManager(); + + makeReads(); + + for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { + perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + } + + // read state manager should have the right number of reads + Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); + + Iterator originalReadsIterator = reads.iterator(); + Iterator recordStateIterator = perSampleReadStateManager.iterator(); + int recordStateCount = 0; + int numReadStatesRemoved = 0; + + // Do a first-pass validation of the record state iteration by making sure we get back everything we + // put in, in the same order, doing any requested removals of read states along the way + while ( recordStateIterator.hasNext() ) { + LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); + recordStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + SAMRecord originalRead = originalReadsIterator.next(); + + // The read we get back should be literally the same read in memory as we put in + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + + // If requested, remove a read state every removalInterval states + if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { + recordStateIterator.remove(); + numReadStatesRemoved++; + } + } + + Assert.assertFalse(originalReadsIterator.hasNext()); + + // If we removed any read states, do a second pass through the read states to make sure the right + // states were removed + if ( numReadStatesRemoved > 0 ) { + Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); + + originalReadsIterator = reads.iterator(); + recordStateIterator = perSampleReadStateManager.iterator(); + int readCount = 0; + int readStateCount = 0; + + // Match record states with the reads that should remain after removal + while ( recordStateIterator.hasNext() ) { + LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); + readStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + + SAMRecord originalRead = originalReadsIterator.next(); + readCount++; + + if ( readCount % removalInterval == 0 ) { + originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded + readCount++; + } + + // The read we get back should be literally the same read in memory as we put in (after accounting for removals) + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + } + + Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); + } + + // Allow memory used by this test to be reclaimed + readCountsPerAlignmentStart = null; + reads = null; + recordStatesByAlignmentStart = null; + } + + private void makeReads() { + int alignmentStart = 1; + + for ( int readsThisStack : readCountsPerAlignmentStart ) { + ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); + ArrayList stackRecordStates = new ArrayList(); + + for ( SAMRecord read : stackReads ) { + stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read)); + } + + reads.addAll(stackReads); + recordStatesByAlignmentStart.add(stackRecordStates); + } + } + } + + @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") + public Object[][] createPerSampleReadStateManagerTests() { + for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), + Arrays.asList(2), + Arrays.asList(10), + Arrays.asList(1, 1), + Arrays.asList(2, 2), + Arrays.asList(10, 10), + Arrays.asList(1, 10), + Arrays.asList(10, 1), + Arrays.asList(1, 1, 1), + Arrays.asList(2, 2, 2), + Arrays.asList(10, 10, 10), + Arrays.asList(1, 1, 1, 1, 1, 1), + Arrays.asList(10, 10, 10, 10, 10, 10), + Arrays.asList(1, 2, 10, 1, 2, 10) + ) ) { + + for ( int removalInterval : Arrays.asList(0, 2, 3) ) { + new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); + } + } + + return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); + } + + @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") + public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java index 3b5d8d6b7..f0d7f83dc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java @@ -28,14 +28,12 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -48,7 +46,6 @@ import java.util.List; */ public class VerifyingSamIteratorUnitTest { private SAMFileHeader samFileHeader; - private GenomeLocParser genomeLocParser; @BeforeClass public void init() { @@ -58,8 +55,6 @@ public class VerifyingSamIteratorUnitTest { samFileHeader = new SAMFileHeader(); samFileHeader.setSequenceDictionary(sequenceDictionary); - - genomeLocParser = new GenomeLocParser(sequenceDictionary); } @Test @@ -68,7 +63,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),2,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); @@ -83,7 +78,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(1).getSequenceIndex(),1,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); @@ -98,7 +93,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),1,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); @@ -116,7 +111,7 @@ public class VerifyingSamIteratorUnitTest { read1.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); diff --git a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java index 0f19e2f90..5b052454a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java @@ -17,7 +17,7 @@ import java.util.*; * @author mhanna * @version 0.1 */ -public class ReservoirDownsamplerUnitTest { +public class LegacyReservoirDownsamplerUnitTest { private static final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,200); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java new file mode 100644 index 000000000..74626d031 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java @@ -0,0 +1,161 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +import org.broadinstitute.sting.BaseTest; + +public class ArtificialSingleSampleReadStreamUnitTest extends BaseTest { + + private static class ArtificialSingleSampleReadStreamTest extends TestDataProvider { + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; + + public ArtificialSingleSampleReadStreamTest( ArtificialSingleSampleReadStream stream ) { + super(ArtificialSingleSampleReadStreamTest.class); + + this.stream = stream; + + setName(String.format("%s: numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + getClass().getSimpleName(), + stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); + } + + public void run() { + streamAnalyzer= new ArtificialSingleSampleReadStreamAnalyzer(stream); + + streamAnalyzer.analyze(stream); + + // Check whether the observed properties of the stream match its nominal properties + streamAnalyzer.validate(); + } + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamTestDataProvider") + public Object[][] createArtificialSingleSampleReadStreamTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + + GenomeAnalysisEngine.resetRandomGenerator(); + + // brute force testing! + for ( int numContigs = 0; numContigs <= 2; numContigs++ ) { + for ( int stacksPerContig = 0; stacksPerContig <= 2; stacksPerContig++ ) { + for ( int minReadsPerStack = 1; minReadsPerStack <= 2; minReadsPerStack++ ) { + for ( int maxReadsPerStack = 1; maxReadsPerStack <= 3; maxReadsPerStack++ ) { + for ( int minDistanceBetweenStacks = 1; minDistanceBetweenStacks <= 2; minDistanceBetweenStacks++ ) { + for ( int maxDistanceBetweenStacks = 1; maxDistanceBetweenStacks <= 3; maxDistanceBetweenStacks++ ) { + for ( int minReadLength = 1; minReadLength <= 2; minReadLength++ ) { + for ( int maxReadLength = 1; maxReadLength <= 3; maxReadLength++ ) { + for ( int numUnmappedReads = 0; numUnmappedReads <= 2; numUnmappedReads++ ) { + // Only test sane combinations here + if ( minReadsPerStack <= maxReadsPerStack && + minDistanceBetweenStacks <= maxDistanceBetweenStacks && + minReadLength <= maxReadLength && + ((numContigs > 0 && stacksPerContig > 0) || (numContigs == 0 && stacksPerContig == 0)) ) { + + new ArtificialSingleSampleReadStreamTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads)); + } + } + } + } + } + } + } + } + } + } + + return ArtificialSingleSampleReadStreamTest.getTests(ArtificialSingleSampleReadStreamTest.class); + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamTestDataProvider") + public void testArtificialSingleSampleReadStream( ArtificialSingleSampleReadStreamTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider") + public Object[][] createInvalidArgumentsTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String readGroupID = "testReadGroup"; + header.addReadGroup(new SAMReadGroupRecord(readGroupID)); + + return new Object[][] { + {"testNullHeader", null, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNullReadGroup", header, null, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidReadGroup", header, "foo", 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumContigs", header, readGroupID, -1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumStacksPerContig", header, readGroupID, 1, -1, 1, 2, 1, 2, 1, 2, 0}, + {"test0ContigsNon0StacksPerContig", header, readGroupID, 0, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNon0Contigs0StacksPerContig", header, readGroupID, 1, 0, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMinReadsPerStack", header, readGroupID, 1, 1, -1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMaxReadsPerStack", header, readGroupID, 1, 1, 1, -2, 1, 2, 1, 2, 0}, + {"testInvalidMinDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, -1, 2, 1, 2, 0}, + {"testInvalidMaxDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, 1, -2, 1, 2, 0}, + {"testInvalidMinReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, -1, 2, 0}, + {"testInvalidMaxReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, -2, 0}, + {"testInvalidReadsPerStackRange", header, readGroupID, 1, 1, 2, 1, 1, 2, 1, 2, 0}, + {"testInvalidDistanceBetweenStacksRange", header, readGroupID, 1, 1, 1, 2, 2, 1, 1, 2, 0}, + {"testInvalidReadLengthRange", header, readGroupID, 1, 1, 1, 2, 1, 2, 2, 1, 0}, + {"testInvalidNumUnmappedReads", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, -1}, + }; + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider", + expectedExceptions = ReviewedStingException.class) + public void testInvalidArguments( String testName, + SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + + logger.warn("Running test: " + testName); + + ArtificialSingleSampleReadStream stream = new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + numStacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads); + } +} From 576c7280d9b0ebc9b6f73e89cc394cb7fde23623 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 6 Sep 2012 22:03:18 -0400 Subject: [PATCH 38/45] Extensions to the ErrorThrowing framework for testing purposes --- .../sting/gatk/CommandLineGATK.java | 20 ++++----- .../sting/gatk/walkers/qc/ErrorThrowing.java | 44 +++++++++++++++---- .../sting/utils/exceptions/UserException.java | 6 +++ 3 files changed, 51 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 312d31727..ce57d1a7a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -112,31 +112,31 @@ public class CommandLineGATK extends CommandLineExecutable { } } - protected static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; - protected static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; + public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; + public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; private static void checkForMaskedUserErrors(final Throwable t) { final String message = t.getMessage(); if ( message == null ) return; // we know what to do about the common "Too many open files" error - if ( message.indexOf("Too many open files") != -1 ) + if ( message.contains("Too many open files") ) exitSystemWithUserError(new UserException.TooManyOpenFiles()); // malformed BAM looks like a SAM file - if ( message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_1) != -1 || - message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_2) != -1 ) + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || + message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) exitSystemWithSamError(t); // can't close tribble index when writing - if ( message.indexOf("Unable to close index for") != -1 ) + if ( message.contains("Unable to close index for") ) exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage())); // disk is full - if ( message.indexOf("No space left on device") != -1 ) - exitSystemWithUserError(new UserException(t.getMessage())); - if ( t.getCause() != null && t.getCause().getMessage().indexOf("No space left on device") != -1 ) - exitSystemWithUserError(new UserException(t.getCause().getMessage())); + if ( message.contains("No space left on device") ) + exitSystemWithUserError(new UserException.NoSpaceOnDevice()); + if ( t.getCause() != null && t.getCause().getMessage().contains("No space left on device") ) + exitSystemWithUserError(new UserException.NoSpaceOnDevice()); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java index a3df3bc13..12423595b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -45,20 +46,23 @@ public class ErrorThrowing extends RodWalker implements TreeRed @Input(fullName="exception", shortName = "E", doc="Java class of exception to throw", required=true) public String exceptionToThrow; + @Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false) + public FailMethod failMethod = FailMethod.MAP; + + public enum FailMethod { + MAP, + REDUCE, + TREE_REDUCE + } + // // Template code to allow us to build the walker, doesn't actually do anything // @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( exceptionToThrow.equals("UserException") ) { - throw new UserException("UserException"); - } else if ( exceptionToThrow.equals("NullPointerException") ) { - throw new NullPointerException(); - } else if ( exceptionToThrow.equals("ReviewedStingException") ) { - throw new ReviewedStingException("ReviewedStingException"); - } else { - throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); - } + if ( failMethod == FailMethod.MAP ) + fail(); + return 0; } @Override @@ -68,10 +72,32 @@ public class ErrorThrowing extends RodWalker implements TreeRed @Override public Integer reduce(Integer value, Integer sum) { + if ( failMethod == FailMethod.REDUCE ) + fail(); return value + sum; } public Integer treeReduce(final Integer lhs, final Integer rhs) { + if ( failMethod == FailMethod.TREE_REDUCE ) + fail(); return lhs + rhs; } + + private void fail() { + if ( exceptionToThrow.equals("UserException") ) { + throw new UserException("UserException"); + } else if ( exceptionToThrow.equals("NullPointerException") ) { + throw new NullPointerException(); + } else if ( exceptionToThrow.equals("ReviewedStingException") ) { + throw new ReviewedStingException("ReviewedStingException"); + } else if ( exceptionToThrow.equals("SamError1") ) { + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); + } else if ( exceptionToThrow.equals("SamError2") ) { + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); + } else if ( exceptionToThrow.equals("NoSpace") ) { + throw new net.sf.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + } else { + throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 47a2f2f1d..faafc611a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -141,6 +141,12 @@ public class UserException extends ReviewedStingException { } } + public static class NoSpaceOnDevice extends UserException { + public NoSpaceOnDevice() { + super("There is no space left on the device, so writing failed"); + } + } + public static class CouldNotReadInputFile extends UserException { public CouldNotReadInputFile(String message, Exception e) { super(String.format("Couldn't read file because %s caused by %s", message, getMessage(e))); From 9d12935986c4ded5e60274c5d13a2383678ef0e4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 6 Sep 2012 14:33:31 -0400 Subject: [PATCH 39/45] Intermediate commit for new hyper parallel NanoScheduler -- There's a logic bug now but I'll go to squash it... --- .../utils/nanoScheduler/NanoScheduler.java | 263 ++++++++++++------ .../utils/threading/NamedThreadFactory.java | 26 ++ .../nanoScheduler/NanoSchedulerUnitTest.java | 6 + 3 files changed, 207 insertions(+), 88 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 24db0f7dc..fe8731d3b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -5,13 +5,11 @@ import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.AutoFormattingTime; import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.threading.NamedThreadFactory; import java.util.Iterator; -import java.util.LinkedList; import java.util.List; -import java.util.Queue; import java.util.concurrent.*; /** @@ -52,7 +50,9 @@ public class NanoScheduler { final int bufferSize; final int nThreads; + final ExecutorService inputExecutor; + final ExecutorService reduceExecutor; final ExecutorService mapExecutor; boolean shutdown = false; boolean debug = false; @@ -77,8 +77,14 @@ public class NanoScheduler { this.bufferSize = bufferSize; this.nThreads = nThreads; - this.mapExecutor = nThreads == 1 ? null : Executors.newFixedThreadPool(nThreads-1); - this.inputExecutor = Executors.newSingleThreadExecutor(); + + if ( nThreads == 1 ) { + this.mapExecutor = this.inputExecutor = this.reduceExecutor = null; + } else { + this.mapExecutor = Executors.newFixedThreadPool(nThreads-1, new NamedThreadFactory("NS-map-thread-%d")); + this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); + this.reduceExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-reduce-thread-%d")); + } // start timing the time spent outside of the nanoScheduler outsideSchedulerTimer.start(); @@ -110,11 +116,9 @@ public class NanoScheduler { public void shutdown() { outsideSchedulerTimer.stop(); - if ( mapExecutor != null ) { - final List remaining = mapExecutor.shutdownNow(); - if ( ! remaining.isEmpty() ) - throw new IllegalStateException("Remaining tasks found in the mapExecutor, unexpected behavior!"); - } + shutdownExecutor("inputExecutor", inputExecutor); + shutdownExecutor("mapExecutor", mapExecutor); + shutdownExecutor("reduceExecutor", reduceExecutor); shutdown = true; if (TIME_CALLS) { @@ -125,6 +129,31 @@ public class NanoScheduler { } } + /** + * Helper function to cleanly shutdown an execution service, checking that the execution + * state is clean when it's done. + * + * @param name a string name for error messages for the executorService we are shutting down + * @param executorService the executorService to shut down + */ + private void shutdownExecutor(final String name, final ExecutorService executorService) { + if ( executorService != null ) { + if ( executorService.isShutdown() || executorService.isTerminated() ) + throw new IllegalStateException("Executor service " + name + " is already shut down!"); + + final List remaining = executorService.shutdownNow(); + if ( ! remaining.isEmpty() ) + throw new IllegalStateException(remaining.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!"); + } + } + + /** + * Print to logger.info timing information from timer, with name label + * + * @param label the name of the timer to display. Should be human readable + * @param timer the timer whose elapsed time we will display + */ + @Requires({"label != null", "timer != null"}) private void printTimerInfo(final String label, final SimpleTimer timer) { final double total = inputTimer.getElapsedTime() + mapTimer.getElapsedTime() + reduceTimer.getElapsedTime() + outsideSchedulerTimer.getElapsedTime(); @@ -140,16 +169,30 @@ public class NanoScheduler { return shutdown; } + /** + * @return are we displaying verbose debugging information about the scheduling? + */ public boolean isDebug() { return debug; } + /** + * Helper function to display a String.formatted message if we are doing verbose debugging + * + * @param format the format argument suitable for String.format + * @param args the arguments for String.format + */ + @Requires("format != null") private void debugPrint(final String format, Object ... args) { if ( isDebug() ) logger.info("Thread " + Thread.currentThread().getId() + ":" + String.format(format, args)); } - + /** + * Turn on/off verbose debugging + * + * @param debug true if we want verbose debugging + */ public void setDebug(boolean debug) { this.debug = debug; } @@ -179,6 +222,9 @@ public class NanoScheduler { * It is safe to call this function repeatedly on a single nanoScheduler, at least until the * shutdown method is called. * + * Note that this function goes through a single threaded fast path if the number of threads + * is 1. + * * @param inputReader an iterator providing us with the input data to nanoSchedule map/reduce over * @param map the map function from input type -> map type, will be applied in parallel to each input * @param reduce the reduce function from map type + reduce type -> reduce type to be applied in order to map results @@ -207,9 +253,11 @@ public class NanoScheduler { } /** - * Simple efficient reference implementation for single threaded execution + * Simple efficient reference implementation for single threaded execution. + * * @return the reduce result of this map/reduce job */ + @Requires({"inputReader != null", "map != null", "reduce != null"}) private ReduceType executeSingleThreaded(final Iterator inputReader, final NanoSchedulerMapFunction map, final ReduceType initialValue, @@ -249,88 +297,111 @@ public class NanoScheduler { * * @return the reduce result of this map/reduce job */ + @Requires({"inputReader != null", "map != null", "reduce != null"}) private ReduceType executeMultiThreaded(final Iterator inputReader, final NanoSchedulerMapFunction map, final ReduceType initialValue, final NanoSchedulerReduceFunction reduce) { debugPrint("Executing nanoScheduler"); - ReduceType sum = initialValue; - boolean done = false; + // a completion service that tracks when jobs complete, so we can wait in this thread + // until all of the map jobs are completed, without having to shut down the executor itself + final ExecutorCompletionService mapJobCompletionService = + new ExecutorCompletionService(mapExecutor); + + // a blocking queue that limits the number of input datum to the requested buffer size final BlockingQueue inputQueue = new LinkedBlockingDeque(bufferSize); + // a priority queue that stores up to bufferSize * MAP_QUEUE_SCALE_FACTOR elements + // produced by completed map jobs. + final PriorityBlockingQueue mapResultQueue = new PriorityBlockingQueue(bufferSize*100); + + // TODO -- the logic of this blocking queue is wrong! We need to wait for map jobs in order, not just + // -- in the order in which they are produced + + // TODO -- map executor must have fixed size map jobs queue + inputExecutor.submit(new InputProducer(inputReader, inputQueue)); + final Future reduceResult = reduceExecutor.submit(new ReducerThread(reduce, initialValue, mapResultQueue)); - while ( ! done ) { - try { - final Pair, Boolean> readResults = readInputs(inputQueue); - final List inputs = readResults.getFirst(); - done = readResults.getSecond(); + try { + int numJobs = 0; + while ( true ) { + // block on input + final InputDatum inputEnqueueWrapped = inputQueue.take(); - if ( ! inputs.isEmpty() ) { - // send jobs for map - final Queue> mapQueue = submitMapJobs(map, mapExecutor, inputs); + if ( ! inputEnqueueWrapped.isLast() ) { + // get the object itself + final InputType input = inputEnqueueWrapped.datum; + + // the next map call has id + 1 + numJobs++; + + // send job for map via the completion service + final CallableMap doMap = new CallableMap(map, numJobs, input, mapResultQueue); + mapJobCompletionService.submit(doMap, numJobs); - // send off the reduce job, and block until we get at least one reduce result - sum = reduceSerial(reduce, mapQueue, sum); debugPrint(" Done with cycle of map/reduce"); - if ( progressFunction != null ) progressFunction.progress(inputs.get(inputs.size()-1)); + if ( progressFunction != null ) // TODO -- don't cycle so often + progressFunction.progress(input); } else { - // we must be done - if ( ! done ) throw new IllegalStateException("Inputs empty but not done"); + waitForLastJob(mapJobCompletionService, numJobs); + mapResultQueue.add(new MapResult()); + return reduceResult.get(); // wait for our result of reduce } - } catch (InterruptedException ex) { - throw new ReviewedStingException("got execution exception", ex); - } catch (ExecutionException ex) { - throw new ReviewedStingException("got execution exception", ex); } + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } catch (ExecutionException ex) { + throw new ReviewedStingException("got execution exception", ex); } - - return sum; - } - - @Requires({"reduce != null", "! mapQueue.isEmpty()"}) - private ReduceType reduceSerial(final NanoSchedulerReduceFunction reduce, - final Queue> mapQueue, - final ReduceType initSum) - throws InterruptedException, ExecutionException { - ReduceType sum = initSum; - - // while mapQueue has something in it to reduce - for ( final Future future : mapQueue ) { - final MapType value = future.get(); // block until we get the values for this task - - if ( TIME_CALLS ) reduceTimer.restart(); - sum = reduce.apply(value, sum); - if ( TIME_CALLS ) reduceTimer.stop(); - } - - return sum; } /** - * Read up to inputBufferSize elements from inputReader - * - * @return a queue of input read in, containing one or more values of InputType read in + * Helper routine that will wait until the last map job finishes running + * by taking numJob values from the executor completion service, using + * the blocking take() call. */ - @Requires("inputReader != null") - @Ensures("result != null") - private Pair, Boolean> readInputs(final BlockingQueue inputReader) throws InterruptedException { - int n = 0; - final List inputs = new LinkedList(); - boolean done = false; + private void waitForLastJob(final ExecutorCompletionService mapJobCompletionService, + final int numJobs ) throws InterruptedException { + for ( int i = 0; i < numJobs; i++ ) + mapJobCompletionService.take(); + } - while ( ! done && n < getBufferSize() ) { - final InputDatum input = inputReader.take(); - done = input.isLast(); - if ( ! done ) { - inputs.add(input.datum); - n++; - } + private class ReducerThread implements Callable { + final NanoSchedulerReduceFunction reduce; + ReduceType sum; + final PriorityBlockingQueue mapResultQueue; + + public ReducerThread(final NanoSchedulerReduceFunction reduce, + final ReduceType sum, + final PriorityBlockingQueue mapResultQueue) { + this.reduce = reduce; + this.sum = sum; + this.mapResultQueue = mapResultQueue; } - return new Pair, Boolean>(inputs, done); + public ReduceType call() { + try { + while ( true ) { + final MapResult result = mapResultQueue.take(); + //System.out.println("Reduce of map result " + result.id + " with sum " + sum); + if ( result.isLast() ) { + //System.out.println("Saw last! " + result.id); + return sum; + } + else { + if ( TIME_CALLS ) reduceTimer.restart(); + sum = reduce.apply(result.datum, sum); + if ( TIME_CALLS ) reduceTimer.stop(); + } + } + } catch (InterruptedException ex) { + //System.out.println("Interrupted"); + throw new ReviewedStingException("got execution exception", ex); + } + } } private class InputProducer implements Runnable { @@ -359,16 +430,16 @@ public class NanoScheduler { } } - private class InputDatum { + private class BlockingDatum { final boolean isLast; - final InputType datum; + final T datum; - private InputDatum(final InputType datum) { + private BlockingDatum(final T datum) { isLast = false; this.datum = datum; } - private InputDatum() { + private BlockingDatum() { isLast = true; this.datum = null; } @@ -378,40 +449,56 @@ public class NanoScheduler { } } - @Requires({"map != null", "! inputs.isEmpty()"}) - private Queue> submitMapJobs(final NanoSchedulerMapFunction map, - final ExecutorService executor, - final List inputs) { - final Queue> mapQueue = new LinkedList>(); - for ( final InputType input : inputs ) { - final CallableMap doMap = new CallableMap(map, input); - final Future future = executor.submit(doMap); - mapQueue.add(future); + private class InputDatum extends BlockingDatum { + private InputDatum(InputType datum) { super(datum); } + private InputDatum() { } + } + + private class MapResult extends BlockingDatum implements Comparable { + final Integer id; + + private MapResult(MapType datum, Integer id) { + super(datum); + this.id = id; } - return mapQueue; + private MapResult() { + this.id = Integer.MAX_VALUE; + } + + @Override + public int compareTo(MapResult o) { + return id.compareTo(o.id); + } } /** * A simple callable version of the map function for use with the executor pool */ - private class CallableMap implements Callable { + private class CallableMap implements Runnable { + final int id; final InputType input; final NanoSchedulerMapFunction map; + final PriorityBlockingQueue mapResultQueue; @Requires({"map != null"}) - private CallableMap(final NanoSchedulerMapFunction map, final InputType inputs) { - this.input = inputs; + private CallableMap(final NanoSchedulerMapFunction map, + final int id, + final InputType input, + final PriorityBlockingQueue mapResultQueue) { + this.id = id; + this.input = input; this.map = map; + this.mapResultQueue = mapResultQueue; } - @Override public MapType call() throws Exception { + @Override public void run() { if ( TIME_CALLS ) mapTimer.restart(); if ( debug ) debugPrint("\t\tmap " + input); final MapType result = map.apply(input); if ( TIME_CALLS ) mapTimer.stop(); - return result; + mapResultQueue.add(new MapResult(result, id)); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java new file mode 100644 index 000000000..b25375b87 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java @@ -0,0 +1,26 @@ +package org.broadinstitute.sting.utils.threading; + +import java.util.concurrent.ThreadFactory; + +/** + * Thread factor that produces threads with a given name pattern + * + * User: depristo + * Date: 9/5/12 + * Time: 9:22 PM + * + */ +public class NamedThreadFactory implements ThreadFactory { + static int id = 0; + final String format; + + public NamedThreadFactory(String format) { + this.format = format; + String.format(format, id); // test the name + } + + @Override + public Thread newThread(Runnable r) { + return new Thread(r, String.format(format, id++)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index ddfc3cecd..21ac6dcec 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; +import org.apache.log4j.BasicConfigurator; import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -165,6 +166,10 @@ public class NanoSchedulerUnitTest extends BaseTest { } public static void main(String [ ] args) { + org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); + BasicConfigurator.configure(); + logger.setLevel(org.apache.log4j.Level.DEBUG); + final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1])); final NanoScheduler nanoScheduler = new NanoScheduler(test.bufferSize, test.nThreads); @@ -172,5 +177,6 @@ public class NanoSchedulerUnitTest extends BaseTest { final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); System.out.printf("Sum = %d, expected =%d%n", sum, test.expectedResult); + nanoScheduler.shutdown(); } } From c5038849581c97b8dfc0bfd35723553ec1ad20c9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 7 Sep 2012 08:57:35 -0400 Subject: [PATCH 40/45] GSA-515 Nanoscheduler GSA-551 / Optimize nanoScheduling performance of UnifiedGenotyper -- I've rewritten the entire NS framework to use a producer / consumer model for input -> map and from map -> reduce. This is allowing us to scale reasonably efficiently up to 4 threads (see figure). Future work on the nano scheduler will be itemized in a separate JIRA entry. -- Restructured the NS code for clarity. Docs everywhere. -- This is considered version 1.0 --- .../gatk/traversals/TraverseLociNano.java | 14 +- .../gatk/traversals/TraverseReadsNano.java | 10 +- .../nanoScheduler/BlockingQueueValue.java | 82 ++++++ .../utils/nanoScheduler/FutureValue.java | 45 +++ .../utils/nanoScheduler/InputProducer.java | 62 +++++ .../sting/utils/nanoScheduler/MapResult.java | 36 +++ ...lerMapFunction.java => NSMapFunction.java} | 2 +- ...sFunction.java => NSProgressFunction.java} | 2 +- ...uceFunction.java => NSReduceFunction.java} | 2 +- .../utils/nanoScheduler/NanoScheduler.java | 262 +++++------------- .../utils/nanoScheduler/ReducerThread.java | 64 +++++ .../nanoScheduler/NanoSchedulerUnitTest.java | 8 +- 12 files changed, 383 insertions(+), 206 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java rename public/java/src/org/broadinstitute/sting/utils/nanoScheduler/{NanoSchedulerMapFunction.java => NSMapFunction.java} (84%) rename public/java/src/org/broadinstitute/sting/utils/nanoScheduler/{NanoSchedulerProgressFunction.java => NSProgressFunction.java} (81%) rename public/java/src/org/broadinstitute/sting/utils/nanoScheduler/{NanoSchedulerReduceFunction.java => NSReduceFunction.java} (87%) create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java index 73b73c002..e4e2254d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -8,10 +8,10 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReferenceOrderedView; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerMapFunction; -import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerProgressFunction; -import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerReduceFunction; import java.util.Iterator; @@ -153,7 +153,7 @@ public class TraverseLociNano extends TraverseLociBase { * * Applies walker.map to MapData, returning a MapResult object containing the result */ - private class TraverseLociMap implements NanoSchedulerMapFunction { + private class TraverseLociMap implements NSMapFunction { final LocusWalker walker; private TraverseLociMap(LocusWalker walker) { @@ -174,11 +174,11 @@ public class TraverseLociNano extends TraverseLociBase { } /** - * NanoSchedulerReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements * * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable */ - private class TraverseLociReduce implements NanoSchedulerReduceFunction { + private class TraverseLociReduce implements NSReduceFunction { final LocusWalker walker; private TraverseLociReduce(LocusWalker walker) { @@ -195,7 +195,7 @@ public class TraverseLociNano extends TraverseLociBase { } } - private class TraverseLociProgress implements NanoSchedulerProgressFunction { + private class TraverseLociProgress implements NSProgressFunction { @Override public void progress(MapData lastProcessedMap) { if (lastProcessedMap.alignmentContext != null) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index 5679747e1..b3a0a1390 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -35,9 +35,9 @@ import org.broadinstitute.sting.gatk.datasources.reads.ReadShard; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; -import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerMapFunction; -import org.broadinstitute.sting.utils.nanoScheduler.NanoSchedulerReduceFunction; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.LinkedList; @@ -191,7 +191,7 @@ public class TraverseReadsNano extends TraversalEngine, * * Applies walker.map to MapData, returning a MapResult object containing the result */ - private class TraverseReadsMap implements NanoSchedulerMapFunction { + private class TraverseReadsMap implements NSMapFunction { final ReadWalker walker; private TraverseReadsMap(ReadWalker walker) { @@ -211,11 +211,11 @@ public class TraverseReadsNano extends TraversalEngine, } /** - * NanoSchedulerReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements * * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable */ - private class TraverseReadsReduce implements NanoSchedulerReduceFunction { + private class TraverseReadsReduce implements NSReduceFunction { final ReadWalker walker; private TraverseReadsReduce(ReadWalker walker) { diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java new file mode 100644 index 000000000..2daa6c9eb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/BlockingQueueValue.java @@ -0,0 +1,82 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import com.google.java.contract.Invariant; + +/** + * Wrapper to hold data for a blocking queue, distinguishing an EOF marker from a real object + * + * The only way to tell in a consumer thread that a blocking queue has no more data ever + * coming down the pipe is to pass in a "poison" or EOF object. This class provides + * a generic capacity for that... + * + * The use case looks like this: + * + * BlockingQueue q + * producer: + * while ( x has items ) + * q.put(new BlockingQueueValue(x)) + * q.put(new BlockingQueueValue()) + * + * Consumer: + * while ( true ) + * value = q.take() + * if ( value.isLast() ) + * break + * else + * do something useful with value + * + * + * User: depristo + * Date: 9/6/12 + * Time: 3:08 PM + */ +@Invariant("! isLast || value == null") +class BlockingQueueValue { + /** + * True if this is the EOF marker object + */ + final private boolean isLast; + + /** + * Our value, if we aren't the EOF marker + */ + final private T value; + + /** + * Create a new BlockingQueueValue containing a real value, where last is false + * @param value + */ + BlockingQueueValue(final T value) { + isLast = false; + this.value = value; + } + + /** + * Create a new BlockingQueueValue that is the last item + */ + BlockingQueueValue() { + isLast = true; + this.value = null; + } + + /** + * Is this the EOF marker? + * + * @return true if so, else false + */ + public boolean isLast() { + return isLast; + } + + /** + * Get the value held by this BlockingQueueValue + * + * @return the value + * @throws IllegalStateException if this is the last item + */ + public T getValue() { + if ( isLast() ) + throw new IllegalStateException("Cannot get value for last object"); + return value; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java new file mode 100644 index 000000000..9508a15aa --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/FutureValue.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +/** + * Create a future that simply returns a given value + * + * The only standard way to create a future in java is via the ExecutorService interface. + * If you have a data structure holding futures of value T, and you want to add a + * value to it for some reason (to add a EOF marker, for instance) you can use this + * class to create a dummy Future that simply returns a value. + * + * @author depristo + * @since 09/12 + */ +class FutureValue implements Future { + final V value; + + FutureValue(final V value) { + this.value = value; + } + + @Override public boolean cancel(boolean mayInterruptIfRunning) { + return true; + } + + @Override public boolean isCancelled() { + return false; + } + + @Override public boolean isDone() { + return true; + } + + @Override public V get() throws InterruptedException, ExecutionException { + return value; + } + + @Override public V get(long timeout, TimeUnit unit) throws InterruptedException, ExecutionException, TimeoutException { + return get(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java new file mode 100644 index 000000000..29dddbc49 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.Iterator; +import java.util.concurrent.BlockingQueue; + +/** + * Producer Thread that reads input values from an inputReads and puts them into a BlockingQueue + */ +class InputProducer implements Runnable { + /** + * The iterator we are using to get data from + */ + final Iterator inputReader; + + /** + * Our timer (may be null) that we use to track our input costs + */ + final SimpleTimer inputTimer; + + /** + * Where we put our input values for consumption + */ + final BlockingQueue outputQueue; + + public InputProducer(final Iterator inputReader, + final SimpleTimer inputTimer, + final BlockingQueue outputQueue) { + if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); + if ( outputQueue == null ) throw new IllegalArgumentException("OutputQueue cannot be null"); + + this.inputReader = inputReader; + this.inputTimer = inputTimer; + this.outputQueue = outputQueue; + } + + public void run() { + try { + while ( inputReader.hasNext() ) { + if ( inputTimer != null ) inputTimer.restart(); + final InputType input = inputReader.next(); + if ( inputTimer != null ) inputTimer.stop(); + outputQueue.put(new InputValue(input)); + } + + // add the EOF object so our consumer knows we are done in all inputs + outputQueue.put(new InputValue()); + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } + + /** + * Helper class that contains a read value suitable for EOF marking in a BlockingQueue + */ + class InputValue extends BlockingQueueValue { + private InputValue(InputType datum) { super(datum); } + private InputValue() { } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java new file mode 100644 index 000000000..3cc6fa786 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java @@ -0,0 +1,36 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Holds the results of a map job suitable for producer/consumer threading + * via a BlockingQueue + */ +class MapResult extends BlockingQueueValue { + final int jobID; + + /** + * Create a new MapResult with value datum and jod jobID ID + * + * @param datum the value produced by the map job + * @param jobID the id of the map job (for correctness testing) + */ + MapResult(final MapType datum, final int jobID) { + super(datum); + this.jobID = jobID; + if ( jobID < 0 ) throw new IllegalArgumentException("JobID must be >= 0"); + } + + /** + * Create the EOF marker version of MapResult + */ + MapResult() { + super(); + this.jobID = Integer.MAX_VALUE; + } + + /** + * @return the job ID of the map job that produced this MapResult + */ + public int getJobID() { + return jobID; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java similarity index 84% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java index ddf4421d2..cc5335051 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerMapFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java @@ -9,7 +9,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Date: 8/24/12 * Time: 9:49 AM */ -public interface NanoSchedulerMapFunction { +public interface NSMapFunction { /** * Return function on input, returning a value of ResultType * @param input diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java similarity index 81% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java index 8631196a3..8b12c62c4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerProgressFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java @@ -7,6 +7,6 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Time: 2:10 PM * To change this template use File | Settings | File Templates. */ -public interface NanoSchedulerProgressFunction { +public interface NSProgressFunction { public void progress(final InputType lastMapInput); } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java similarity index 87% rename from public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java rename to public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java index 7e58eeaf9..879a33a1d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerReduceFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java @@ -7,7 +7,7 @@ package org.broadinstitute.sting.utils.nanoScheduler; * Date: 8/24/12 * Time: 9:49 AM */ -public interface NanoSchedulerReduceFunction { +public interface NSReduceFunction { /** * Combine one with sum into a new ReduceType * @param one the result of a map call on an input element diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index fe8731d3b..664fb7b9b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -17,12 +17,12 @@ import java.util.concurrent.*; * * The overall framework works like this * - * nano <- new Nanoschedule(bufferSize, numberOfMapElementsToProcessTogether, nThreads) + * nano <- new Nanoschedule(inputBufferSize, numberOfMapElementsToProcessTogether, nThreads) * List[Input] outerData : outerDataLoop ) * result = nano.execute(outerData.iterator(), map, reduce) * - * bufferSize determines how many elements from the input stream are read in one go by the - * nanoscheduler. The scheduler may hold up to bufferSize in memory at one time, as well + * inputBufferSize determines how many elements from the input stream are read in one go by the + * nanoscheduler. The scheduler may hold up to inputBufferSize in memory at one time, as well * as up to inputBufferSize map results as well. * * numberOfMapElementsToProcessTogether determines how many input elements are processed @@ -48,40 +48,45 @@ public class NanoScheduler { private final static boolean LOG_MAP_TIMES = false; private final static boolean TIME_CALLS = true; - final int bufferSize; - final int nThreads; + private final static int MAP_BUFFER_SIZE_SCALE_FACTOR = 100; + final int inputBufferSize; + final int mapBufferSize; + final int nThreads; final ExecutorService inputExecutor; final ExecutorService reduceExecutor; - final ExecutorService mapExecutor; + final ThreadPoolExecutor mapExecutor; + boolean shutdown = false; boolean debug = false; + private NSProgressFunction progressFunction = null; - private NanoSchedulerProgressFunction progressFunction = null; - - final SimpleTimer outsideSchedulerTimer = new SimpleTimer("outside"); - final SimpleTimer inputTimer = new SimpleTimer("input"); - final SimpleTimer mapTimer = new SimpleTimer("map"); - final SimpleTimer reduceTimer = new SimpleTimer("reduce"); + final SimpleTimer outsideSchedulerTimer = TIME_CALLS ? new SimpleTimer("outside") : null; + final SimpleTimer inputTimer = TIME_CALLS ? new SimpleTimer("input") : null; + final SimpleTimer mapTimer = TIME_CALLS ? new SimpleTimer("map") : null; + final SimpleTimer reduceTimer = TIME_CALLS ? new SimpleTimer("reduce") : null; /** - * Create a new nanoschedule with the desire characteristics requested by the argument + * Create a new nanoscheduler with the desire characteristics requested by the argument * - * @param bufferSize the number of input elements to read in each scheduling cycle. - * @param nThreads the number of threads to use to get work done, in addition to the thread calling execute + * @param inputBufferSize the number of input elements to read in each scheduling cycle. + * @param nThreads the number of threads to use to get work done, in addition to the + * thread calling execute */ - public NanoScheduler(final int bufferSize, - final int nThreads) { - if ( bufferSize < 1 ) throw new IllegalArgumentException("bufferSize must be >= 1, got " + bufferSize); + public NanoScheduler(final int inputBufferSize, final int nThreads) { + if ( inputBufferSize < 1 ) throw new IllegalArgumentException("inputBufferSize must be >= 1, got " + inputBufferSize); if ( nThreads < 1 ) throw new IllegalArgumentException("nThreads must be >= 1, got " + nThreads); - this.bufferSize = bufferSize; + this.inputBufferSize = inputBufferSize; + this.mapBufferSize = inputBufferSize * MAP_BUFFER_SIZE_SCALE_FACTOR; this.nThreads = nThreads; if ( nThreads == 1 ) { - this.mapExecutor = this.inputExecutor = this.reduceExecutor = null; + this.mapExecutor = null; + this.inputExecutor = this.reduceExecutor = null; } else { - this.mapExecutor = Executors.newFixedThreadPool(nThreads-1, new NamedThreadFactory("NS-map-thread-%d")); + this.mapExecutor = (ThreadPoolExecutor)Executors.newFixedThreadPool(nThreads-1, new NamedThreadFactory("NS-map-thread-%d")); + this.mapExecutor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy()); this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); this.reduceExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-reduce-thread-%d")); } @@ -104,8 +109,8 @@ public class NanoScheduler { * @return */ @Ensures("result > 0") - public int getBufferSize() { - return bufferSize; + public int getInputBufferSize() { + return inputBufferSize; } /** @@ -116,9 +121,11 @@ public class NanoScheduler { public void shutdown() { outsideSchedulerTimer.stop(); - shutdownExecutor("inputExecutor", inputExecutor); - shutdownExecutor("mapExecutor", mapExecutor); - shutdownExecutor("reduceExecutor", reduceExecutor); + if ( nThreads > 1 ) { + shutdownExecutor("inputExecutor", inputExecutor); + shutdownExecutor("mapExecutor", mapExecutor); + shutdownExecutor("reduceExecutor", reduceExecutor); + } shutdown = true; if (TIME_CALLS) { @@ -136,15 +143,15 @@ public class NanoScheduler { * @param name a string name for error messages for the executorService we are shutting down * @param executorService the executorService to shut down */ + @Requires({"name != null", "executorService != null"}) + @Ensures("executorService.isShutdown()") private void shutdownExecutor(final String name, final ExecutorService executorService) { - if ( executorService != null ) { - if ( executorService.isShutdown() || executorService.isTerminated() ) - throw new IllegalStateException("Executor service " + name + " is already shut down!"); + if ( executorService.isShutdown() || executorService.isTerminated() ) + throw new IllegalStateException("Executor service " + name + " is already shut down!"); - final List remaining = executorService.shutdownNow(); - if ( ! remaining.isEmpty() ) - throw new IllegalStateException(remaining.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!"); - } + final List remaining = executorService.shutdownNow(); + if ( ! remaining.isEmpty() ) + throw new IllegalStateException(remaining.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!"); } /** @@ -204,7 +211,7 @@ public class NanoScheduler { * * @param progressFunction a progress function to call, or null if you don't want any progress callback */ - public void setProgressFunction(final NanoSchedulerProgressFunction progressFunction) { + public void setProgressFunction(final NSProgressFunction progressFunction) { this.progressFunction = progressFunction; } @@ -231,9 +238,9 @@ public class NanoScheduler { * @return the last reduce value */ public ReduceType execute(final Iterator inputReader, - final NanoSchedulerMapFunction map, + final NSMapFunction map, final ReduceType initialValue, - final NanoSchedulerReduceFunction reduce) { + final NSReduceFunction reduce) { if ( isShutdown() ) throw new IllegalStateException("execute called on already shutdown NanoScheduler"); if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); @@ -259,9 +266,9 @@ public class NanoScheduler { */ @Requires({"inputReader != null", "map != null", "reduce != null"}) private ReduceType executeSingleThreaded(final Iterator inputReader, - final NanoSchedulerMapFunction map, + final NSMapFunction map, final ReduceType initialValue, - final NanoSchedulerReduceFunction reduce) { + final NSReduceFunction reduce) { ReduceType sum = initialValue; int i = 0; @@ -278,7 +285,7 @@ public class NanoScheduler { if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (mapTimer.currentTimeNano() - preMapTime)); if ( TIME_CALLS ) mapTimer.stop(); - if ( i++ % bufferSize == 0 && progressFunction != null ) + if ( i++ % inputBufferSize == 0 && progressFunction != null ) progressFunction.progress(input); // reduce @@ -299,55 +306,53 @@ public class NanoScheduler { */ @Requires({"inputReader != null", "map != null", "reduce != null"}) private ReduceType executeMultiThreaded(final Iterator inputReader, - final NanoSchedulerMapFunction map, + final NSMapFunction map, final ReduceType initialValue, - final NanoSchedulerReduceFunction reduce) { + final NSReduceFunction reduce) { debugPrint("Executing nanoScheduler"); - // a completion service that tracks when jobs complete, so we can wait in this thread - // until all of the map jobs are completed, without having to shut down the executor itself - final ExecutorCompletionService mapJobCompletionService = - new ExecutorCompletionService(mapExecutor); - // a blocking queue that limits the number of input datum to the requested buffer size - final BlockingQueue inputQueue = new LinkedBlockingDeque(bufferSize); + final BlockingQueue.InputValue> inputQueue + = new LinkedBlockingDeque.InputValue>(inputBufferSize); - // a priority queue that stores up to bufferSize * MAP_QUEUE_SCALE_FACTOR elements + // a priority queue that stores up to mapBufferSize elements // produced by completed map jobs. - final PriorityBlockingQueue mapResultQueue = new PriorityBlockingQueue(bufferSize*100); + final BlockingQueue>> mapResultQueue = + new LinkedBlockingDeque>>(mapBufferSize); - // TODO -- the logic of this blocking queue is wrong! We need to wait for map jobs in order, not just - // -- in the order in which they are produced + // Start running the input reader thread + inputExecutor.submit(new InputProducer(inputReader, inputTimer, inputQueue)); - // TODO -- map executor must have fixed size map jobs queue - - inputExecutor.submit(new InputProducer(inputReader, inputQueue)); - final Future reduceResult = reduceExecutor.submit(new ReducerThread(reduce, initialValue, mapResultQueue)); + // Start running the reducer thread + final ReducerThread reducer + = new ReducerThread(reduce, reduceTimer, initialValue, mapResultQueue); + final Future reduceResult = reduceExecutor.submit(reducer); try { int numJobs = 0; + while ( true ) { // block on input - final InputDatum inputEnqueueWrapped = inputQueue.take(); + final InputProducer.InputValue inputEnqueueWrapped = inputQueue.take(); if ( ! inputEnqueueWrapped.isLast() ) { // get the object itself - final InputType input = inputEnqueueWrapped.datum; + final InputType input = inputEnqueueWrapped.getValue(); - // the next map call has id + 1 + // the next map call has jobID + 1 numJobs++; // send job for map via the completion service - final CallableMap doMap = new CallableMap(map, numJobs, input, mapResultQueue); - mapJobCompletionService.submit(doMap, numJobs); + final CallableMap doMap = new CallableMap(map, numJobs, input); + final Future> mapJob = mapExecutor.submit(doMap); + mapResultQueue.put(mapJob); debugPrint(" Done with cycle of map/reduce"); - if ( progressFunction != null ) // TODO -- don't cycle so often + if ( numJobs % inputBufferSize == 0 && progressFunction != null ) progressFunction.progress(input); } else { - waitForLastJob(mapJobCompletionService, numJobs); - mapResultQueue.add(new MapResult()); + mapResultQueue.put(new FutureValue>(new MapResult())); return reduceResult.get(); // wait for our result of reduce } } @@ -358,147 +363,30 @@ public class NanoScheduler { } } - /** - * Helper routine that will wait until the last map job finishes running - * by taking numJob values from the executor completion service, using - * the blocking take() call. - */ - private void waitForLastJob(final ExecutorCompletionService mapJobCompletionService, - final int numJobs ) throws InterruptedException { - for ( int i = 0; i < numJobs; i++ ) - mapJobCompletionService.take(); - } - - private class ReducerThread implements Callable { - final NanoSchedulerReduceFunction reduce; - ReduceType sum; - final PriorityBlockingQueue mapResultQueue; - - public ReducerThread(final NanoSchedulerReduceFunction reduce, - final ReduceType sum, - final PriorityBlockingQueue mapResultQueue) { - this.reduce = reduce; - this.sum = sum; - this.mapResultQueue = mapResultQueue; - } - - public ReduceType call() { - try { - while ( true ) { - final MapResult result = mapResultQueue.take(); - //System.out.println("Reduce of map result " + result.id + " with sum " + sum); - if ( result.isLast() ) { - //System.out.println("Saw last! " + result.id); - return sum; - } - else { - if ( TIME_CALLS ) reduceTimer.restart(); - sum = reduce.apply(result.datum, sum); - if ( TIME_CALLS ) reduceTimer.stop(); - } - } - } catch (InterruptedException ex) { - //System.out.println("Interrupted"); - throw new ReviewedStingException("got execution exception", ex); - } - } - } - - private class InputProducer implements Runnable { - final Iterator inputReader; - final BlockingQueue outputQueue; - - public InputProducer(final Iterator inputReader, final BlockingQueue outputQueue) { - this.inputReader = inputReader; - this.outputQueue = outputQueue; - } - - public void run() { - try { - while ( inputReader.hasNext() ) { - if ( TIME_CALLS ) inputTimer.restart(); - final InputType input = inputReader.next(); - if ( TIME_CALLS ) inputTimer.stop(); - outputQueue.put(new InputDatum(input)); - } - - // add the EOF object so we know we are done - outputQueue.put(new InputDatum()); - } catch (InterruptedException ex) { - throw new ReviewedStingException("got execution exception", ex); - } - } - } - - private class BlockingDatum { - final boolean isLast; - final T datum; - - private BlockingDatum(final T datum) { - isLast = false; - this.datum = datum; - } - - private BlockingDatum() { - isLast = true; - this.datum = null; - } - - public boolean isLast() { - return isLast; - } - } - - - private class InputDatum extends BlockingDatum { - private InputDatum(InputType datum) { super(datum); } - private InputDatum() { } - } - - private class MapResult extends BlockingDatum implements Comparable { - final Integer id; - - private MapResult(MapType datum, Integer id) { - super(datum); - this.id = id; - } - - private MapResult() { - this.id = Integer.MAX_VALUE; - } - - @Override - public int compareTo(MapResult o) { - return id.compareTo(o.id); - } - } - /** * A simple callable version of the map function for use with the executor pool */ - private class CallableMap implements Runnable { + private class CallableMap implements Callable> { final int id; final InputType input; - final NanoSchedulerMapFunction map; - final PriorityBlockingQueue mapResultQueue; + final NSMapFunction map; @Requires({"map != null"}) - private CallableMap(final NanoSchedulerMapFunction map, + private CallableMap(final NSMapFunction map, final int id, - final InputType input, - final PriorityBlockingQueue mapResultQueue) { + final InputType input) { this.id = id; this.input = input; this.map = map; - this.mapResultQueue = mapResultQueue; } - @Override public void run() { + @Override + public MapResult call() { if ( TIME_CALLS ) mapTimer.restart(); if ( debug ) debugPrint("\t\tmap " + input); final MapType result = map.apply(input); if ( TIME_CALLS ) mapTimer.stop(); - mapResultQueue.add(new MapResult(result, id)); + return new MapResult(result, id); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java new file mode 100644 index 000000000..bd29799b6 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java @@ -0,0 +1,64 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; + +/** + * Thread that runs the reduce of the map/reduce. + * + * This thread reads from mapResultsQueue until the poison EOF object arrives. At each + * stage is calls reduce(value, sum). The blocking mapResultQueue ensures that the + * queue waits until the mapResultQueue has a value to take. Then, it gets and waits + * until the map result Future has a value. + */ +class ReducerThread implements Callable { + final NSReduceFunction reduce; + final SimpleTimer reduceTimer; + final BlockingQueue>> mapResultQueue; + + ReduceType sum; + int lastJobID = -1; + + public ReducerThread(final NSReduceFunction reduce, + final SimpleTimer reduceTimer, + final ReduceType sum, + final BlockingQueue>> mapResultQueue) { + if ( reduce == null ) throw new IllegalArgumentException("Reduce function cannot be null"); + if ( mapResultQueue == null ) throw new IllegalArgumentException("mapResultQueue cannot be null"); + + this.reduce = reduce; + this.reduceTimer = reduceTimer; + this.sum = sum; + this.mapResultQueue = mapResultQueue; + } + + public ReduceType call() { + try { + while ( true ) { + final MapResult result = mapResultQueue.take().get(); + if ( result.isLast() ) { + // we are done, just return sum + return sum; + } + else if ( result.getJobID() < lastJobID ) { + // make sure the map results are coming in order + throw new IllegalStateException("BUG: last jobID " + lastJobID + " > current jobID " + result.getJobID()); + } else { + // apply reduce, keeping track of sum + if ( reduceTimer != null ) reduceTimer.restart(); + sum = reduce.apply(result.getValue(), sum); + if ( reduceTimer != null ) reduceTimer.stop(); + } + } + } catch (ExecutionException ex) { + throw new ReviewedStingException("got execution exception", ex); + } catch (InterruptedException ex) { + throw new ReviewedStingException("got execution exception", ex); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 21ac6dcec..47dcc1d5e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -22,11 +22,11 @@ import java.util.List; public class NanoSchedulerUnitTest extends BaseTest { public static final int NANO_SCHEDULE_MAX_RUNTIME = 60000; - private static class Map2x implements NanoSchedulerMapFunction { + private static class Map2x implements NSMapFunction { @Override public Integer apply(Integer input) { return input * 2; } } - private static class ReduceSum implements NanoSchedulerReduceFunction { + private static class ReduceSum implements NSReduceFunction { int prevOne = Integer.MIN_VALUE; @Override public Integer apply(Integer one, Integer sum) { @@ -35,7 +35,7 @@ public class NanoSchedulerUnitTest extends BaseTest { } } - private static class ProgressCallback implements NanoSchedulerProgressFunction { + private static class ProgressCallback implements NSProgressFunction { int callBacks = 0; @Override @@ -120,7 +120,7 @@ public class NanoSchedulerUnitTest extends BaseTest { final ProgressCallback callback = new ProgressCallback(); nanoScheduler.setProgressFunction(callback); - Assert.assertEquals(nanoScheduler.getBufferSize(), test.bufferSize, "bufferSize argument"); + Assert.assertEquals(nanoScheduler.getInputBufferSize(), test.bufferSize, "inputBufferSize argument"); Assert.assertEquals(nanoScheduler.getnThreads(), test.nThreads, "nThreads argument"); final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); From 8c0e3b1e0cd8c21e473543337c1c8b91fff44f2f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 7 Sep 2012 09:12:47 -0400 Subject: [PATCH 41/45] UnitTests for InputProducer --- .../nanoScheduler/InputProducerUnitTest.java | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java new file mode 100644 index 000000000..0973db8a3 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java @@ -0,0 +1,71 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingDeque; + +/** + * UnitTests for the NanoScheduler + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. + */ +public class InputProducerUnitTest extends BaseTest { + @DataProvider(name = "InputProducerTest") + public Object[][] createInputProducerTest() { + List tests = new ArrayList(); + + for ( final int nElements : Arrays.asList(0, 1, 10, 100, 1000, 10000, 100000) ) { + for ( final int queueSize : Arrays.asList(1, 10, 100) ) { + tests.add(new Object[]{ nElements, queueSize }); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "InputProducerTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) + public void testSingleThreadedNanoScheduler(final int nElements, final int queueSize) throws InterruptedException { + final List elements = new ArrayList(nElements); + for ( int i = 0; i < nElements; i++ ) elements.add(i); + + final LinkedBlockingDeque.InputValue> readQueue = + new LinkedBlockingDeque.InputValue>(queueSize); + + final InputProducer ip = new InputProducer(elements.iterator(), null, readQueue); + + final ExecutorService es = Executors.newSingleThreadExecutor(); + es.submit(ip); + + int lastValue = -1; + int nRead = 0; + while ( true ) { + final int observedQueueSize = readQueue.size(); + Assert.assertTrue(observedQueueSize <= queueSize, + "Reader is enqueuing more elements " + queueSize + " than allowed " + queueSize); + + final InputProducer.InputValue value = readQueue.take(); + if ( value.isLast() ) { + Assert.assertEquals(nRead, nElements, "Number of input values " + nRead + " not all that are expected " + nElements); + Assert.assertEquals(readQueue.size(), 0, "Last queue element found but queue contains more values!"); + break; + } else { + Assert.assertTrue(lastValue < value.getValue(), "Read values coming out of order!"); + final int expected = lastValue + 1; + Assert.assertEquals((int)value.getValue(), expected, "Value observed " + value.getValue() + " not equal to the expected value " + expected); + nRead++; + lastValue = value.getValue(); + } + } + } +} From bf87de8a252bc566d820cf85cfe7dcc745d8e679 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 7 Sep 2012 09:51:32 -0400 Subject: [PATCH 42/45] UnitTests for ReducerThread and InputProducer -- Uncovered bug in ReducerThread in detecting abnormal case where jobs are coming in out of order --- .../utils/nanoScheduler/ReducerThread.java | 1 + .../nanoScheduler/InputProducerUnitTest.java | 6 +- .../nanoScheduler/ReducerThreadUnitTest.java | 94 +++++++++++++++++++ 3 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java index bd29799b6..506e45453 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/ReducerThread.java @@ -49,6 +49,7 @@ class ReducerThread implements Callable { // make sure the map results are coming in order throw new IllegalStateException("BUG: last jobID " + lastJobID + " > current jobID " + result.getJobID()); } else { + lastJobID = result.getJobID(); // apply reduce, keeping track of sum if ( reduceTimer != null ) reduceTimer.restart(); sum = reduce.apply(result.getValue(), sum); diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java index 0973db8a3..b3365c13c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java @@ -13,7 +13,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.LinkedBlockingDeque; /** - * UnitTests for the NanoScheduler + * UnitTests for the InputProducer * * User: depristo * Date: 8/24/12 @@ -35,7 +35,7 @@ public class InputProducerUnitTest extends BaseTest { } @Test(enabled = true, dataProvider = "InputProducerTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) - public void testSingleThreadedNanoScheduler(final int nElements, final int queueSize) throws InterruptedException { + public void testInputProducer(final int nElements, final int queueSize) throws InterruptedException { final List elements = new ArrayList(nElements); for ( int i = 0; i < nElements; i++ ) elements.add(i); @@ -52,7 +52,7 @@ public class InputProducerUnitTest extends BaseTest { while ( true ) { final int observedQueueSize = readQueue.size(); Assert.assertTrue(observedQueueSize <= queueSize, - "Reader is enqueuing more elements " + queueSize + " than allowed " + queueSize); + "Reader is enqueuing more elements " + observedQueueSize + " than allowed " + queueSize); final InputProducer.InputValue value = readQueue.take(); if ( value.isLast() ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java new file mode 100644 index 000000000..61d1330bc --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerThreadUnitTest.java @@ -0,0 +1,94 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.*; + +/** + * UnitTests for the InputProducer + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. + */ +public class ReducerThreadUnitTest extends BaseTest { + @DataProvider(name = "ReducerThreadTest") + public Object[][] createReducerThreadTest() { + List tests = new ArrayList(); + + for ( final int nElements : Arrays.asList(0, 1, 10, 100, 1000, 10000, 100000) ) { + tests.add(new Object[]{ nElements }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "ReducerThreadTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) + public void testReducerThreadTest(final int nElements) throws Exception { + List values = new ArrayList(nElements); + List jobIDs = new ArrayList(nElements); + for ( int i = 0; i < nElements; i++ ) { + values.add(i); + jobIDs.add(i); + } + + runTests(values, jobIDs); + } + + @Test(enabled = true, timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME, expectedExceptions = ExecutionException.class) + public void testReducerThreadTestByJobOrder() throws Exception { + runTests(Arrays.asList(0, 1, 2), Arrays.asList(1, 3, 2)); + } + + private void runTests( final List mapValues, final List jobIDs) throws Exception { + final LinkedBlockingDeque>> mapResultsQueue = + new LinkedBlockingDeque>>(mapValues.size()+1); + + for ( int i = 0; i < mapValues.size(); i++ ) { + final int value = mapValues.get(i); + final int jobID = jobIDs.get(i); + final MapResult mapResult = new MapResult(value, jobID); + mapResultsQueue.add(new FutureValue>(mapResult)); + } + mapResultsQueue.add(new FutureValue>(new MapResult())); + + final ReduceSumTest reduce = new ReduceSumTest(mapResultsQueue); + final ReducerThread thread + = new ReducerThread(reduce, null, 0, mapResultsQueue); + + final ExecutorService es = Executors.newSingleThreadExecutor(); + final Future value = es.submit(thread); + value.get(); + + Assert.assertEquals(reduce.nRead, mapValues.size()); + } + + public class ReduceSumTest implements NSReduceFunction { + final LinkedBlockingDeque>> mapResultsQueue; + int nRead = 0; + int lastValue = -1; + + public ReduceSumTest(LinkedBlockingDeque>> mapResultsQueue) { + this.mapResultsQueue = mapResultsQueue; + } + + @Override public Integer apply(Integer one, Integer sum) { + Assert.assertTrue(lastValue < one, "Reduce came in out of order. Prev " + lastValue + " cur " + one); + + Assert.assertTrue(lastValue < one, "Read values coming out of order!"); + final int expected = lastValue + 1; + Assert.assertEquals((int)one, expected, "Value observed " + one + " not equal to the expected value " + expected); + nRead++; + lastValue = expected; + + return one + sum; + } + } +} From d62eca5d92bc0761b7824eedc74186cd12e25744 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 7 Sep 2012 10:47:29 -0400 Subject: [PATCH 44/45] Update GATKPerformanceOverTime to measure -nt and -nct --- .../sting/gatk/executive/MicroScheduler.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 46d6b5882..c6ef9acf1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -102,9 +102,15 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { * @return The best-fit microscheduler. */ public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) { - if ( threadAllocation.isRunningInParallelMode() ) + if ( threadAllocation.isRunningInParallelMode() ) { + // TODO -- remove me when we fix running NCT within HMS + if ( threadAllocation.getNumDataThreads() > 1 && threadAllocation.getNumCPUThreadsPerDataThread() > 1) + throw new UserException("Currently the GATK does not support running CPU threads within data threads, " + + "please specify only one of NT and NCT"); + logger.info(String.format("Running the GATK in parallel mode with %d CPU thread(s) for each of %d data thread(s)", threadAllocation.getNumCPUThreadsPerDataThread(), threadAllocation.getNumDataThreads())); + } if ( threadAllocation.getNumDataThreads() > 1 ) { if (walker.isReduceByInterval()) From f25bf0f927ea7f36662bfbe0756c6f1c6204581a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 7 Sep 2012 11:03:00 -0400 Subject: [PATCH 45/45] EfficiencyMonitoringThreadFactoryUnitTests thing keeps timing out unnecessary --- .../threading/EfficiencyMonitoringThreadFactoryUnitTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index d8da274ce..7381bebc4 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -44,7 +44,7 @@ import java.util.concurrent.TimeUnit; */ public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { // the duration of the tests -- 100 ms is tolerable given the number of tests we are doing - private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 10000; + private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 100000; private final static int MAX_THREADS = 4; final static Object GLOBAL_LOCK = new Object();